2015-06-01 14:13:45

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list

Generalize the post_schedule() stuff into a balance callback list.
This allows us to more easily use it outside of schedule() and cross
sched_class.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 36 ++++++++++++++++++++++++------------
kernel/sched/deadline.c | 19 ++++++++++---------
kernel/sched/rt.c | 25 +++++++++++--------------
kernel/sched/sched.h | 19 +++++++++++++++++--
4 files changed, 62 insertions(+), 37 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,23 +2277,35 @@ static struct rq *finish_task_switch(str
#ifdef CONFIG_SMP

/* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
{
- if (rq->post_schedule) {
- unsigned long flags;
+ struct callback_head *head, *next;
+ void (*func)(struct rq *rq);
+ unsigned long flags;

- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->curr->sched_class->post_schedule)
- rq->curr->sched_class->post_schedule(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ head = rq->balance_callback;
+ rq->balance_callback = NULL;
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;

- rq->post_schedule = 0;
+ func(rq);
}
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+ if (unlikely(rq->balance_callback))
+ __balance_callback(rq);
}

#else

-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
{
}

@@ -2311,7 +2323,7 @@ asmlinkage __visible void schedule_tail(
/* finish_task_switch() drops rq->lock and enables preemtion */
preempt_disable();
rq = finish_task_switch(prev);
- post_schedule(rq);
+ balance_callback(rq);
preempt_enable();

if (current->set_child_tid)
@@ -2822,7 +2834,7 @@ static void __sched __schedule(void)
} else
raw_spin_unlock_irq(&rq->lock);

- post_schedule(rq);
+ balance_callback(rq);
}

static inline void sched_submit_work(struct task_struct *tsk)
@@ -7216,7 +7228,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->post_schedule = 0;
+ rq->balance_callback = NULL;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(str
return dl_task(prev);
}

-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+
+static void push_dl_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
{
- rq->post_schedule = has_pushable_dl_tasks(rq);
+ if (!has_pushable_dl_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
}

static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -1126,7 +1133,7 @@ struct task_struct *pick_next_task_dl(st
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);

- set_post_schedule(rq);
+ queue_push_tasks(rq);

return p;
}
@@ -1544,11 +1551,6 @@ static int pull_dl_task(struct rq *this_
return ret;
}

-static void post_schedule_dl(struct rq *rq)
-{
- push_dl_tasks(rq);
-}
-
/*
* Since the task is not running and a reschedule is not going to happen
* anytime soon on its runqueue, we try pushing it away now.
@@ -1784,7 +1786,6 @@ const struct sched_class dl_sched_class
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
- .post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,13 +354,16 @@ static inline int has_pushable_tasks(str
return !plist_head_empty(&rq->rt.pushable_tasks);
}

-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+
+static void push_rt_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
{
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
+ if (!has_pushable_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -417,7 +420,7 @@ static inline int pull_rt_task(struct rq
return 0;
}

-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
@@ -1497,7 +1500,7 @@ pick_next_task_rt(struct rq *rq, struct
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);

- set_post_schedule(rq);
+ queue_push_tasks(rq);

return p;
}
@@ -2042,11 +2045,6 @@ static int pull_rt_task(struct rq *this_
return ret;
}

-static void post_schedule_rt(struct rq *rq)
-{
- push_rt_tasks(rq);
-}
-
/*
* If we are not running and we are not going to reschedule soon, we should
* try to push tasks away now
@@ -2318,7 +2316,6 @@ const struct sched_class rt_sched_class
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
#endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -624,9 +624,10 @@ struct rq {
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;

+ struct callback_head *balance_callback;
+
unsigned char idle_balance;
/* For active balancing */
- int post_schedule;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
@@ -695,6 +696,21 @@ struct rq {
#endif
};

+static inline void
+queue_balance_callback(struct rq *rq,
+ struct callback_head *head,
+ void (*func)(struct rq *rq))
+{
+ lockdep_assert_held(&rq->lock);
+
+ if (unlikely(head->next))
+ return;
+
+ head->func = (void (*)(struct callback_head *))func;
+ head->next = rq->balance_callback;
+ rq->balance_callback = head;
+}
+
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -1192,7 +1208,6 @@ struct sched_class {
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);

- void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);



2015-06-03 04:42:18

by Kamalesh Babulal

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list

* Peter Zijlstra <[email protected]> [2015-06-01 15:58:19]:

[...]

> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(str
> return dl_task(prev);
> }
>
> -static inline void set_post_schedule(struct rq *rq)
> +static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
> +
> +static void push_dl_tasks(struct rq *);
> +
> +static inline void queue_push_tasks(struct rq *rq)
> {
> - rq->post_schedule = has_pushable_dl_tasks(rq);
> + if (!has_pushable_dl_tasks(rq))
> + return;
> +
> + queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
> }

When compiling with CONFIG_SMP=n, following build warning is triggered:
CC kernel/sched/deadline.o
kernel/sched/deadline.c: In function ‘pick_next_task_dl’:
kernel/sched/deadline.c:1136:2: error: implicit declaration of function ‘queue_push_tasks’ [-Werror=implicit-function-declaration]
queue_push_tasks(rq);
^
cc1: some warnings being treated as errors

set_post_schedule() exist for CONFIG_SMP=n case, which was not modified to
queue_push_tasks().

[...]
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -624,9 +624,10 @@ struct rq {
> unsigned long cpu_capacity;
> unsigned long cpu_capacity_orig;
>
> + struct callback_head *balance_callback;
> +
> unsigned char idle_balance;
> /* For active balancing */
> - int post_schedule;
> int active_balance;
> int push_cpu;
> struct cpu_stop_work active_balance_work;
> @@ -695,6 +696,21 @@ struct rq {
> #endif
> };
>
> +static inline void
> +queue_balance_callback(struct rq *rq,
> + struct callback_head *head,
> + void (*func)(struct rq *rq))
> +{
> + lockdep_assert_held(&rq->lock);
> +
> + if (unlikely(head->next))
> + return;
> +
> + head->func = (void (*)(struct callback_head *))func;
> + head->next = rq->balance_callback;
> + rq->balance_callback = head;
> +}
> +

While compiling with CONFIG_SMP=n, another build error is seen:

In file included from kernel/sched/core.c:86:0:
kernel/sched/sched.h: In function ‘queue_balance_callback’:
kernel/sched/sched.h:710:17: error: ‘struct rq’ has no member named ‘balance_callback’
head->next = rq->balance_callback;
^
kernel/sched/sched.h:711:4: error: ‘struct rq’ has no member named ‘balance_callback’
rq->balance_callback = head;
^

Guarding queue_balance_callback() with #ifdef CONFIG_SMP fixes
the issue, as all of the call sites are also with #ifdef CONFIG_SMP

Thanks,
Kamalesh

2015-06-03 07:21:42

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list

On Wed, Jun 03, 2015 at 10:11:59AM +0530, Kamalesh Babulal wrote:
> set_post_schedule() exist for CONFIG_SMP=n case, which was not modified to
> queue_push_tasks().

Yeah, already noticed the SMP=n borkage, fixed it after posting the rfc.

2015-06-03 08:34:12

by pang.xunlei

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list

Hi Peter,

This may increase the overhead of schedule() a bit, as it will have
more work to do.

check_class_changed():
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
/* Possble rq->lock 'hole'. */
p->sched_class->switched_to(rq, p);

For above cases, why can't we just add a judgement in switched_to_fair()
as follows:
if (rq != task_rq(p))
return;

switched_to_rt() and switched_to_dl() already did the similar judgement.

Then the following operation is usually task_rq_unlock() or the like,
seems no other harm with the old rq of p.

-Xunlei

Peter Zijlstra <[email protected]> wrote 2015-06-01 PM 09:58:19:
> [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback
list
>
> Generalize the post_schedule() stuff into a balance callback list.
> This allows us to more easily use it outside of schedule() and cross
> sched_class.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

--------------------------------------------------------
ZTE Information Security Notice: The information contained in this mail (and any attachment transmitted herewith) is privileged and confidential and is intended for the exclusive use of the addressee(s). If you are not an intended recipient, any disclosure, reproduction, distribution or other dissemination or use of the information contained is strictly prohibited. If you have received this mail in error, please delete it and notify us immediately.

2015-06-03 08:55:42

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list

A: Because it messes up the order in which people normally read text.
Q: Why is top-posting such a bad thing?
A: Top-posting.
Q: What is the most annoying thing in e-mail?

On Wed, Jun 03, 2015 at 04:24:05PM +0800, [email protected] wrote:
> Hi Peter,
>
> This may increase the overhead of schedule() a bit, as it will have
> more work to do.

How so? It replaces the post_schedule() muck and should not be more
expensive than that.

It will make sched_setscheduler() etc.. a little more expensive, but
that doesn't matter, those are not critical things at all.

> check_class_changed():
> if (prev_class->switched_from)
> prev_class->switched_from(rq, p);
> /* Possble rq->lock 'hole'. */
> p->sched_class->switched_to(rq, p);
>
> For above cases, why can't we just add a judgement in switched_to_fair()
> as follows:
> if (rq != task_rq(p))
> return;

Because its too easy to get wrong. There have been many instances of
bugs caused by this dropping of rq->lock.

And sure you can patch it up, once you find it, but I would really
rather prevent these things if at all possible.

2015-06-05 07:14:09

by pang.xunlei

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list

Hi Peter,

Peter Zijlstra <[email protected]> wrote 2015-06-03 PM 04:55:27:
>
> Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance
callback list
>
> A: Because it messes up the order in which people normally read text.
> Q: Why is top-posting such a bad thing?
> A: Top-posting.
> Q: What is the most annoying thing in e-mail?

Sure, will follow. Thanks for the reminding.

>
> On Wed, Jun 03, 2015 at 04:24:05PM +0800, [email protected] wrote:
> > Hi Peter,
> >
> > This may increase the overhead of schedule() a bit, as it will have
> > more work to do.
>
> How so? It replaces the post_schedule() muck and should not be more
> expensive than that.
>
> It will make sched_setscheduler() etc.. a little more expensive, but
> that doesn't matter, those are not critical things at all.

Another side effect it may have is that it will introduce some latency,
because we have to wait for next schedule() point to do the balancing.
prio_changed_rt()->pull_rt_task() is not rare cases when using PI futex.

-Xunlei

>
> > check_class_changed():
> > if (prev_class->switched_from)
> > prev_class->switched_from(rq, p);
> > /* Possble rq->lock 'hole'. */
> > p->sched_class->switched_to(rq, p);
> >
> > For above cases, why can't we just add a judgement in
switched_to_fair()
> > as follows:
> > if (rq != task_rq(p))
> > return;
>
> Because its too easy to get wrong. There have been many instances of
> bugs caused by this dropping of rq->lock.
>
> And sure you can patch it up, once you find it, but I would really
> rather prevent these things if at all possible.

--------------------------------------------------------
ZTE Information Security Notice: The information contained in this mail (and any attachment transmitted herewith) is privileged and confidential and is intended for the exclusive use of the addressee(s). If you are not an intended recipient, any disclosure, reproduction, distribution or other dissemination or use of the information contained is strictly prohibited. If you have received this mail in error, please delete it and notify us immediately.

2015-06-05 07:19:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list

On Fri, Jun 05, 2015 at 03:13:38PM +0800, [email protected] wrote:
> > It will make sched_setscheduler() etc.. a little more expensive, but
> > that doesn't matter, those are not critical things at all.
>
> Another side effect it may have is that it will introduce some latency,
> because we have to wait for next schedule() point to do the balancing.
> prio_changed_rt()->pull_rt_task() is not rare cases when using PI futex.

See patch 3, that adds the balance_callback() to sched_setscheduler()
and rt_mutex_setprio() to avoid just that.