This series removes several places where the locks of two RQs are held
at the same time.
Patch [1/5] is a preparation/cleanup. It replaces the old (.on_rq == 1)
checks with the new (.on_rq == ONRQ_QUEUED) everywhere. No functional changes.
Patch [2/5] is the main one in the series. It introduces a new state, ONRQ_MIGRATING,
and teaches the scheduler to understand it (this needs only small changes, mostly
in try_to_wake_up()). It will be used in the following way:
(we are changing task's rq)
raw_spin_lock(&src_rq->lock);
dequeue_task(src_rq, p, 0);
p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, dst_cpu);
raw_spin_unlock(&src_rq->lock);
raw_spin_lock(&dst_rq->lock);
p->on_rq = ONRQ_QUEUED;
enqueue_task(dst_rq, p, 0);
raw_spin_unlock(&dst_rq->lock);
Patches [3-5/5] remove the double locks and use the new ONRQ_MIGRATING state.
They allow 3-4 functions to be called without both rq locks held, which
looks safe to me.
The series doesn't add any overhead, and it shouldn't hurt performance.
It improves locking granularity, and it's easy to imagine situations that
will benefit from not taking the double rq lock.
v2: Changes in [2/5] and [5/5].
---
Kirill Tkhai (5):
sched: Wrapper for checking task_struct's .on_rq
sched: Teach scheduler to understand ONRQ_MIGRATING state
sched: Remove double_rq_lock() from __migrate_task()
sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
sched/fair: Remove double_lock_balance() from load_balance()
kernel/sched/core.c | 123 +++++++++++++++++++++++--------------
kernel/sched/deadline.c | 14 ++--
kernel/sched/fair.c | 155 ++++++++++++++++++++++++++++++----------------
kernel/sched/rt.c | 16 ++---
kernel/sched/sched.h | 13 ++++
kernel/sched/stop_task.c | 2 -
6 files changed, 206 insertions(+), 117 deletions(-)
--
Signed-off-by: Kirill Tkhai <[email protected]>
Use task_queued() everywhere instead of checking .on_rq directly.
No functional changes.
The only exception is that we do not use the wrapper in check_for_tasks()
in kernel/cpu.c, because that would require exporting task_queued()
in global header files. The next patch in the series would bring it back
anyway, so it doesn't matter.
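For reference, the wrapper this patch adds to kernel/sched/sched.h boils down to:

/* .on_rq states of struct task_struct: */
#define ONRQ_QUEUED	1

static inline int task_queued(struct task_struct *p)
{
	return p->on_rq == ONRQ_QUEUED;
}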
Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/core.c | 82 +++++++++++++++++++++++-----------------------
kernel/sched/deadline.c | 14 ++++----
kernel/sched/fair.c | 22 ++++++------
kernel/sched/rt.c | 16 ++++-----
kernel/sched/sched.h | 7 ++++
kernel/sched/stop_task.c | 2 +
6 files changed, 75 insertions(+), 68 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7bc599d..205f99a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1043,7 +1043,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+ if (task_queued(rq->curr) && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -1088,7 +1088,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- if (p->on_rq) {
+ if (task_queued(p)) {
struct rq *src_rq, *dst_rq;
src_rq = task_rq(p);
@@ -1214,7 +1214,7 @@ static int migration_cpu_stop(void *data);
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
unsigned long flags;
- int running, on_rq;
+ int running, queued;
unsigned long ncsw;
struct rq *rq;
@@ -1252,7 +1252,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->on_rq;
+ queued = task_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1284,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(on_rq)) {
+ if (unlikely(queued)) {
ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1478,7 +1478,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
- p->on_rq = 1;
+ p->on_rq = ONRQ_QUEUED;
/* if a worker is waking up, notify workqueue */
if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1537,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
int ret = 0;
rq = __task_rq_lock(p);
- if (p->on_rq) {
+ if (task_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
@@ -1678,7 +1678,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
- if (p->on_rq && ttwu_remote(p, wake_flags))
+ if (task_queued(p) && ttwu_remote(p, wake_flags))
goto stat;
#ifdef CONFIG_SMP
@@ -1742,7 +1742,7 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;
- if (!p->on_rq)
+ if (!task_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
@@ -2095,7 +2095,7 @@ void wake_up_new_task(struct task_struct *p)
init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- p->on_rq = 1;
+ p->on_rq = ONRQ_QUEUED;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -2444,7 +2444,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
- if (task_current(rq, p) && p->on_rq) {
+ if (task_current(rq, p) && task_queued(p)) {
update_rq_clock(rq);
ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
@@ -2490,7 +2490,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
*/
- if (!p->on_cpu || !p->on_rq)
+ if (!p->on_cpu || !task_queued(p))
return p->se.sum_exec_runtime;
#endif
@@ -2794,7 +2794,7 @@ static void __sched __schedule(void)
switch_count = &prev->nvcsw;
}
- if (prev->on_rq || rq->skip_clock_update < 0)
+ if (task_queued(prev) || rq->skip_clock_update < 0)
update_rq_clock(rq);
next = pick_next_task(rq, prev);
@@ -2959,7 +2959,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, on_rq, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = 0;
struct rq *rq;
const struct sched_class *prev_class;
@@ -2988,9 +2988,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->on_rq;
+ queued = task_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -3030,7 +3030,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, enqueue_flag);
check_class_changed(rq, p, prev_class, oldprio);
@@ -3041,7 +3041,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
- int old_prio, delta, on_rq;
+ int old_prio, delta, queued;
unsigned long flags;
struct rq *rq;
@@ -3062,8 +3062,8 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
p->static_prio = NICE_TO_PRIO(nice);
@@ -3072,7 +3072,7 @@ void set_user_nice(struct task_struct *p, long nice)
p->prio = effective_prio(p);
delta = p->prio - old_prio;
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
/*
* If the task increased its priority or is running and
@@ -3338,7 +3338,7 @@ static int __sched_setscheduler(struct task_struct *p,
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, on_rq, running;
+ int retval, oldprio, oldpolicy = -1, queued, running;
int policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
@@ -3535,9 +3535,9 @@ static int __sched_setscheduler(struct task_struct *p,
return 0;
}
- on_rq = p->on_rq;
+ queued = task_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -3547,7 +3547,7 @@ static int __sched_setscheduler(struct task_struct *p,
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (queued) {
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
@@ -4564,7 +4564,7 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
- idle->on_rq = 1;
+ idle->on_rq = ONRQ_QUEUED;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4641,7 +4641,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (p->on_rq) {
+ if (task_queued(p)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &flags);
@@ -4691,7 +4691,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->on_rq) {
+ if (task_queued(p)) {
dequeue_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
enqueue_task(rq_dest, p, 0);
@@ -4732,13 +4732,13 @@ void sched_setnuma(struct task_struct *p, int nid)
{
struct rq *rq;
unsigned long flags;
- bool on_rq, running;
+ bool queued, running;
rq = task_rq_lock(p, &flags);
- on_rq = p->on_rq;
+ queued = task_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -4747,7 +4747,7 @@ void sched_setnuma(struct task_struct *p, int nid)
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
@@ -7099,13 +7099,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
.sched_policy = SCHED_NORMAL,
};
int old_prio = p->prio;
- int on_rq;
+ int queued;
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
__setscheduler(rq, p, &attr);
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
resched_curr(rq);
}
@@ -7293,16 +7293,16 @@ void sched_offline_group(struct task_group *tg)
void sched_move_task(struct task_struct *tsk)
{
struct task_group *tg;
- int on_rq, running;
+ int queued, running;
unsigned long flags;
struct rq *rq;
rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk);
- on_rq = tsk->on_rq;
+ queued = task_queued(tsk);
- if (on_rq)
+ if (queued)
dequeue_task(rq, tsk, 0);
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
@@ -7315,14 +7315,14 @@ void sched_move_task(struct task_struct *tsk)
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, on_rq);
+ tsk->sched_class->task_move_group(tsk, queued);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, tsk, 0);
task_rq_unlock(rq, tsk, &flags);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce13..4cc3b14 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
update_rq_clock(rq);
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
- if (p->on_rq) {
+ if (task_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
check_preempt_curr_dl(rq, p, 0);
@@ -1030,7 +1030,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
* means a stop task can slip in, in which case we need to
* re-start task selection.
*/
- if (rq->stop && rq->stop->on_rq)
+ if (rq->stop && task_queued(rq->stop))
return RETRY_TASK;
}
@@ -1257,7 +1257,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu,
&task->cpus_allowed) ||
- task_running(rq, task) || !task->on_rq)) {
+ task_running(rq, task) || !task_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -1296,7 +1296,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
- BUG_ON(!p->on_rq);
+ BUG_ON(!task_queued(p));
BUG_ON(!dl_task(p));
return p;
@@ -1443,7 +1443,7 @@ static int pull_dl_task(struct rq *this_rq)
dl_time_before(p->dl.deadline,
this_rq->dl.earliest_dl.curr))) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_queued(p));
/*
* Then we pull iff p has actually an earlier
@@ -1596,7 +1596,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (unlikely(p->dl.dl_throttled))
return;
- if (p->on_rq && rq->curr != p) {
+ if (task_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
@@ -1614,7 +1614,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (p->on_rq || rq->curr == p) {
+ if (task_queued(p) || rq->curr == p) {
#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 45943b2..dd90fff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7464,7 +7464,7 @@ static void task_fork_fair(struct task_struct *p)
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->se.on_rq)
+ if (!task_queued(p))
return;
/*
@@ -7489,11 +7489,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it's on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !on_rq, then only when
+ * If it's queued, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !queued, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!p->on_rq && p->state != TASK_RUNNING) {
+ if (!task_queued(p) && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -7528,7 +7528,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!se->on_rq)
+ if (!task_queued(p))
return;
/*
@@ -7574,7 +7574,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
@@ -7593,7 +7593,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* fair sleeper stuff for the first placement, but who cares.
*/
/*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * When !queued, vruntime of the task has usually NOT been normalized.
* But there are some cases where it has already been normalized:
*
* - Moving a forked child which is waiting for being woken up by
@@ -7604,14 +7604,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- on_rq = 1;
+ if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+ queued = 1;
- if (!on_rq)
+ if (!queued)
se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!on_rq) {
+ if (!queued) {
cfs_rq = cfs_rq_of(se);
se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca..9395320 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
* means a dl or stop task can slip in, in which case we need
* to re-start task selection.
*/
- if (unlikely((rq->stop && rq->stop->on_rq) ||
+ if (unlikely((rq->stop && task_queued(rq->stop)) ||
rq->dl.dl_nr_running))
return RETRY_TASK;
}
@@ -1624,7 +1624,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
- !task->on_rq)) {
+ !task_queued(task))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1658,7 +1658,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
- BUG_ON(!p->on_rq);
+ BUG_ON(!task_queued(p));
BUG_ON(!rt_task(p));
return p;
@@ -1809,7 +1809,7 @@ static int pull_rt_task(struct rq *this_rq)
*/
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_queued(p));
/*
* There's a chance that p is higher in priority
@@ -1870,7 +1870,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
BUG_ON(!rt_task(p));
- if (!p->on_rq)
+ if (!task_queued(p))
return;
weight = cpumask_weight(new_mask);
@@ -1936,7 +1936,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!p->on_rq || rq->rt.rt_nr_running)
+ if (!task_queued(p) || rq->rt.rt_nr_running)
return;
if (pull_rt_task(rq))
@@ -1970,7 +1970,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (p->on_rq && rq->curr != p) {
+ if (task_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
@@ -1989,7 +1989,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->on_rq)
+ if (!task_queued(p))
return;
if (rq->curr == p) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f..e5a9b6d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -15,6 +15,9 @@
struct rq;
+/* .on_rq states of struct task_struct: */
+#define ONRQ_QUEUED 1
+
extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
@@ -942,6 +945,10 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
#endif
}
+static inline int task_queued(struct task_struct *p)
+{
+ return p->on_rq == ONRQ_QUEUED;
+}
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0eda..1a4bb0f 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;
- if (!stop || !stop->on_rq)
+ if (!stop || !task_queued(stop))
return NULL;
put_prev_task(rq, prev);
This is a new on_rq state for the case when a task is migrating
from one rq (src_rq) to another (dst_rq), and there is no need
to have both RQs locked at the same time.
We will use the state this way:
raw_spin_lock(&src_rq->lock);
dequeue_task(src_rq, p, 0);
p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, dst_cpu);
raw_spin_unlock(&src_rq->lock);
raw_spin_lock(&dst_rq->lock);
p->on_rq = ONRQ_QUEUED;
enqueue_task(dst_rq, p, 0);
raw_spin_unlock(&dst_rq->lock);
The profit is that double_rq_lock() is not needed now,
and this may reduce the latencies in some situations.
The logic of try_to_wake_up() remains the same as it was.
Its behaviour changes only in a small subset of cases (when
a preempted task in a ~TASK_RUNNING state is queued on an rq
and we are migrating it to another one).
We add a loop at the beginning of set_cpus_allowed_ptr().
It's like a handmade spinlock, similar to the situation we
had before: we used to spin on rq->lock, now we spin on the
"again:" label. Of course, it's worse than an arch-dependent
spinlock, but we have to have it here. The function is
synchronous, and its users rely on the task running with the
new allowed cpu mask after the call returns. It seems there
won't be any significant performance losses.
I tried to find performance-critical places where we use
set_cpus_allowed_ptr(). Maybe update_migrate_disable()
in the RT patch, but it's called in a way where the
ONRQ_MIGRATING state is not possible. So all of this
looks fine to me.
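In other words, code looking at ->on_rq now has to distinguish three cases;
a minimal sketch (the helpers are the ones this patch and [1/5] add to
kernel/sched/sched.h):

if (task_queued(p)) {
	/* p->on_rq == ONRQ_QUEUED: p is enqueued on task_rq(p) */
} else if (task_migrating(p)) {
	/* p->on_rq == ONRQ_MIGRATING: p is dequeued from src_rq and
	 * will be enqueued on dst_rq by the migration code */
} else {
	/* p->on_rq == 0: p is not on any runqueue */
}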
v2: cpu_relax() in set_cpus_allowed_ptr(),
on_rq check in switched_from_fair(),
fixed wrong "unlikely" braces,
task_migrating() primitive,
commentaries.
Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/core.c | 38 +++++++++++++++++++++++++++++++-------
kernel/sched/fair.c | 2 +-
kernel/sched/sched.h | 6 ++++++
3 files changed, 38 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 205f99a..772f791 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1214,7 +1214,7 @@ static int migration_cpu_stop(void *data);
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
unsigned long flags;
- int running, queued;
+ int running, on_rq;
unsigned long ncsw;
struct rq *rq;
@@ -1252,7 +1252,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- queued = task_queued(p);
+ on_rq = p->on_rq;
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1284,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(queued)) {
+ if (unlikely(on_rq)) {
ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1491,10 +1491,18 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- check_preempt_curr(rq, p, wake_flags);
trace_sched_wakeup(p, true);
p->state = TASK_RUNNING;
+
+ /*
+ * We've changed the state, other actions will be done
+ * in the place, where the migration has started.
+ */
+ if (task_migrating(p))
+ return;
+
+ check_preempt_curr(rq, p, wake_flags);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
@@ -1537,9 +1545,15 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
int ret = 0;
rq = __task_rq_lock(p);
- if (task_queued(p)) {
+ /*
+ * Task is queued or it is migrating. In the second case
+ * it will be queued by migration code with TASK_RUNNING
+ * state, which we set in ttwu_do_wakeup().
+ */
+ if (p->on_rq) {
/* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
+ if (task_queued(p))
+ update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
ret = 1;
}
@@ -1678,7 +1692,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
- if (task_queued(p) && ttwu_remote(p, wake_flags))
+ if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
#ifdef CONFIG_SMP
@@ -1693,6 +1707,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
+ BUG_ON(p->on_rq);
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -4623,8 +4639,16 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
struct rq *rq;
unsigned int dest_cpu;
int ret = 0;
+again:
+ while (unlikely(task_migrating(p)))
+ cpu_relax();
rq = task_rq_lock(p, &flags);
+ /* Check again with rq locked */
+ if (unlikely(task_migrating(p))) {
+ task_rq_unlock(rq, p, &flags);
+ goto again;
+ }
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd90fff..a8f8ca0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7493,7 +7493,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
* have normalized the vruntime, if it's !queued, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!task_queued(p) && p->state != TASK_RUNNING) {
+ if (!p->on_rq && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e5a9b6d..f6773d7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -17,6 +17,7 @@ struct rq;
/* .on_rq states of struct task_struct: */
#define ONRQ_QUEUED 1
+#define ONRQ_MIGRATING 2
extern __read_mostly int scheduler_running;
@@ -950,6 +951,11 @@ static inline int task_queued(struct task_struct *p)
return p->on_rq == ONRQ_QUEUED;
}
+static inline int task_migrating(struct task_struct *p)
+{
+ return p->on_rq == ONRQ_MIGRATING;
+}
+
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
Let's use the ONRQ_MIGRATING state in __migrate_task() instead of taking double_rq_lock().
Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/core.c | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 772f791..9bfc4a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4693,20 +4693,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
*/
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- struct rq *rq_dest, *rq_src;
+ struct rq *rq;
int ret = 0;
if (unlikely(!cpu_active(dest_cpu)))
return ret;
- rq_src = cpu_rq(src_cpu);
- rq_dest = cpu_rq(dest_cpu);
+ rq = cpu_rq(src_cpu);
raw_spin_lock(&p->pi_lock);
- double_rq_lock(rq_src, rq_dest);
+ raw_spin_lock(&rq->lock);
/* Already moved. */
if (task_cpu(p) != src_cpu)
goto done;
+
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
goto fail;
@@ -4716,15 +4716,22 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* placed properly.
*/
if (task_queued(p)) {
- dequeue_task(rq_src, p, 0);
+ dequeue_task(rq, p, 0);
+ p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, dest_cpu);
- enqueue_task(rq_dest, p, 0);
- check_preempt_curr(rq_dest, p, 0);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(dest_cpu);
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = ONRQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
}
done:
ret = 1;
fail:
- double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
return ret;
}
The bad situation:
double_lock_balance() drops the busiest_rq lock. The busiest_rq is the *busiest*
one, with a lot of tasks and context switches on it. We drop the lock
and then have to wait for it again.
Let's just detach the task and unlock busiest_rq once and for all!
Warning: this allows can_migrate_task(), throttled_lb_pair() and task_hot()
to be called without both locks held. I added comments about that.
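In short, the locking in active_load_balance_cpu_stop() becomes (a sketch
of what the diff below does):

raw_spin_lock_irq(&busiest_rq->lock);
p = detach_one_task(&env);	/* dequeue, mark ONRQ_MIGRATING, set_task_cpu() */
...
raw_spin_unlock(&busiest_rq->lock);

if (p) {
	raw_spin_lock(&target_rq->lock);
	p->on_rq = ONRQ_QUEUED;
	activate_task(target_rq, p, 0);
	check_preempt_curr(target_rq, p, 0);
	raw_spin_unlock(&target_rq->lock);
}
local_irq_enable();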
Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++----------------
1 file changed, 37 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a8f8ca0..a1b74f2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3297,6 +3297,8 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
* Ensure that neither of the group entities corresponding to src_cpu or
* dest_cpu are members of a throttled hierarchy when performing group
* load-balance operations.
+ *
+ * Note: RQs are not locked.
*/
static inline int throttled_lb_pair(struct task_group *tg,
int src_cpu, int dest_cpu)
@@ -5127,7 +5129,9 @@ static void move_task(struct task_struct *p, struct lb_env *env)
}
/*
- * Is this task likely cache-hot:
+ * Is this task likely cache-hot?
+ *
+ * Note: env->dst_rq is unlocked, but rcu_read_lock() is held.
*/
static int task_hot(struct task_struct *p, struct lb_env *env)
{
@@ -5247,6 +5251,8 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ *
+ * Note: env->dest_rq is not locked.
*/
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
@@ -5336,13 +5342,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
}
/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_one_task tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
+ * Returns a task if successful and NULL otherwise.
*
- * Called with both runqueues locked.
+ * Called with env->src_rq locked.
*/
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
@@ -5350,16 +5356,20 @@ static int move_one_task(struct lb_env *env)
if (!can_migrate_task(p, env))
continue;
- move_task(p, env);
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = ONRQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+
/*
- * Right now, this is only the second place move_task()
- * is called, so we can safely collect move_task()
- * stats here rather than inside move_task().
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (other is move_tasks)
+ * so we can safely collect stats here rather than
+ * inside move_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
- return 1;
+ return p;
}
- return 0;
+ return NULL;
}
static const unsigned int sched_nr_migrate_break = 32;
@@ -6913,6 +6923,7 @@ static int active_load_balance_cpu_stop(void *data)
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
+ struct task_struct *p = NULL;
raw_spin_lock_irq(&busiest_rq->lock);
@@ -6932,9 +6943,6 @@ static int active_load_balance_cpu_stop(void *data)
*/
BUG_ON(busiest_rq == target_rq);
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
-
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
@@ -6955,16 +6963,28 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
- if (move_one_task(&env))
+ p = detach_one_task(&env);
+ if (p)
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
}
rcu_read_unlock();
- double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock_irq(&busiest_rq->lock);
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (p) {
+ raw_spin_lock(&target_rq->lock);
+ BUG_ON(task_rq(p) != target_rq);
+ p->on_rq = ONRQ_QUEUED;
+ activate_task(target_rq, p, 0);
+ check_preempt_curr(target_rq, p, 0);
+ raw_spin_unlock(&target_rq->lock);
+ }
+
+ local_irq_enable();
+
return 0;
}
Instead of holding both rq locks in load_balance(), keep on_rq = ONRQ_MIGRATING
while a task is migrating between the runqueues.
v2: Added missing check_preempt_curr() in attach_tasks().
Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/fair.c | 85 +++++++++++++++++++++++++++++++++------------------
1 file changed, 55 insertions(+), 30 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a1b74f2..a47fb3f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4706,9 +4706,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/*
- * This is possible from callers such as move_task(), in which we
- * unconditionally check_prempt_curr() after an enqueue (which may have
- * lead to a throttle). This both saves work and prevents false
+ * This is possible from callers, in which we unconditionally
+ * check_prempt_curr() after an enqueue (which may have lead
+ * to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
@@ -5114,20 +5114,22 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ struct list_head tasks;
};
/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
+ * detach_task - detach a task from its runqueue for migration.
+ * The runqueue must be locked.
*/
-static void move_task(struct task_struct *p, struct lb_env *env)
+static void detach_task(struct task_struct *p, struct lb_env *env)
{
deactivate_task(env->src_rq, p, 0);
+ list_add(&p->se.group_node, &env->tasks);
+ p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, env->dst_cpu);
- activate_task(env->dst_rq, p, 0);
- check_preempt_curr(env->dst_rq, p, 0);
}
+
/*
* Is this task likely cache-hot?
*
@@ -5362,9 +5364,9 @@ static struct task_struct *detach_one_task(struct lb_env *env)
/*
* Right now, this is only the second place where
- * lb_gained[env->idle] is updated (other is move_tasks)
+ * lb_gained[env->idle] is updated (other is detach_tasks)
* so we can safely collect stats here rather than
- * inside move_tasks().
+ * inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
return p;
@@ -5375,18 +5377,18 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks tries to detach up to imbalance weighted load from busiest_rq,
+ * as part of a balancing operation within domain "sd".
+ * Returns number of detached tasks if successful and 0 otherwise.
*
- * Called with both runqueues locked.
+ * Called with env->src_rq locked.
*/
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
- int pulled = 0;
+ int detached = 0;
if (env->imbalance <= 0)
return 0;
@@ -5417,14 +5419,15 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
- move_task(p, env);
- pulled++;
+ detach_task(p, env);
+
+ detached++;
env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
+ * kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
@@ -5444,13 +5447,28 @@ static int move_tasks(struct lb_env *env)
}
/*
- * Right now, this is one of only two places move_task() is called,
- * so we can safely collect move_task() stats here rather than
- * inside move_task().
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], detached);
+
+ return detached;
+}
+
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
- return pulled;
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ BUG_ON(task_rq(p) != env->dst_rq);
+ list_del_init(&p->se.group_node);
+ p->on_rq = ONRQ_QUEUED;
+ activate_task(env->dst_rq, p, 0);
+ check_preempt_curr(env->dst_rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6559,6 +6577,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
};
/*
@@ -6608,16 +6627,22 @@ static int load_balance(int this_cpu, struct rq *this_rq,
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- local_irq_save(flags);
- double_rq_lock(env.dst_rq, busiest);
+ raw_spin_lock_irqsave(&busiest->lock, flags);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = move_tasks(&env);
- ld_moved += cur_ld_moved;
- double_rq_unlock(env.dst_rq, busiest);
+ cur_ld_moved = detach_tasks(&env);
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ raw_spin_lock(&env.dst_rq->lock);
+ attach_tasks(&env);
+ raw_spin_unlock(&env.dst_rq->lock);
+ ld_moved += cur_ld_moved;
+ }
+
local_irq_restore(flags);
/*
@@ -6753,7 +6778,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
- * move_tasks).
+ * detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
Hi Kirill,
I'll try to read this series later, just one silly question for now.
On 07/26, Kirill Tkhai wrote:
>
> Patch [2/5] is the main one in the series. It introduces a new state, ONRQ_MIGRATING,
> and teaches the scheduler to understand it (this needs only small changes, mostly
> in try_to_wake_up()). It will be used in the following way:
>
> (we are changing task's rq)
>
> raw_spin_lock(&src_rq->lock);
> dequeue_task(src_rq, p, 0);
> p->on_rq = ONRQ_MIGRATING;
> set_task_cpu(p, dst_cpu);
> raw_spin_unlock(&src_rq->lock);
>
> raw_spin_lock(&dst_rq->lock);
> p->on_rq = ONRQ_QUEUED;
> enqueue_task(dst_rq, p, 0);
> raw_spin_unlock(&dst_rq->lock);
Hmm. And what if the code above doesn't hold p->pi_lock (4/5) and, say,
__sched_setscheduler() does fair_sched_class->rt_sched_class transition
in between?
ONRQ_MIGRATING helps to avoid the wrong dequeue + enqueue, but I am not
sure about check_class_changed().
Say, switched_from_fair() will use dst_rq even if p was never queued on
this rq... This only affects the .decay_count logic, perhaps this is fine,
I simply do not know what this code does.
What about switched_to_rt() ? we lose the push_rt_task() logic... Hmm,
which I can't understand too ;)
And we also lose ENQUEUE_HEAD in this case, but this looks fine.
In short: could you confirm there are no problems here?
Oleg.
On 26.07.2014 23:39, Oleg Nesterov wrote:
> Hi Kirill,
>
> I'll try to read this series later, just one silly question for now.
>
> On 07/26, Kirill Tkhai wrote:
>>
>> Patch [2/5] is the main one in the series. It introduces a new state, ONRQ_MIGRATING,
>> and teaches the scheduler to understand it (this needs only small changes, mostly
>> in try_to_wake_up()). It will be used in the following way:
>>
>> (we are changing task's rq)
>>
>> raw_spin_lock(&src_rq->lock);
>> dequeue_task(src_rq, p, 0);
>> p->on_rq = ONRQ_MIGRATING;
>> set_task_cpu(p, dst_cpu);
>> raw_spin_unlock(&src_rq->lock);
>>
>> raw_spin_lock(&dst_rq->lock);
>> p->on_rq = ONRQ_QUEUED;
>> enqueue_task(dst_rq, p, 0);
>> raw_spin_unlock(&dst_rq->lock);
>
> Hmm. And what if the code above doesn't hold p->pi_lock (4/5) and, say,
> __sched_setscheduler() does fair_sched_class->rt_sched_class transition
> in between?
>
> ONRQ_MIGRATING helps to avoid the wrong dequeue + enqueue, but I am not
> sure about check_class_changed().
>
> Say, switched_from_fair() will use dst_rq even if p was never queued on
> this rq... This only affects the .decay_count logic, perhaps this is fine,
> I simply do not know what this code does.
You're right. We have to check for "task_migrating" in switched_from_fair().
One more place is switched_from_dl().
> What about switched_to_rt() ? we lose the push_rt_task() logic... Hmm,
> which I can't understand too ;)
>
> And we also lose ENQUEUE_HEAD in this case, but this looks fine.
>
> In short: could you confirm there are no problems here?
This could cause some RT/DL imbalance. We need a way to avoid that.
Maybe it would be good to call something like check_class_changed()
at the end of the migration process. We just need to save the task's
class before the migration and compare it with the class afterwards
(for [3/5], __migrate_task()). For [4/5] and [5/5] the class is always
fair_sched_class.
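Roughly something like this (a completely untested sketch just to
illustrate the idea, for __migrate_task() in [3/5]):

/* with src_rq->lock held */
prev_class = p->sched_class;
oldprio = p->prio;
dequeue_task(src_rq, p, 0);
p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, dst_cpu);
raw_spin_unlock(&src_rq->lock);

raw_spin_lock(&dst_rq->lock);
p->on_rq = ONRQ_QUEUED;
enqueue_task(dst_rq, p, 0);
/* the class may have changed while no rq lock was held */
if (p->sched_class != prev_class)
	check_class_changed(dst_rq, p, prev_class, oldprio);
check_preempt_curr(dst_rq, p, 0);
raw_spin_unlock(&dst_rq->lock);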
Thanks for the comments. I'll think about how to fix this in a good way
and update the series.
Kirill
On Sat, Jul 26, 2014 at 06:59:21PM +0400, Kirill Tkhai wrote:
> The profit is that double_rq_lock() is not needed now,
> and this may reduce the latencies in some situations.
> We add a loop at the beginning of set_cpus_allowed_ptr().
> It's like a handmade spinlock, similar to the situation we
> had before: we used to spin on rq->lock, now we spin on the
> "again:" label. Of course, it's worse than an arch-dependent
> spinlock, but we have to have it here.
> @@ -4623,8 +4639,16 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> struct rq *rq;
> unsigned int dest_cpu;
> int ret = 0;
> +again:
> + while (unlikely(task_migrating(p)))
> + cpu_relax();
>
> rq = task_rq_lock(p, &flags);
> + /* Check again with rq locked */
> + if (unlikely(task_migrating(p))) {
> + task_rq_unlock(rq, p, &flags);
> + goto again;
> + }
>
> if (cpumask_equal(&p->cpus_allowed, new_mask))
> goto out;
So I really dislike that, esp since you're now talking of adding more of
this goo all over the place.
I'll ask again, why isn't this in task_rq_lock() and co?
Also, you really need to talk about how the spinning is bounded, otherwise
your two quoted paragraphs above are in contradiction. Now I think you can
actually make an argument that way, so that's good.
On Mon, 28/07/2014 at 10:01 +0200, Peter Zijlstra wrote:
> On Sat, Jul 26, 2014 at 06:59:21PM +0400, Kirill Tkhai wrote:
>
> > The profit is that double_rq_lock() is not needed now,
> > and this may reduce the latencies in some situations.
>
> > We add a loop at the beginning of set_cpus_allowed_ptr().
> > It's like a handmade spinlock, similar to the situation we
> > had before: we used to spin on rq->lock, now we spin on the
> > "again:" label. Of course, it's worse than an arch-dependent
> > spinlock, but we have to have it here.
>
> > @@ -4623,8 +4639,16 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> > struct rq *rq;
> > unsigned int dest_cpu;
> > int ret = 0;
> > +again:
> > + while (unlikely(task_migrating(p)))
> > + cpu_relax();
> >
> > rq = task_rq_lock(p, &flags);
> > + /* Check again with rq locked */
> > + if (unlikely(task_migrating(p))) {
> > + task_rq_unlock(rq, p, &flags);
> > + goto again;
> > + }
> >
> > if (cpumask_equal(&p->cpus_allowed, new_mask))
> > goto out;
>
> So I really dislike that, esp since you're now talking of adding more of
> this goo all over the place.
>
> I'll ask again, why isn't this in task_rq_lock() and co?
I thought this might give a small benefit in cases like priority inheritance, etc.
But since this is spreading throughout the scheduler, I agree with you.
It's better to place this in task_rq_lock() etc. This will solve all
the problems that Oleg and I have discussed.
> Also, you really need to talk about how the spinning is bounded, otherwise
> your two quoted paragraphs above are in contradiction. Now I think you can
> actually make an argument that way, so that's good.
Thanks,
Kirill
On 07/28, Kirill Tkhai wrote:
>
> You're right. We have to check for "task_migrating" in switched_from_fair().
> One more place is switched_from_dl().
Or we can simply check task_migrating() in __sched_setscheduler(), like
set_cpus_allowed() does.
This probably means that you should add this check into task_rq_lock() as
Peter suggests. Or at least add another task_rq_lock_xxx() helper.
Oleg.
On Mon, 28/07/2014 at 13:05 +0400, Kirill Tkhai wrote:
> On Mon, 28/07/2014 at 10:01 +0200, Peter Zijlstra wrote:
> > On Sat, Jul 26, 2014 at 06:59:21PM +0400, Kirill Tkhai wrote:
> >
> > > The profit is that double_rq_lock() is not needed now,
> > > and this may reduce the latencies in some situations.
> >
> > > We add a loop at the beginning of set_cpus_allowed_ptr().
> > > It's like a handmade spinlock, similar to the situation we
> > > had before: we used to spin on rq->lock, now we spin on the
> > > "again:" label. Of course, it's worse than an arch-dependent
> > > spinlock, but we have to have it here.
> >
> > > @@ -4623,8 +4639,16 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> > > struct rq *rq;
> > > unsigned int dest_cpu;
> > > int ret = 0;
> > > +again:
> > > + while (unlikely(task_migrating(p)))
> > > + cpu_relax();
> > >
> > > rq = task_rq_lock(p, &flags);
> > > + /* Check again with rq locked */
> > > + if (unlikely(task_migrating(p))) {
> > > + task_rq_unlock(rq, p, &flags);
> > > + goto again;
> > > + }
> > >
> > > if (cpumask_equal(&p->cpus_allowed, new_mask))
> > > goto out;
> >
> > So I really dislike that, esp since you're now talking of adding more of
> > this goo all over the place.
> >
> > I'll ask again, why isn't this in task_rq_lock() and co?
>
> I thought this might give a small benefit in cases like priority inheritance, etc.
> But since this is spreading throughout the scheduler, I agree with you.
> It's better to place this in task_rq_lock() etc. This will solve all
> the problems that Oleg and I have discussed.
>
> > Also, you really need to talk about how the spinning is bounded, otherwise
> > your two quoted paragraphs above are in contradiction. Now I think you can
> > actually make an argument that way, so that's good.
How about this? Everything is inside task_rq_lock() now. The patch
became much less.
From: Kirill Tkhai <[email protected]>
sched: Teach scheduler to understand ONRQ_MIGRATING state
This is a new on_rq state for the case when a task is migrating
from one rq (src_rq) to another (dst_rq), and there is no need
to have both RQs locked at the same time.
We will use the state this way:
raw_spin_lock(&src_rq->lock);
dequeue_task(src_rq, p, 0);
p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, dst_cpu);
raw_spin_unlock(&src_rq->lock);
raw_spin_lock(&dst_rq->lock);
p->on_rq = ONRQ_QUEUED;
enqueue_task(dst_rq, p, 0);
raw_spin_unlock(&dst_rq->lock);
The profit is that double_rq_lock() is not needed now,
and this may reduce the latencies in some situations.
v2.1: Place task_migrating() into task_rq_lock() and
__task_rq_lock().
Signed-off-by: Kirill Tkhai <[email protected]>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26aa7bc..00d7bcc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -333,7 +333,8 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) &&
+ !task_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
}
@@ -352,7 +353,8 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) &&
+ !task_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
@@ -1678,7 +1680,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
- if (task_queued(p) && ttwu_remote(p, wake_flags))
+ if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
#ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e5a9b6d..f6773d7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -17,6 +17,7 @@ struct rq;
/* .on_rq states of struct task_struct: */
#define ONRQ_QUEUED 1
+#define ONRQ_MIGRATING 2
extern __read_mostly int scheduler_running;
@@ -950,6 +951,11 @@ static inline int task_queued(struct task_struct *p)
return p->on_rq == ONRQ_QUEUED;
}
+static inline int task_migrating(struct task_struct *p)
+{
+ return p->on_rq == ONRQ_MIGRATING;
+}
+
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
On Tue, Jul 29, 2014 at 01:53:02PM +0400, Kirill Tkhai wrote:
> From: Kirill Tkhai <[email protected]>
>
> sched: Teach scheduler to understand ONRQ_MIGRATING state
>
> This is a new on_rq state for the case when a task is migrating
> from one rq (src_rq) to another (dst_rq), and there is no need
> to have both RQs locked at the same time.
>
> We will use the state this way:
>
> raw_spin_lock(&src_rq->lock);
> dequeue_task(src_rq, p, 0);
> p->on_rq = ONRQ_MIGRATING;
> set_task_cpu(p, dst_cpu);
> raw_spin_unlock(&src_rq->lock);
>
> raw_spin_lock(&dst_rq->lock);
> p->on_rq = ONRQ_QUEUED;
> enqueue_task(dst_rq, p, 0);
> raw_spin_unlock(&dst_rq->lock);
>
> The profit is that double_rq_lock() is not needed now,
> and this may reduce the latencies in some situations.
You forgot to explain how the spinning on task_migrating() is bounded,
so that the beginning and the end of your changelog don't contradict each other.
> Signed-off-by: Kirill Tkhai <[email protected]>
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 26aa7bc..00d7bcc 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -333,7 +333,8 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
> for (;;) {
> rq = task_rq(p);
> raw_spin_lock(&rq->lock);
> - if (likely(rq == task_rq(p)))
> + if (likely(rq == task_rq(p) &&
> + !task_migrating(p)))
> return rq;
> raw_spin_unlock(&rq->lock);
> }
I would prefer an extra spin-loop like so, that avoids us spinning on
the rq-lock, which serves no purpose.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2676866b4394..1e65a0bdbbc3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -331,9 +331,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
lockdep_assert_held(&p->pi_lock);
for (;;) {
+ while (task_migrating(p))
+ cpu_relax();
+
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) && !task_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
}
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e5a9b6d..f6773d7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -17,6 +17,7 @@ struct rq;
>
> /* .on_rq states of struct task_struct: */
The 'normal' way to write that is: task_struct::on_rq
> #define ONRQ_QUEUED 1
> +#define ONRQ_MIGRATING 2
>
> extern __read_mostly int scheduler_running;
>
On Sat, Jul 26, 2014 at 06:59:52PM +0400, Kirill Tkhai wrote:
> Keep on_rq = ONRQ_MIGRATING, while task is migrating, instead.
>
> v2: Added missed check_preempt_curr() in attach_tasks().
vN thingies go below the ---, they're pointless to preserve. Which then
turns this Changelog into something that's entirely too short.
> Signed-off-by: Kirill Tkhai <[email protected]>
> ---
> kernel/sched/fair.c | 85 +++++++++++++++++++++++++++++++++------------------
> 1 file changed, 55 insertions(+), 30 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a1b74f2..a47fb3f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4706,9 +4706,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> return;
>
> /*
> - * This is possible from callers such as move_task(), in which we
> - * unconditionally check_prempt_curr() after an enqueue (which may have
> - * lead to a throttle). This both saves work and prevents false
> + * This is possible from callers, in which we unconditionally
> + * check_prempt_curr() after an enqueue (which may have lead
> + * to a throttle). This both saves work and prevents false
> * next-buddy nomination below.
> */
It would be good to retain the reference to code that does that.
> if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
> @@ -5114,20 +5114,22 @@ struct lb_env {
> unsigned int loop_max;
>
> enum fbq_type fbq_type;
> + struct list_head tasks;
> };
>
> /*
> - * move_task - move a task from one runqueue to another runqueue.
> - * Both runqueues must be locked.
> + * detach_task - detach a task from its runqueue for migration.
> + * The runqueue must be locked.
> */
> -static void move_task(struct task_struct *p, struct lb_env *env)
> +static void detach_task(struct task_struct *p, struct lb_env *env)
> {
> deactivate_task(env->src_rq, p, 0);
> + list_add(&p->se.group_node, &env->tasks);
> + p->on_rq = ONRQ_MIGRATING;
> set_task_cpu(p, env->dst_cpu);
> - activate_task(env->dst_rq, p, 0);
> - check_preempt_curr(env->dst_rq, p, 0);
> }
>
> +
We don't need more whitespace here, do we?
> /*
> * Is this task likely cache-hot?
> *
> @@ -5375,18 +5377,18 @@ static struct task_struct *detach_one_task(struct lb_env *env)
> static const unsigned int sched_nr_migrate_break = 32;
>
> /*
> - * move_tasks tries to move up to imbalance weighted load from busiest to
> - * this_rq, as part of a balancing operation within domain "sd".
> - * Returns 1 if successful and 0 otherwise.
> + * detach_tasks tries to detach up to imbalance weighted load from busiest_rq,
> + * as part of a balancing operation within domain "sd".
> + * Returns number of detached tasks if successful and 0 otherwise.
> *
> - * Called with both runqueues locked.
> + * Called with env->src_rq locked.
We should avoid comments like that, and instead use assertions to
enforce them.
> */
> -static int move_tasks(struct lb_env *env)
> +static int detach_tasks(struct lb_env *env)
> {
> struct list_head *tasks = &env->src_rq->cfs_tasks;
> struct task_struct *p;
> unsigned long load;
> - int pulled = 0;
> + int detached = 0;
Like so:
lockdep_assert_held(&env->src_rq->lock);
>
> if (env->imbalance <= 0)
> return 0;
This one could use a comment to say it's the complement to
detach_tasks()
> +static void attach_tasks(struct lb_env *env)
> +{
> + struct list_head *tasks = &env->tasks;
> + struct task_struct *p;
And here we obviously want:
lockdep_assert_held(&env->dst_rq->lock);
> + while (!list_empty(tasks)) {
> + p = list_first_entry(tasks, struct task_struct, se.group_node);
> + BUG_ON(task_rq(p) != env->dst_rq);
> + list_del_init(&p->se.group_node);
> + p->on_rq = ONRQ_QUEUED;
> + activate_task(env->dst_rq, p, 0);
> + check_preempt_curr(env->dst_rq, p, 0);
> + }
> }
> @@ -6608,16 +6627,22 @@ static int load_balance(int this_cpu, struct rq *this_rq,
> env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
>
> more_balance:
> + raw_spin_lock_irqsave(&busiest->lock, flags);
>
> /*
> * cur_ld_moved - load moved in current iteration
> * ld_moved - cumulative load moved across iterations
> */
> + cur_ld_moved = detach_tasks(&env);
> + raw_spin_unlock(&busiest->lock);
> +
> + if (cur_ld_moved) {
> + raw_spin_lock(&env.dst_rq->lock);
> + attach_tasks(&env);
> + raw_spin_unlock(&env.dst_rq->lock);
> + ld_moved += cur_ld_moved;
> + }
> +
> local_irq_restore(flags);
I think somewhere here would be a good place to put a comment on how all
this is still 'bounded'.
On 07/29, Kirill Tkhai wrote:
>
> How about this? Everything is inside task_rq_lock() now. The patch
> became much less.
And with this change task_migrating() is not possible under
task_rq_lock() or __task_rq_lock(). This means that 1/5 can be simplified
too.
__migrate_swap_task() is probably the notable exception...
Off-topic, but it takes 2 ->pi_lock's. This means it can deadlock with
try_to_wake_up_local() (if a 3rd process does ttwu() and waits for
->on_cpu == 0). But I guess __migrate_swap_task() should not play with
PF_WQ_WORKER threads.
Oleg.
On Tue, 29/07/2014 at 18:19 +0200, Oleg Nesterov wrote:
> On 07/29, Kirill Tkhai wrote:
> >
> > How about this? Everything is inside task_rq_lock() now. The patch
> > became much less.
>
> And with this change task_migrating() is not possible under
> task_rq_lock() or __task_rq_lock(). This means that 1/5 can be simplified
> too.
It seems to me it won't be useless anyway. In every place we make it explicit
that the task is exactly queued or dequeued, so it's not necessary to remember
whether it might be migrating. This is a cleanup, though a big one.
> __migrate_swap_task() is probably the notable exception...
>
> Off-topic, but it takes 2 ->pi_lock's. This means it can deadlock with
> try_to_wake_up_local() (if a 3rd process does ttwu() and waits for
> ->on_cpu == 0). But I guess __migrate_swap_task() should not play with
> PF_WQ_WORKER threads.
Hmm.. I'm surprised, PF_WQ_WORKER threads may be unbound. But it seems
we still can't pass them to try_to_wake_up_local.
Regards,
Kirill
On 07/30, Kirill Tkhai wrote:
>
> On Tue, 29/07/2014 at 18:19 +0200, Oleg Nesterov wrote:
> > On 07/29, Kirill Tkhai wrote:
> > >
> > > How about this? Everything is inside task_rq_lock() now. The patch
> > > became much less.
> >
> > And with this change task_migrating() is not possible under
> > task_rq_lock() or __task_rq_lock(). This means that 1/5 can be simplified
> > too.
>
> It seems to me it won't be useless anyway. In every place we make it explicit
> that the task is exactly queued or dequeued, so it's not necessary to remember
> whether it might be migrating. This is a cleanup, though a big one.
But, otoh, when you read the code which does "if (task_queued())" it is not
clear whether this code knows that task_migrating() is not possible, or we
should treat the task_migrating() state specially.
But I agree, this is subjective, I leave this to you and Peter.
> > __migrate_swap_task() is probably the notable exception...
> >
> > Off-topic, but it takes 2 ->pi_lock's. This means it can deadlock with
> > try_to_wake_up_local() (if a 3rd process does ttwu() and waits for
> > ->on_cpu == 0). But I guess __migrate_swap_task() should not play with
> > PF_WQ_WORKER threads.
>
> Hmm.. I'm surprised, PF_WQ_WORKER threads may be unbound. But it seems
> we still can't pass them to try_to_wake_up_local.
Why? See wq_worker_sleeping/try_to_wake_up_local in __schedule().
But perhaps I misunderstood you, and probably I was not clear. If
wq_worker_sleeping() returns !NULL then both tasks should be local, surely
we do not want to migrate them.
Oleg.
On Wed, 30/07/2014 at 16:41 +0200, Oleg Nesterov wrote:
> On 07/30, Kirill Tkhai wrote:
> >
> On Tue, 29/07/2014 at 18:19 +0200, Oleg Nesterov wrote:
> > > On 07/29, Kirill Tkhai wrote:
> > > >
> > > > How about this? Everything is inside task_rq_lock() now. The patch
> > > > became much less.
> > >
> > > And with this change task_migrating() is not possible under
> > > task_rq_lock() or __task_rq_lock(). This means that 1/5 can be simplified
> > > too.
> >
> > It seems to me it won't be useless anyway. In every place we make it explicit
> > that the task is exactly queued or dequeued, so it's not necessary to remember
> > whether it might be migrating. This is a cleanup, though a big one.
>
> But, otoh, when you read the code which does "if (task_queued())" it is not
> clear whether this code knows that task_migrating() is not possible, or we
> should treat the task_migrating() state specially.
>
> But I agree, this is subjective, I leave this to you and Peter.
>
> > > __migrate_swap_task() is probably the notable exception...
> > >
> > > Off-topic, but it takes 2 ->pi_lock's. This means it can deadlock with
> > > try_to_wake_up_local() (if a 3rd process does ttwu() and waits for
> > > ->on_cpu == 0). But I guess __migrate_swap_task() should not play with
> > > PF_WQ_WORKER threads.
> >
> > Hmm.. I'm surprised, PF_WQ_WORKER threads may be unbound. But it seems
> > we still can't pass them to try_to_wake_up_local.
>
> Why? See wq_worker_sleeping/try_to_wake_up_local in __schedule().
>
> But perhaps I misunderstood you, and probably I was not clear. If
> wq_worker_sleeping() returns !NULL then both tasks should be local, surely
> we do not want to migrate them.
I meant that I was surprised that PF_WQ_WORKER threads may be unbound...
I hope wq_worker_sleeping() does not return them.
Nothing important from me about this question.