2014-07-30 21:44:37

by Kirill Tkhai

Subject: [PATCH v3 0/5] sched: Add on_rq states and remove several double rq locks

This series aims to get rid of some places where locks of two RQs are held
at the same time.

Patch [1/5] is a preparation/cleanup. It replaces the old check
(task_struct::on_rq == 1) with the new one (task_struct::on_rq == ONRQ_QUEUED)
everywhere. No functional changes.

Patch [2/5] is the main one in the series. It introduces the new ONRQ_MIGRATING
state and teaches the scheduler to understand it (we need small changes in
try_to_wake_up() and the task_rq_lock() family). This will be used in the
following way:

(we are changing task's rq)

raw_spin_lock(&src_rq->lock);

p = ...; /* Some src_rq task */

dequeue_task(src_rq, p, 0);
p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, dst_cpu);
raw_spin_unlock(&src_rq->lock);

/*
* Now p is dequeued, and both
* RQ locks are unlocked, but
* its on_rq is not zero.
* Nobody can manipulate p
* while it is migrating,
* even though the spinlocks
* are unlocked.
*/

raw_spin_lock(&dst_rq->lock);
p->on_rq = ONRQ_QUEUED;
enqueue_task(dst_rq, p, 0);
raw_spin_unlock(&dst_rq->lock);

Patches [3,4,5/5] remove the double locks and use the new ONRQ_MIGRATING state.
They allow three or four functions to be called without both RQ locks held,
which looks safe to me.

The benefit is that double_rq_lock() is no longer needed, so we reduce the
total time during which RQs are locked.
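
After patch [2/5], the locking primitives themselves wait out the
migration, so a parallel caller cannot touch the task while it is
in flight. Condensed from that patch, __task_rq_lock() becomes
roughly:

static inline struct rq *__task_rq_lock(struct task_struct *p)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		/* Don't even try while the task is between two RQs */
		while (unlikely(task_migrating(p)))
			cpu_relax();

		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/* Re-check: p may have moved or started migrating */
		if (likely(rq == task_rq(p) && !task_migrating(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}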

---

Kirill Tkhai (5):
sched: Wrapper for checking task_struct::on_rq
sched: Teach scheduler to understand ONRQ_MIGRATING state
sched: Remove double_rq_lock() from __migrate_task()
sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
sched/fair: Remove double_lock_balance() from load_balance()


kernel/sched/core.c | 115 +++++++++++++++++--------------
kernel/sched/deadline.c | 14 ++--
kernel/sched/fair.c | 172 +++++++++++++++++++++++++++++++---------------
kernel/sched/rt.c | 16 ++--
kernel/sched/sched.h | 13 +++
kernel/sched/stop_task.c | 2 -
6 files changed, 211 insertions(+), 121 deletions(-)

--
Signed-off-by: Kirill Tkhai <[email protected]>


2014-07-30 21:42:56

by Kirill Tkhai

Subject: [PATCH v3 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state

This is a new state which will be used to indicate that
a task is in the process of migrating between two RQs. It
allows us to get rid of double_rq_lock(), which we previously
used to change the rq of a queued task.

Let's consider an example. To move a task between src_rq
and dst_rq we do the following:

raw_spin_lock(&src_rq->lock);
/* p is a task which is queued on src_rq */
p = ...;

dequeue_task(src_rq, p, 0);
p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, dst_cpu);
raw_spin_unlock(&src_rq->lock);

/*
* Both RQ locks are unlocked here.
* Task p is dequeued from src_rq,
* but its on_rq is not zero.
*/

raw_spin_lock(&dst_rq->lock);
p->on_rq = ONRQ_QUEUED;
enqueue_task(dst_rq, p, 0);
raw_spin_unlock(&dst_rq->lock);

When p->on_rq is ONRQ_MIGRATING, the task is considered
to be "migrating", and parallel scheduler actions on it
are not available to other callers. A parallel caller
spins until the migration is completed.

The unavailable actions are changing of cpu affinity,
changing of priority, etc; in other words, all the
per-task functionality which used to require
task_rq(p)->lock before.

To implement ONRQ_MIGRATING support we primarily rely
on the following fact. Most scheduler users (from which
we are protecting a migrating task) use task_rq_lock()
and __task_rq_lock() to take the lock of task_rq(p).
These primitives know that the task's cpu may change,
and they spin until they hold the lock of the right RQ.
We add one more condition to them, so they also spin
until the migration is finished.
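
For illustration, a simplified (hypothetical) caller which changes
some per-task state needs no extra code to cope with migration,
because task_rq_lock() does the spinning for it:

	int queued;
	unsigned long flags;
	struct rq *rq;

	/* Spins while p->on_rq == ONRQ_MIGRATING */
	rq = task_rq_lock(p, &flags);

	queued = task_queued(p);
	if (queued)
		dequeue_task(rq, p, 0);

	/* ... change priority, affinity, etc. of p ... */

	if (queued)
		enqueue_task(rq, p, 0);

	task_rq_unlock(rq, p, &flags);
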
---
kernel/sched/core.c | 14 +++++++++++---
kernel/sched/sched.h | 6 ++++++
2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26aa7bc..75b0517 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -331,9 +331,13 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
lockdep_assert_held(&p->pi_lock);

for (;;) {
+ while (unlikely(task_migrating(p)))
+ cpu_relax();
+
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) &&
+ !task_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
}
@@ -349,10 +353,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
struct rq *rq;

for (;;) {
+ while (unlikely(task_migrating(p)))
+ cpu_relax();
+
raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) &&
+ !task_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
@@ -1678,7 +1686,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);

- if (task_queued(p) && ttwu_remote(p, wake_flags))
+ if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;

#ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2c83b6e..ac7c1c8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -17,6 +17,7 @@ struct rq;

/* task_struct::on_rq states: */
#define ONRQ_QUEUED 1
+#define ONRQ_MIGRATING 2

extern __read_mostly int scheduler_running;

@@ -950,6 +951,11 @@ static inline int task_queued(struct task_struct *p)
return p->on_rq == ONRQ_QUEUED;
}

+static inline int task_migrating(struct task_struct *p)
+{
+ return p->on_rq == ONRQ_MIGRATING;
+}
+
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif

2014-07-30 21:43:09

by Kirill Tkhai

Subject: [PATCH v3 3/5] sched: Remove double_rq_lock() from __migrate_task()

Let's use ONRQ_MIGRATING instead.

Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/core.c | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75b0517..9494daf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4681,20 +4681,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
*/
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- struct rq *rq_dest, *rq_src;
+ struct rq *rq;
int ret = 0;

if (unlikely(!cpu_active(dest_cpu)))
return ret;

- rq_src = cpu_rq(src_cpu);
- rq_dest = cpu_rq(dest_cpu);
+ rq = cpu_rq(src_cpu);

raw_spin_lock(&p->pi_lock);
- double_rq_lock(rq_src, rq_dest);
+ raw_spin_lock(&rq->lock);
/* Already moved. */
if (task_cpu(p) != src_cpu)
goto done;
+
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
goto fail;
@@ -4704,15 +4704,22 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* placed properly.
*/
if (task_queued(p)) {
- dequeue_task(rq_src, p, 0);
+ dequeue_task(rq, p, 0);
+ p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, dest_cpu);
- enqueue_task(rq_dest, p, 0);
- check_preempt_curr(rq_dest, p, 0);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(dest_cpu);
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = ONRQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
}
done:
ret = 1;
fail:
- double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
return ret;
}

2014-07-30 21:43:16

by Kirill Tkhai

Subject: [PATCH v3 5/5] sched/fair: Remove double_lock_balance() from load_balance()

Keep on_rq set to ONRQ_MIGRATING while the task is migrating, instead.
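
The resulting flow in load_balance() is roughly (condensed from the
diff below):

	raw_spin_lock_irqsave(&busiest->lock, flags);

	/* Dequeue tasks onto env.tasks, each marked ONRQ_MIGRATING */
	cur_ld_moved = detach_tasks(&env);

	raw_spin_unlock(&busiest->lock);

	if (cur_ld_moved) {
		raw_spin_lock(&env.dst_rq->lock);
		/* Enqueue them on dst_rq and mark them ONRQ_QUEUED */
		attach_tasks(&env);
		raw_spin_unlock(&env.dst_rq->lock);
	}

	local_irq_restore(flags);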

Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/fair.c | 99 ++++++++++++++++++++++++++++++++++++---------------
1 file changed, 70 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e2ee4b..475d05e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4706,7 +4706,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;

/*
- * This is possible from callers such as move_task(), in which we
+ * This is possible from callers such as attach_tasks(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@@ -5114,18 +5114,20 @@ struct lb_env {
unsigned int loop_max;

enum fbq_type fbq_type;
+ struct list_head tasks;
};

/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
+ * detach_task - detach a task for migration specified in env
*/
-static void move_task(struct task_struct *p, struct lb_env *env)
+static void detach_task(struct task_struct *p, struct lb_env *env)
{
+ lockdep_assert_held(&env->src_rq->lock);
+
deactivate_task(env->src_rq, p, 0);
+ list_add(&p->se.group_node, &env->tasks);
+ p->on_rq = ONRQ_MIGRATING;
set_task_cpu(p, env->dst_cpu);
- activate_task(env->dst_rq, p, 0);
- check_preempt_curr(env->dst_rq, p, 0);
}

/*
@@ -5363,9 +5365,9 @@ static struct task_struct *detach_one_task(struct lb_env *env)

/*
* Right now, this is only the second place where
- * lb_gained[env->idle] is updated (other is move_tasks)
+ * lb_gained[env->idle] is updated (other is detach_tasks)
* so we can safely collect stats here rather than
- * inside move_tasks().
+ * inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
return p;
@@ -5376,18 +5378,18 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;

/*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
+ * detach_tasks tries to detach up to imbalance weighted load from busiest_rq,
+ * as part of a balancing operation within domain "sd".
+ * Returns number of detached tasks if successful and 0 otherwise.
*/
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
- int pulled = 0;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);

if (env->imbalance <= 0)
return 0;
@@ -5418,14 +5420,15 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;

- move_task(p, env);
- pulled++;
+ detach_task(p, env);
+
+ detached++;
env->imbalance -= load;

#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
+ * kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
@@ -5445,13 +5448,31 @@ static int move_tasks(struct lb_env *env)
}

/*
- * Right now, this is one of only two places move_task() is called,
- * so we can safely collect move_task() stats here rather than
- * inside move_task().
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], detached);

- return pulled;
+ return detached;
+}
+
+/* Attach tasks previously detached in detach_tasks() */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+
+ lockdep_assert_held(&env->dst_rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ BUG_ON(task_rq(p) != env->dst_rq);
+ list_del_init(&p->se.group_node);
+ p->on_rq = ONRQ_QUEUED;
+ activate_task(env->dst_rq, p, 0);
+ check_preempt_curr(env->dst_rq, p, 0);
+ }
}

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6561,6 +6582,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
};

/*
@@ -6610,16 +6632,35 @@ static int load_balance(int this_cpu, struct rq *this_rq,
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

more_balance:
- local_irq_save(flags);
- double_rq_lock(env.dst_rq, busiest);
+ raw_spin_lock_irqsave(&busiest->lock, flags);

/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = move_tasks(&env);
- ld_moved += cur_ld_moved;
- double_rq_unlock(env.dst_rq, busiest);
+ cur_ld_moved = detach_tasks(&env);
+
+ /*
+ * We've detached some tasks from busiest_rq. Every
+ * task is marked "ONRQ_MIGRATING", so we can safely
+ * unlock busiest->lock, and we are able to be sure
+ * that nobody can manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
+ */
+
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ raw_spin_lock(&env.dst_rq->lock);
+ /*
+ * Attach the tasks to env->dst_rq
+ * and mark them "ONRQ_QUEUED".
+ */
+ attach_tasks(&env);
+ raw_spin_unlock(&env.dst_rq->lock);
+ ld_moved += cur_ld_moved;
+ }
+
local_irq_restore(flags);

/*
@@ -6755,7 +6796,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
- * move_tasks).
+ * detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;

2014-07-30 21:43:40

by Kirill Tkhai

Subject: [PATCH v3 4/5] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()

Bad situation:

double_lock_balance() drops the busiest_rq lock. The busiest_rq is *busiest*:
it has a lot of tasks and context switches. We drop the lock
and then have to wait to take it again.

Let's just detach the task and unlock busiest_rq once and for all!

Warning: this allows can_migrate_task(), throttled_lb_pair(), and task_hot()
to be called without both RQ locks held. I've added lockdep asserts to point
this out.
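
The new flow in active_load_balance_cpu_stop() is roughly (condensed
from the diff below):

	raw_spin_lock_irq(&busiest_rq->lock);

	/* Dequeues one task and sets its on_rq to ONRQ_MIGRATING */
	p = detach_one_task(&env);

	busiest_rq->active_balance = 0;
	raw_spin_unlock(&busiest_rq->lock);

	if (p) {
		raw_spin_lock(&target_rq->lock);
		p->on_rq = ONRQ_QUEUED;
		activate_task(target_rq, p, 0);
		check_preempt_curr(target_rq, p, 0);
		raw_spin_unlock(&target_rq->lock);
	}

	local_irq_enable();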

Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++----------------
1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 48420ae..8e2ee4b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3297,6 +3297,8 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
* Ensure that neither of the group entities corresponding to src_cpu or
* dest_cpu are members of a throttled hierarchy when performing group
* load-balance operations.
+ *
+ * Note: RQs may be unlocked.
*/
static inline int throttled_lb_pair(struct task_group *tg,
int src_cpu, int dest_cpu)
@@ -5133,6 +5135,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;

+ lockdep_assert_held(&env->src_rq->lock);
+
if (p->sched_class != &fair_sched_class)
return 0;

@@ -5252,6 +5256,9 @@ static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
@@ -5336,30 +5343,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
}

/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_one_task tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
+ * Returns a task if successful and NULL otherwise.
*/
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p, *n;

+ lockdep_assert_held(&env->src_rq->lock);
+
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;

- move_task(p, env);
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = ONRQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+
/*
- * Right now, this is only the second place move_task()
- * is called, so we can safely collect move_task()
- * stats here rather than inside move_task().
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (other is move_tasks)
+ * so we can safely collect stats here rather than
+ * inside move_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
- return 1;
+ return p;
}
- return 0;
+ return NULL;
}

static const unsigned int sched_nr_migrate_break = 32;
@@ -6914,6 +6925,7 @@ static int active_load_balance_cpu_stop(void *data)
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
+ struct task_struct *p = NULL;

raw_spin_lock_irq(&busiest_rq->lock);

@@ -6933,9 +6945,6 @@ static int active_load_balance_cpu_stop(void *data)
*/
BUG_ON(busiest_rq == target_rq);

- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
-
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
@@ -6956,16 +6965,28 @@ static int active_load_balance_cpu_stop(void *data)

schedstat_inc(sd, alb_count);

- if (move_one_task(&env))
+ p = detach_one_task(&env);
+ if (p)
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
}
rcu_read_unlock();
- double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock_irq(&busiest_rq->lock);
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (p) {
+ raw_spin_lock(&target_rq->lock);
+ BUG_ON(task_rq(p) != target_rq);
+ p->on_rq = ONRQ_QUEUED;
+ activate_task(target_rq, p, 0);
+ check_preempt_curr(target_rq, p, 0);
+ raw_spin_unlock(&target_rq->lock);
+ }
+
+ local_irq_enable();
+
return 0;
}

2014-07-30 21:42:53

by Kirill Tkhai

Subject: [PATCH v3 1/5] sched: Wrapper for checking task_struct::on_rq

Implement task_queued() and use it everywhere instead of the raw on_rq check.
No functional changes.

The only exception is that we do not use the wrapper in check_for_tasks(),
because that would require exporting task_queued() in global header files.
The next patch in the series would bring it back there anyway, so it doesn't matter.

Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/core.c | 82 +++++++++++++++++++++++-----------------------
kernel/sched/deadline.c | 14 ++++----
kernel/sched/fair.c | 22 ++++++------
kernel/sched/rt.c | 16 ++++-----
kernel/sched/sched.h | 7 ++++
kernel/sched/stop_task.c | 2 +
6 files changed, 75 insertions(+), 68 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2676866..26aa7bc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1043,7 +1043,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+ if (task_queued(rq->curr) && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}

@@ -1088,7 +1088,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- if (p->on_rq) {
+ if (task_queued(p)) {
struct rq *src_rq, *dst_rq;

src_rq = task_rq(p);
@@ -1214,7 +1214,7 @@ static int migration_cpu_stop(void *data);
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
unsigned long flags;
- int running, on_rq;
+ int running, queued;
unsigned long ncsw;
struct rq *rq;

@@ -1252,7 +1252,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->on_rq;
+ queued = task_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1284,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(on_rq)) {
+ if (unlikely(queued)) {
ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1478,7 +1478,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
- p->on_rq = 1;
+ p->on_rq = ONRQ_QUEUED;

/* if a worker is waking up, notify workqueue */
if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1537,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
int ret = 0;

rq = __task_rq_lock(p);
- if (p->on_rq) {
+ if (task_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
@@ -1678,7 +1678,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);

- if (p->on_rq && ttwu_remote(p, wake_flags))
+ if (task_queued(p) && ttwu_remote(p, wake_flags))
goto stat;

#ifdef CONFIG_SMP
@@ -1742,7 +1742,7 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;

- if (!p->on_rq)
+ if (!task_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);

ttwu_do_wakeup(rq, p, 0);
@@ -2095,7 +2095,7 @@ void wake_up_new_task(struct task_struct *p)
init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- p->on_rq = 1;
+ p->on_rq = ONRQ_QUEUED;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -2444,7 +2444,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
- if (task_current(rq, p) && p->on_rq) {
+ if (task_current(rq, p) && task_queued(p)) {
update_rq_clock(rq);
ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
@@ -2490,7 +2490,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
*/
- if (!p->on_cpu || !p->on_rq)
+ if (!p->on_cpu || !task_queued(p))
return p->se.sum_exec_runtime;
#endif

@@ -2794,7 +2794,7 @@ static void __sched __schedule(void)
switch_count = &prev->nvcsw;
}

- if (prev->on_rq || rq->skip_clock_update < 0)
+ if (task_queued(prev) || rq->skip_clock_update < 0)
update_rq_clock(rq);

next = pick_next_task(rq, prev);
@@ -2959,7 +2959,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, on_rq, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = 0;
struct rq *rq;
const struct sched_class *prev_class;

@@ -2988,9 +2988,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->on_rq;
+ queued = task_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -3030,7 +3030,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)

if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, enqueue_flag);

check_class_changed(rq, p, prev_class, oldprio);
@@ -3041,7 +3041,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)

void set_user_nice(struct task_struct *p, long nice)
{
- int old_prio, delta, on_rq;
+ int old_prio, delta, queued;
unsigned long flags;
struct rq *rq;

@@ -3062,8 +3062,8 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);

p->static_prio = NICE_TO_PRIO(nice);
@@ -3072,7 +3072,7 @@ void set_user_nice(struct task_struct *p, long nice)
p->prio = effective_prio(p);
delta = p->prio - old_prio;

- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
/*
* If the task increased its priority or is running and
@@ -3344,7 +3344,7 @@ static int __sched_setscheduler(struct task_struct *p,
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, on_rq, running;
+ int retval, oldprio, oldpolicy = -1, queued, running;
int policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
@@ -3541,9 +3541,9 @@ static int __sched_setscheduler(struct task_struct *p,
return 0;
}

- on_rq = p->on_rq;
+ queued = task_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -3553,7 +3553,7 @@ static int __sched_setscheduler(struct task_struct *p,

if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (queued) {
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
@@ -4568,7 +4568,7 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();

rq->curr = rq->idle = idle;
- idle->on_rq = 1;
+ idle->on_rq = ONRQ_QUEUED;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4645,7 +4645,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;

dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (p->on_rq) {
+ if (task_queued(p)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &flags);
@@ -4695,7 +4695,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->on_rq) {
+ if (task_queued(p)) {
dequeue_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
enqueue_task(rq_dest, p, 0);
@@ -4736,13 +4736,13 @@ void sched_setnuma(struct task_struct *p, int nid)
{
struct rq *rq;
unsigned long flags;
- bool on_rq, running;
+ bool queued, running;

rq = task_rq_lock(p, &flags);
- on_rq = p->on_rq;
+ queued = task_queued(p);
running = task_current(rq, p);

- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -4751,7 +4751,7 @@ void sched_setnuma(struct task_struct *p, int nid)

if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
@@ -7117,13 +7117,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
.sched_policy = SCHED_NORMAL,
};
int old_prio = p->prio;
- int on_rq;
+ int queued;

- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
__setscheduler(rq, p, &attr);
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
resched_curr(rq);
}
@@ -7311,16 +7311,16 @@ void sched_offline_group(struct task_group *tg)
void sched_move_task(struct task_struct *tsk)
{
struct task_group *tg;
- int on_rq, running;
+ int queued, running;
unsigned long flags;
struct rq *rq;

rq = task_rq_lock(tsk, &flags);

running = task_current(rq, tsk);
- on_rq = tsk->on_rq;
+ queued = task_queued(tsk);

- if (on_rq)
+ if (queued)
dequeue_task(rq, tsk, 0);
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
@@ -7333,14 +7333,14 @@ void sched_move_task(struct task_struct *tsk)

#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, on_rq);
+ tsk->sched_class->task_move_group(tsk, queued);
else
#endif
set_task_rq(tsk, task_cpu(tsk));

if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, tsk, 0);

task_rq_unlock(rq, tsk, &flags);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce13..4cc3b14 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
update_rq_clock(rq);
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
- if (p->on_rq) {
+ if (task_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
check_preempt_curr_dl(rq, p, 0);
@@ -1030,7 +1030,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
* means a stop task can slip in, in which case we need to
* re-start task selection.
*/
- if (rq->stop && rq->stop->on_rq)
+ if (rq->stop && task_queued(rq->stop))
return RETRY_TASK;
}

@@ -1257,7 +1257,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu,
&task->cpus_allowed) ||
- task_running(rq, task) || !task->on_rq)) {
+ task_running(rq, task) || !task_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -1296,7 +1296,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);

- BUG_ON(!p->on_rq);
+ BUG_ON(!task_queued(p));
BUG_ON(!dl_task(p));

return p;
@@ -1443,7 +1443,7 @@ static int pull_dl_task(struct rq *this_rq)
dl_time_before(p->dl.deadline,
this_rq->dl.earliest_dl.curr))) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_queued(p));

/*
* Then we pull iff p has actually an earlier
@@ -1596,7 +1596,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (unlikely(p->dl.dl_throttled))
return;

- if (p->on_rq && rq->curr != p) {
+ if (task_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
@@ -1614,7 +1614,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (p->on_rq || rq->curr == p) {
+ if (task_queued(p) || rq->curr == p) {
#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86..48420ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7465,7 +7465,7 @@ static void task_fork_fair(struct task_struct *p)
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->se.on_rq)
+ if (!task_queued(p))
return;

/*
@@ -7490,11 +7490,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it's on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !on_rq, then only when
+ * If it's queued, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !queued, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!p->on_rq && p->state != TASK_RUNNING) {
+ if (!task_queued(p) && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -7529,7 +7529,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!se->on_rq)
+ if (!task_queued(p))
return;

/*
@@ -7575,7 +7575,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}

#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
@@ -7594,7 +7594,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* fair sleeper stuff for the first placement, but who cares.
*/
/*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * When !queued, vruntime of the task has usually NOT been normalized.
* But there are some cases where it has already been normalized:
*
* - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7605,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- on_rq = 1;
+ if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+ queued = 1;

- if (!on_rq)
+ if (!queued)
se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!on_rq) {
+ if (!queued) {
cfs_rq = cfs_rq_of(se);
se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca..9395320 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
* means a dl or stop task can slip in, in which case we need
* to re-start task selection.
*/
- if (unlikely((rq->stop && rq->stop->on_rq) ||
+ if (unlikely((rq->stop && task_queued(rq->stop)) ||
rq->dl.dl_nr_running))
return RETRY_TASK;
}
@@ -1624,7 +1624,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
- !task->on_rq)) {
+ !task_queued(task))) {

double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1658,7 +1658,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);

- BUG_ON(!p->on_rq);
+ BUG_ON(!task_queued(p));
BUG_ON(!rt_task(p));

return p;
@@ -1809,7 +1809,7 @@ static int pull_rt_task(struct rq *this_rq)
*/
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_queued(p));

/*
* There's a chance that p is higher in priority
@@ -1870,7 +1870,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,

BUG_ON(!rt_task(p));

- if (!p->on_rq)
+ if (!task_queued(p))
return;

weight = cpumask_weight(new_mask);
@@ -1936,7 +1936,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!p->on_rq || rq->rt.rt_nr_running)
+ if (!task_queued(p) || rq->rt.rt_nr_running)
return;

if (pull_rt_task(rq))
@@ -1970,7 +1970,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (p->on_rq && rq->curr != p) {
+ if (task_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
@@ -1989,7 +1989,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->on_rq)
+ if (!task_queued(p))
return;

if (rq->curr == p) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f..2c83b6e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -15,6 +15,9 @@

struct rq;

+/* task_struct::on_rq states: */
+#define ONRQ_QUEUED 1
+
extern __read_mostly int scheduler_running;

extern unsigned long calc_load_update;
@@ -942,6 +945,10 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
#endif
}

+static inline int task_queued(struct task_struct *p)
+{
+ return p->on_rq == ONRQ_QUEUED;
+}

#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0eda..1a4bb0f 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;

- if (!stop || !stop->on_rq)
+ if (!stop || !task_queued(stop))
return NULL;

put_prev_task(rq, prev);