2020-10-23 10:27:24

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing

In order to minimize the interference of migrate_disable() on lower
priority tasks, which can be deprived of runtime due to being stuck
below a higher priority task. Teach the RT/DL balancers to push away
these higher priority tasks when a lower priority task gets selected
to run on a freshly demoted CPU (pull).

This adds migration interference to the higher priority task, but
restores bandwidth to system that would otherwise be irrevocably lost.
Without this it would be possible to have all tasks on the system
stuck on a single CPU, each task preempted in a migrate_disable()
section with a single high priority task running.

This way we can still approximate running the M highest priority tasks
on the system.

Migrating the top task away is (ofcourse) still subject to
migrate_disable() too, which means the lower task is subject to an
interference equivalent to the worst case migrate_disable() section.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/preempt.h | 38 +++++++++++++++------------
include/linux/sched.h | 3 +-
kernel/sched/core.c | 67 ++++++++++++++++++++++++++++++++++++++++--------
kernel/sched/deadline.c | 29 +++++++++++++++-----
kernel/sched/rt.c | 63 ++++++++++++++++++++++++++++++++++++---------
kernel/sched/sched.h | 32 ++++++++++++++++++++++
6 files changed, 185 insertions(+), 47 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -325,24 +325,28 @@ static inline void preempt_notifier_init
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)

/*
- * Migrate-Disable and why it is (strongly) undesired.
+ * Migrate-Disable and why it is undesired.
*
- * The premise of the Real-Time schedulers we have on Linux
- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
- * concurrently, provided there are sufficient runnable tasks, also known as
- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
- *
- * The correctness of various scheduling models depends on this, but is it
- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
- * preempt_disable() implies an immediate priority ceiling, preemptible
- * migrate_disable() allows nesting.
- *
- * The worst case is that all tasks preempt one another in a migrate_disable()
- * region and stack on a single CPU. This then reduces the available bandwidth
- * to a single CPU. And since Real-Time schedulability theory considers the
- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
- * (instantly solving the SMP analysis problem).
+ * When a preempted task becomes elegible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
+ * it would have had to wait for the lower priority task.
+ *
+ * - a lower priority tasks; which under preempt_disable() could've instantly
+ * migrated away when another CPU becomes available, is now constrained
+ * by the ability to push the higher priority task away, which might itself be
+ * in a migrate_disable() section, reducing it's available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
*
*
* The reason we have it anyway.
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -716,8 +716,9 @@ struct task_struct {
cpumask_t cpus_mask;
void *migration_pending;
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
- int migration_disabled;
+ unsigned short migration_disabled;
#endif
+ unsigned short migration_flags;

#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1763,11 +1763,6 @@ void migrate_enable(void)
}
EXPORT_SYMBOL_GPL(migrate_enable);

-static inline bool is_migration_disabled(struct task_struct *p)
-{
- return p->migration_disabled;
-}
-
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return rq->nr_pinned;
@@ -1974,6 +1969,49 @@ static int migration_cpu_stop(void *data
return 0;
}

+int push_cpu_stop(void *arg)
+{
+ struct rq *lowest_rq = NULL, *rq = this_rq();
+ struct task_struct *p = arg;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+
+ if (task_rq(p) != rq)
+ goto out_unlock;
+
+ if (is_migration_disabled(p)) {
+ p->migration_flags |= MDF_PUSH;
+ goto out_unlock;
+ }
+
+ p->migration_flags &= ~MDF_PUSH;
+
+ if (p->sched_class->find_lock_rq)
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+ if (!lowest_rq)
+ goto out_unlock;
+
+ // XXX validate p is still the highest prio task
+ if (task_rq(p) == rq) {
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, lowest_rq->cpu);
+ activate_task(lowest_rq, p, 0);
+ resched_curr(lowest_rq);
+ }
+
+ double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+ rq->push_busy = false;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+ return 0;
+}
+
/*
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
@@ -2054,6 +2092,14 @@ static int affine_move_task(struct rq *r

/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ struct task_struct *push_task = NULL;
+
+ if ((flags & SCA_MIGRATE_ENABLE) &&
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+ rq->push_busy = true;
+ push_task = get_task_struct(p);
+ }
+
pending = p->migration_pending;
if (pending) {
refcount_inc(&pending->refs);
@@ -2062,6 +2108,11 @@ static int affine_move_task(struct rq *r
}
task_rq_unlock(rq, p, rf);

+ if (push_task) {
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ p, &rq->push_work);
+ }
+
if (complete)
goto do_complete;

@@ -2098,6 +2149,7 @@ static int affine_move_task(struct rq *r
if (flags & SCA_MIGRATE_ENABLE) {

refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+ p->migration_flags &= ~MDF_PUSH;
task_rq_unlock(rq, p, rf);

pending->arg = (struct migration_arg) {
@@ -2716,11 +2768,6 @@ static inline int __set_cpus_allowed_ptr

static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }

-static inline bool is_migration_disabled(struct task_struct *p)
-{
- return false;
-}
-
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return false;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2129,6 +2129,9 @@ static int push_dl_task(struct rq *rq)
return 0;

retry:
+ if (is_migration_disabled(next_task))
+ return 0;
+
if (WARN_ON(next_task == rq->curr))
return 0;

@@ -2206,7 +2209,7 @@ static void push_dl_tasks(struct rq *rq)
static void pull_dl_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
@@ -2236,6 +2239,7 @@ static void pull_dl_task(struct rq *this
continue;

/* Might drop this_rq->lock */
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);

/*
@@ -2267,17 +2271,27 @@ static void pull_dl_task(struct rq *this
src_rq->curr->dl.deadline))
goto skip;

- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
- dmin = p->dl.deadline;
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ dmin = p->dl.deadline;
+ resched = true;
+ }

/* Is there any other task even earlier? */
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ raw_spin_unlock(&this_rq->lock);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ raw_spin_lock(&this_rq->lock);
+ }
}

if (resched)
@@ -2524,6 +2538,7 @@ const struct sched_class dl_sched_class
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
+ .find_lock_rq = find_lock_later_rq,
#endif

.task_tick = task_tick_dl,
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pus
* running task can migrate over to a CPU that is running a task
* of lesser priority.
*/
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
{
struct task_struct *next_task;
struct rq *lowest_rq;
@@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
return 0;

retry:
+ if (is_migration_disabled(next_task)) {
+ struct task_struct *push_task = NULL;
+ int cpu;
+
+ if (!pull || rq->push_busy)
+ return 0;
+
+ cpu = find_lowest_rq(rq->curr);
+ if (cpu == -1 || cpu == rq->cpu)
+ return 0;
+
+ /*
+ * Given we found a CPU with lower priority than @next_task,
+ * therefore it should be running. However we cannot migrate it
+ * to this other CPU, instead attempt to push the current
+ * running task on this CPU away.
+ */
+ push_task = get_push_task(rq);
+ if (push_task) {
+ raw_spin_unlock(&rq->lock);
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ push_task, &rq->push_work);
+ raw_spin_lock(&rq->lock);
+ }
+
+ return 0;
+ }
+
if (WARN_ON(next_task == rq->curr))
return 0;

@@ -1927,12 +1955,10 @@ static int push_rt_task(struct rq *rq)
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
- ret = 1;
-
resched_curr(lowest_rq);
+ ret = 1;

double_unlock_balance(rq, lowest_rq);
-
out:
put_task_struct(next_task);

@@ -1942,7 +1968,7 @@ static int push_rt_task(struct rq *rq)
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
- while (push_rt_task(rq))
+ while (push_rt_task(rq, false))
;
}

@@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_w
*/
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_tasks(rq);
+ while (push_rt_task(rq, true))
+ ;
raw_spin_unlock(&rq->lock);
}

@@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this
{
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
struct rq *src_rq;
int rt_overload_count = rt_overloaded(this_rq);

@@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this
* double_lock_balance, and another CPU could
* alter this_rq
*/
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);

/*
@@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this
if (p->prio < src_rq->curr->prio)
goto skip;

- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ resched = true;
+ }
/*
* We continue with the search, just in
* case there's an even higher prio task
@@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ raw_spin_unlock(&this_rq->lock);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ raw_spin_lock(&this_rq->lock);
+ }
}

if (resched)
@@ -2449,6 +2487,7 @@ const struct sched_class rt_sched_class
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
+ .find_lock_rq = find_lock_lowest_rq,
#endif

.task_tick = task_tick_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1057,6 +1057,8 @@ struct rq {
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
unsigned int nr_pinned;
#endif
+ unsigned int push_busy;
+ struct cpu_stop_work push_work;
};

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1084,6 +1086,16 @@ static inline int cpu_of(struct rq *rq)
#endif
}

+#define MDF_PUSH 0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+ return p->migration_disabled;
+#else
+ return false;
+#endif
+}

#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -1823,6 +1835,8 @@ struct sched_class {

void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+
+ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif

void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1918,6 +1932,24 @@ extern void trigger_load_balance(struct

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);

+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->push_busy)
+ return NULL;
+
+ if (p->nr_cpus_allowed == 1)
+ return NULL;
+
+ rq->push_busy = true;
+ return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
+
#endif

#ifdef CONFIG_CPU_IDLE



Subject: [tip: sched/core] sched: Fix migrate_disable() vs rt/dl balancing

The following commit has been merged into the sched/core branch of tip:

Commit-ID: a7c81556ec4d341dfdbf2cc478ead89d73e474a7
Gitweb: https://git.kernel.org/tip/a7c81556ec4d341dfdbf2cc478ead89d73e474a7
Author: Peter Zijlstra <[email protected]>
AuthorDate: Mon, 28 Sep 2020 17:06:07 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 10 Nov 2020 18:39:01 +01:00

sched: Fix migrate_disable() vs rt/dl balancing

In order to minimize the interference of migrate_disable() on lower
priority tasks, which can be deprived of runtime due to being stuck
below a higher priority task. Teach the RT/DL balancers to push away
these higher priority tasks when a lower priority task gets selected
to run on a freshly demoted CPU (pull).

This adds migration interference to the higher priority task, but
restores bandwidth to system that would otherwise be irrevocably lost.
Without this it would be possible to have all tasks on the system
stuck on a single CPU, each task preempted in a migrate_disable()
section with a single high priority task running.

This way we can still approximate running the M highest priority tasks
on the system.

Migrating the top task away is (ofcourse) still subject to
migrate_disable() too, which means the lower task is subject to an
interference equivalent to the worst case migrate_disable() section.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Daniel Bristot de Oliveira <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/preempt.h | 40 +++++++++++++-----------
include/linux/sched.h | 3 +-
kernel/sched/core.c | 67 ++++++++++++++++++++++++++++++++++------
kernel/sched/deadline.c | 29 ++++++++++++-----
kernel/sched/rt.c | 63 ++++++++++++++++++++++++++++++--------
kernel/sched/sched.h | 32 +++++++++++++++++++-
6 files changed, 186 insertions(+), 48 deletions(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 97ba7c9..8b43922 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -325,24 +325,28 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)

/*
- * Migrate-Disable and why it is (strongly) undesired.
- *
- * The premise of the Real-Time schedulers we have on Linux
- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
- * concurrently, provided there are sufficient runnable tasks, also known as
- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
- *
- * The correctness of various scheduling models depends on this, but is it
- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
- * preempt_disable() implies an immediate priority ceiling, preemptible
- * migrate_disable() allows nesting.
- *
- * The worst case is that all tasks preempt one another in a migrate_disable()
- * region and stack on a single CPU. This then reduces the available bandwidth
- * to a single CPU. And since Real-Time schedulability theory considers the
- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
- * (instantly solving the SMP analysis problem).
+ * Migrate-Disable and why it is undesired.
+ *
+ * When a preempted task becomes elegible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
+ * it would have had to wait for the lower priority task.
+ *
+ * - a lower priority tasks; which under preempt_disable() could've instantly
+ * migrated away when another CPU becomes available, is now constrained
+ * by the ability to push the higher priority task away, which might itself be
+ * in a migrate_disable() section, reducing it's available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
*
*
* The reason we have it anyway.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 90a0c92..3af9d52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -716,8 +716,9 @@ struct task_struct {
cpumask_t cpus_mask;
void *migration_pending;
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
- int migration_disabled;
+ unsigned short migration_disabled;
#endif
+ unsigned short migration_flags;

#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9ce2fc7..e92d785 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1763,11 +1763,6 @@ void migrate_enable(void)
}
EXPORT_SYMBOL_GPL(migrate_enable);

-static inline bool is_migration_disabled(struct task_struct *p)
-{
- return p->migration_disabled;
-}
-
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return rq->nr_pinned;
@@ -1972,6 +1967,49 @@ out:
return 0;
}

+int push_cpu_stop(void *arg)
+{
+ struct rq *lowest_rq = NULL, *rq = this_rq();
+ struct task_struct *p = arg;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+
+ if (task_rq(p) != rq)
+ goto out_unlock;
+
+ if (is_migration_disabled(p)) {
+ p->migration_flags |= MDF_PUSH;
+ goto out_unlock;
+ }
+
+ p->migration_flags &= ~MDF_PUSH;
+
+ if (p->sched_class->find_lock_rq)
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+ if (!lowest_rq)
+ goto out_unlock;
+
+ // XXX validate p is still the highest prio task
+ if (task_rq(p) == rq) {
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, lowest_rq->cpu);
+ activate_task(lowest_rq, p, 0);
+ resched_curr(lowest_rq);
+ }
+
+ double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+ rq->push_busy = false;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+ return 0;
+}
+
/*
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
@@ -2052,6 +2090,14 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag

/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ struct task_struct *push_task = NULL;
+
+ if ((flags & SCA_MIGRATE_ENABLE) &&
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+ rq->push_busy = true;
+ push_task = get_task_struct(p);
+ }
+
pending = p->migration_pending;
if (pending) {
refcount_inc(&pending->refs);
@@ -2060,6 +2106,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
task_rq_unlock(rq, p, rf);

+ if (push_task) {
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ p, &rq->push_work);
+ }
+
if (complete)
goto do_complete;

@@ -2098,6 +2149,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (flags & SCA_MIGRATE_ENABLE) {

refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+ p->migration_flags &= ~MDF_PUSH;
task_rq_unlock(rq, p, rf);

pending->arg = (struct migration_arg) {
@@ -2716,11 +2768,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,

static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }

-static inline bool is_migration_disabled(struct task_struct *p)
-{
- return false;
-}
-
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return false;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3d3fd83..eed2e44 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2129,6 +2129,9 @@ static int push_dl_task(struct rq *rq)
return 0;

retry:
+ if (is_migration_disabled(next_task))
+ return 0;
+
if (WARN_ON(next_task == rq->curr))
return 0;

@@ -2206,7 +2209,7 @@ static void push_dl_tasks(struct rq *rq)
static void pull_dl_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
@@ -2236,6 +2239,7 @@ static void pull_dl_task(struct rq *this_rq)
continue;

/* Might drop this_rq->lock */
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);

/*
@@ -2267,17 +2271,27 @@ static void pull_dl_task(struct rq *this_rq)
src_rq->curr->dl.deadline))
goto skip;

- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
- dmin = p->dl.deadline;
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ dmin = p->dl.deadline;
+ resched = true;
+ }

/* Is there any other task even earlier? */
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ raw_spin_unlock(&this_rq->lock);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ raw_spin_lock(&this_rq->lock);
+ }
}

if (resched)
@@ -2524,6 +2538,7 @@ const struct sched_class dl_sched_class
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
+ .find_lock_rq = find_lock_later_rq,
#endif

.task_tick = task_tick_dl,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index cf63346..c592e47 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
* running task can migrate over to a CPU that is running a task
* of lesser priority.
*/
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
{
struct task_struct *next_task;
struct rq *lowest_rq;
@@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
return 0;

retry:
+ if (is_migration_disabled(next_task)) {
+ struct task_struct *push_task = NULL;
+ int cpu;
+
+ if (!pull || rq->push_busy)
+ return 0;
+
+ cpu = find_lowest_rq(rq->curr);
+ if (cpu == -1 || cpu == rq->cpu)
+ return 0;
+
+ /*
+ * Given we found a CPU with lower priority than @next_task,
+ * therefore it should be running. However we cannot migrate it
+ * to this other CPU, instead attempt to push the current
+ * running task on this CPU away.
+ */
+ push_task = get_push_task(rq);
+ if (push_task) {
+ raw_spin_unlock(&rq->lock);
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ push_task, &rq->push_work);
+ raw_spin_lock(&rq->lock);
+ }
+
+ return 0;
+ }
+
if (WARN_ON(next_task == rq->curr))
return 0;

@@ -1927,12 +1955,10 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
- ret = 1;
-
resched_curr(lowest_rq);
+ ret = 1;

double_unlock_balance(rq, lowest_rq);
-
out:
put_task_struct(next_task);

@@ -1942,7 +1968,7 @@ out:
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
- while (push_rt_task(rq))
+ while (push_rt_task(rq, false))
;
}

@@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_work *work)
*/
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_tasks(rq);
+ while (push_rt_task(rq, true))
+ ;
raw_spin_unlock(&rq->lock);
}

@@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
struct rq *src_rq;
int rt_overload_count = rt_overloaded(this_rq);

@@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this_rq)
* double_lock_balance, and another CPU could
* alter this_rq
*/
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);

/*
@@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this_rq)
if (p->prio < src_rq->curr->prio)
goto skip;

- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ resched = true;
+ }
/*
* We continue with the search, just in
* case there's an even higher prio task
@@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this_rq)
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ raw_spin_unlock(&this_rq->lock);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ raw_spin_lock(&this_rq->lock);
+ }
}

if (resched)
@@ -2449,6 +2487,7 @@ const struct sched_class rt_sched_class
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
+ .find_lock_rq = find_lock_lowest_rq,
#endif

.task_tick = task_tick_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 42de140..56992aa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1057,6 +1057,8 @@ struct rq {
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
unsigned int nr_pinned;
#endif
+ unsigned int push_busy;
+ struct cpu_stop_work push_work;
};

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1084,6 +1086,16 @@ static inline int cpu_of(struct rq *rq)
#endif
}

+#define MDF_PUSH 0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+ return p->migration_disabled;
+#else
+ return false;
+#endif
+}

#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -1823,6 +1835,8 @@ struct sched_class {

void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+
+ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif

void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1918,6 +1932,24 @@ extern void trigger_load_balance(struct rq *rq);

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);

+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->push_busy)
+ return NULL;
+
+ if (p->nr_cpus_allowed == 1)
+ return NULL;
+
+ rq->push_busy = true;
+ return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
+
#endif

#ifdef CONFIG_CPU_IDLE

2020-12-26 13:58:01

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing

Hi Peter

Apologies for the late comments on the patch.

On 10/23/20 12:12, Peter Zijlstra wrote:

[...]

> + * When a preempted task becomes elegible to run under the ideal model (IOW it
> + * becomes one of the M highest priority tasks), it might still have to wait
> + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> + * a reduction in bandwidth in the exact duration of the migrate_disable()
> + * section.
> + *
> + * Per this argument, the change from preempt_disable() to migrate_disable()
> + * gets us:
> + *
> + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
> + * it would have had to wait for the lower priority task.
> + *
> + * - a lower priority tasks; which under preempt_disable() could've instantly
> + * migrated away when another CPU becomes available, is now constrained
> + * by the ability to push the higher priority task away, which might itself be
> + * in a migrate_disable() section, reducing it's available bandwidth.
> + *
> + * IOW it trades latency / moves the interference term, but it stays in the
> + * system, and as long as it remains unbounded, the system is not fully
> + * deterministic.

The idea makes sense but I'm worried about some implementation details.

Specifically:

* There's no guarantee the target CPU we're pushing to doesn't have
a lower priority task in migration_disabled too. So we could end up
having to push the task again. Although unlikely in practice, but as
I see it the worst case scenario is unbounded here. The planets could
align perfectly for the higher priority task to spend the majority of
its time migrating between cpus that have low priority RT tasks in
migration_disabled regions.

We need to track migration disabled at rq level to fix this.
It might be necessary to track the priority levels that are in
migration_disabled too :-/

* Since this is a voluntary migration, I think we should ensure it is
restricted to cpus_share_cache() to guarantee the price is minimal
and acceptable.

* The push is done via the stopper task; which will steal run time
and could contribute to worst case latency. I think it'd fine in
practice, but PREEMPT_RT folks will know better.

I think the combined effect of above could end up throwing off RT system
designers who could find their high-priority-hard-RT task is missing its
deadline to be nice to lower priority tasks who go often to migration_disabled
regions.

I seem to remember Clark saying in last LPC that few us latency is not unheard
of now.

> +int push_cpu_stop(void *arg)
> +{
> + struct rq *lowest_rq = NULL, *rq = this_rq();
> + struct task_struct *p = arg;
> +
> + raw_spin_lock_irq(&p->pi_lock);
> + raw_spin_lock(&rq->lock);
> +
> + if (task_rq(p) != rq)
> + goto out_unlock;
> +
> + if (is_migration_disabled(p)) {
> + p->migration_flags |= MDF_PUSH;
> + goto out_unlock;
> + }
> +
> + p->migration_flags &= ~MDF_PUSH;
> +
> + if (p->sched_class->find_lock_rq)
> + lowest_rq = p->sched_class->find_lock_rq(p, rq);
> +
> + if (!lowest_rq)
> + goto out_unlock;
> +
> + // XXX validate p is still the highest prio task

The task_rq(p) could have left the migration_disabled region by now too. If we
track that at rq level we could be able to do last minute check to bale out of
this voluntary push.

I think we should check that the lowest_rq is not in migration_disabled region
too otherwise the same task could end up here again.

Need to think more about it, but we might be able to get away with verifying
task_rq(p)->curr and lowest_rq->curr aren't in migration disabled. The only
worry I can think of now is that rq->curr is a similar task to this one. That
is: a higher priority task that has preempted a migration_disabled region.

Verifying that task_cpu(p) and lowest_rq->cpu are in the same llc will help
avoid a costly migration. After all this is a voluntary migration.

Once we do all these bale outs; we might need to rate limit another PULL
triggering this continuously. Need to dig more into that.

> + if (task_rq(p) == rq) {
> + deactivate_task(rq, p, 0);
> + set_task_cpu(p, lowest_rq->cpu);
> + activate_task(lowest_rq, p, 0);
> + resched_curr(lowest_rq);
> + }
> +
> + double_unlock_balance(rq, lowest_rq);
> +
> +out_unlock:
> + rq->push_busy = false;
> + raw_spin_unlock(&rq->lock);
> + raw_spin_unlock_irq(&p->pi_lock);
> +
> + put_task_struct(p);
> + return 0;
> +}

[...]

> +static inline struct task_struct *get_push_task(struct rq *rq)
> +{
> + struct task_struct *p = rq->curr;

Shouldn't we verify the class of the task here? The RT task in migration
disabled could have been preempted by a dl or stopper task. Similarly, the dl
task could have been preempted by a stopper task.

I don't think an RT task should be allowed to push a dl task under any
circumstances?

Cheers

--
Qais Yousef

> +
> + lockdep_assert_held(&rq->lock);
> +
> + if (rq->push_busy)
> + return NULL;
> +
> + if (p->nr_cpus_allowed == 1)
> + return NULL;
> +
> + rq->push_busy = true;
> + return get_task_struct(p);
> +}
> +
> +extern int push_cpu_stop(void *arg);
> +
> #endif
>
> #ifdef CONFIG_CPU_IDLE
>
>

2021-03-05 14:59:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing

On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
> Hi Peter
>
> Apologies for the late comments on the patch.

Ha!, it seems I too need to apologize for never having actually found
your reply ;-)

> On 10/23/20 12:12, Peter Zijlstra wrote:
>
> [...]
>
> > + * When a preempted task becomes elegible to run under the ideal model (IOW it
> > + * becomes one of the M highest priority tasks), it might still have to wait
> > + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> > + * a reduction in bandwidth in the exact duration of the migrate_disable()
> > + * section.
> > + *
> > + * Per this argument, the change from preempt_disable() to migrate_disable()
> > + * gets us:
> > + *
> > + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
> > + * it would have had to wait for the lower priority task.
> > + *
> > + * - a lower priority tasks; which under preempt_disable() could've instantly
> > + * migrated away when another CPU becomes available, is now constrained
> > + * by the ability to push the higher priority task away, which might itself be
> > + * in a migrate_disable() section, reducing it's available bandwidth.
> > + *
> > + * IOW it trades latency / moves the interference term, but it stays in the
> > + * system, and as long as it remains unbounded, the system is not fully
> > + * deterministic.
>
> The idea makes sense but I'm worried about some implementation details.
>
> Specifically:
>
> * There's no guarantee the target CPU we're pushing to doesn't have
> a lower priority task in migration_disabled too. So we could end up
> having to push the task again.

I'm not sure I follow, if the CPU we're pushing to has a
migrate_disable() task of lower priority we'll simply preempt it.

IIRC there's conditions for this push:

1) we just did migrate_enable();
2) the task below us also has migrate_disable();
3) the task below us is actually higher priority than
the lowest priority task currently running.

So at that point we shoot our high prio task away, and we aim it at the
lowest prio task.

In order to then shoot it away again, someone else needs to block to
make lower prio task we just preempted elegible again.

Still, possible I suppose.

> Although unlikely in practice, but as
> I see it the worst case scenario is unbounded here. The planets could
> align perfectly for the higher priority task to spend the majority of
> its time migrating between cpus that have low priority RT tasks in
> migration_disabled regions.

I'm thinking it might be limited by the range of priorities. You need to
drop the prio on every round, and you can't keep on dropping priority
levels, at some point we've reached bottom.

> We need to track migration disabled at rq level to fix this.
> It might be necessary to track the priority levels that are in
> migration_disabled too :-/

As a tie breaker, not sure it's worth it.

> * Since this is a voluntary migration, I think we should ensure it is
> restricted to cpus_share_cache() to guarantee the price is minimal
> and acceptable.

That might create conflicting goals wrt the SMP invariant (run the N
highest prio tasks).

> * The push is done via the stopper task; which will steal run time
> and could contribute to worst case latency. I think it'd fine in
> practice, but PREEMPT_RT folks will know better.
>
> I think the combined effect of above could end up throwing off RT system
> designers who could find their high-priority-hard-RT task is missing its
> deadline to be nice to lower priority tasks who go often to migration_disabled
> regions.
>
> I seem to remember Clark saying in last LPC that few us latency is not unheard
> of now.

Those people that care _that_ much typically set hard affinity for their
tasks.

> > +int push_cpu_stop(void *arg)
> > +{
> > + struct rq *lowest_rq = NULL, *rq = this_rq();
> > + struct task_struct *p = arg;
> > +
> > + raw_spin_lock_irq(&p->pi_lock);
> > + raw_spin_lock(&rq->lock);
> > +
> > + if (task_rq(p) != rq)
> > + goto out_unlock;
> > +
> > + if (is_migration_disabled(p)) {
> > + p->migration_flags |= MDF_PUSH;
> > + goto out_unlock;
> > + }
> > +
> > + p->migration_flags &= ~MDF_PUSH;
> > +
> > + if (p->sched_class->find_lock_rq)
> > + lowest_rq = p->sched_class->find_lock_rq(p, rq);
> > +
> > + if (!lowest_rq)
> > + goto out_unlock;
> > +
> > + // XXX validate p is still the highest prio task
>
> The task_rq(p) could have left the migration_disabled region by now too. If we
> track that at rq level we could be able to do last minute check to bale out of
> this voluntary push.
>
> I think we should check that the lowest_rq is not in migration_disabled region
> too otherwise the same task could end up here again.
>
> Need to think more about it, but we might be able to get away with verifying
> task_rq(p)->curr and lowest_rq->curr aren't in migration disabled. The only
> worry I can think of now is that rq->curr is a similar task to this one. That
> is: a higher priority task that has preempted a migration_disabled region.
>
> Verifying that task_cpu(p) and lowest_rq->cpu are in the same llc will help
> avoid a costly migration. After all this is a voluntary migration.
>
> Once we do all these bale outs; we might need to rate limit another PULL
> triggering this continuously. Need to dig more into that.

So we have:

CPU0 CPU1

M-preempted L-running
H-running

And normally we'd have pushed M, but it can't since it have
migration_disabled(). Moving H over L is the next best thing.

> > + if (task_rq(p) == rq) {
> > + deactivate_task(rq, p, 0);
> > + set_task_cpu(p, lowest_rq->cpu);
> > + activate_task(lowest_rq, p, 0);
> > + resched_curr(lowest_rq);
> > + }
> > +
> > + double_unlock_balance(rq, lowest_rq);
> > +
> > +out_unlock:
> > + rq->push_busy = false;
> > + raw_spin_unlock(&rq->lock);
> > + raw_spin_unlock_irq(&p->pi_lock);
> > +
> > + put_task_struct(p);
> > + return 0;
> > +}
>
> [...]
>
> > +static inline struct task_struct *get_push_task(struct rq *rq)
> > +{
> > + struct task_struct *p = rq->curr;
>
> Shouldn't we verify the class of the task here? The RT task in migration
> disabled could have been preempted by a dl or stopper task. Similarly, the dl
> task could have been preempted by a stopper task.
>
> I don't think an RT task should be allowed to push a dl task under any
> circumstances?

Hmm, quite. Fancy doing a patch?

2021-03-05 15:43:34

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing

On 05/03/21 15:56, Peter Zijlstra wrote:
> On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
>>
>> > +static inline struct task_struct *get_push_task(struct rq *rq)
>> > +{
>> > + struct task_struct *p = rq->curr;
>>
>> Shouldn't we verify the class of the task here? The RT task in migration
>> disabled could have been preempted by a dl or stopper task. Similarly, the dl
>> task could have been preempted by a stopper task.
>>
>> I don't think an RT task should be allowed to push a dl task under any
>> circumstances?
>
> Hmm, quite. Fancy doing a patch?

Last time we talked about this, I looked into

push_rt_task() + find_lowest_rq()

IIRC, with how

find_lowest_rq() + cpupri_find_fitness()

currently work, find_lowest_rq() should return -1 in push_rt_task() if
rq->curr is DL (CPUPRI_INVALID). IOW, Migration-Disabled RT tasks shouldn't
actually interfere with DL tasks (unless a DL task gets scheduled after we
drop the rq lock and kick the stopper, but we have that problem everywhere
including CFS active balance).


Now, for some blabbering. Re SMP invariant; wouldn't we actually want this
to happen? Consider:

MD := Migration-Disabled.

rq
DL
RT3
RT2 (MD) RT1

current DL RT1 idle
CPU0 CPU1 CPU2

If we were to ignore MD, the best spread for this would be something
like:

rq
RT1
DL RT3 RT2

current DL RT3 RT2
CPU0 CPU1 CPU2

Now, with Migration-Disabled we can't move RT2 to CPU2 - it has to stay
on CPU0 for as long as it is Migration-Disabled. Thus, a possible spread
would be:

rq
RT1
RT2 (MD) DL RT3

current RT2 DL RT3
CPU0 CPU1 CPU

If you look closely, this is exactly the same as the previous spread
modulo CPU numbers. IOW, this is (again) a CPU renumbering exercise.

To respect the aforementioned scheduling invariant, we've had to move
that DL task, and while it does add interference, it's similar as to why we
push higher RT priority tasks to make room for lower RT priority, migration
disabled tasks. You get interference caused by a lower-priority entity for
the sake of your SMP scheduling invariant.

2021-03-05 16:52:52

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing

On 03/05/21 15:56, Peter Zijlstra wrote:
> On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
> > Hi Peter
> >
> > Apologies for the late comments on the patch.
>
> Ha!, it seems I too need to apologize for never having actually found
> your reply ;-)

No worries, thanks for taking the time to answer! :-)

>
> > On 10/23/20 12:12, Peter Zijlstra wrote:
> >
> > [...]
> >
> > > + * When a preempted task becomes elegible to run under the ideal model (IOW it
> > > + * becomes one of the M highest priority tasks), it might still have to wait
> > > + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> > > + * a reduction in bandwidth in the exact duration of the migrate_disable()
> > > + * section.
> > > + *
> > > + * Per this argument, the change from preempt_disable() to migrate_disable()
> > > + * gets us:
> > > + *
> > > + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
> > > + * it would have had to wait for the lower priority task.
> > > + *
> > > + * - a lower priority tasks; which under preempt_disable() could've instantly
> > > + * migrated away when another CPU becomes available, is now constrained
> > > + * by the ability to push the higher priority task away, which might itself be
> > > + * in a migrate_disable() section, reducing it's available bandwidth.
> > > + *
> > > + * IOW it trades latency / moves the interference term, but it stays in the
> > > + * system, and as long as it remains unbounded, the system is not fully
> > > + * deterministic.
> >
> > The idea makes sense but I'm worried about some implementation details.
> >
> > Specifically:
> >
> > * There's no guarantee the target CPU we're pushing to doesn't have
> > a lower priority task in migration_disabled too. So we could end up
> > having to push the task again.
>
> I'm not sure I follow, if the CPU we're pushing to has a
> migrate_disable() task of lower priority we'll simply preempt it.
>
> IIRC there's conditions for this push:
>
> 1) we just did migrate_enable();
> 2) the task below us also has migrate_disable();
> 3) the task below us is actually higher priority than
> the lowest priority task currently running.
>
> So at that point we shoot our high prio task away, and we aim it at the
> lowest prio task.
>
> In order to then shoot it away again, someone else needs to block to
> make lower prio task we just preempted elegible again.

Okay. I missed that 3rd condition. I understood only 1 and 2 are required.
So we have to have 3 tasks of different priorities on the rq, the middle being
in migrate_disabled.

It is less of a problem in that case.

>
> Still, possible I suppose.
>
> > Although unlikely in practice, but as
> > I see it the worst case scenario is unbounded here. The planets could
> > align perfectly for the higher priority task to spend the majority of
> > its time migrating between cpus that have low priority RT tasks in
> > migration_disabled regions.
>
> I'm thinking it might be limited by the range of priorities. You need to
> drop the prio on every round, and you can't keep on dropping priority
> levels, at some point we've reached bottom.

With that 3rd condition in mind, there has to be an element of bad design to
end up with 3 tasks of different priorities on 1 rq that continuously. The
system has to be in some sort of overloaded state, which is a bigger problem to
address first.

> > > +static inline struct task_struct *get_push_task(struct rq *rq)
> > > +{
> > > + struct task_struct *p = rq->curr;
> >
> > Shouldn't we verify the class of the task here? The RT task in migration
> > disabled could have been preempted by a dl or stopper task. Similarly, the dl
> > task could have been preempted by a stopper task.
> >
> > I don't think an RT task should be allowed to push a dl task under any
> > circumstances?
>
> Hmm, quite. Fancy doing a patch?

I had one. Let me revive and post it next week.

Thanks

--
Qais Yousef

2021-03-05 17:15:14

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing

On 03/05/21 15:41, Valentin Schneider wrote:
> On 05/03/21 15:56, Peter Zijlstra wrote:
> > On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
> >>
> >> > +static inline struct task_struct *get_push_task(struct rq *rq)
> >> > +{
> >> > + struct task_struct *p = rq->curr;
> >>
> >> Shouldn't we verify the class of the task here? The RT task in migration
> >> disabled could have been preempted by a dl or stopper task. Similarly, the dl
> >> task could have been preempted by a stopper task.
> >>
> >> I don't think an RT task should be allowed to push a dl task under any
> >> circumstances?
> >
> > Hmm, quite. Fancy doing a patch?
>
> Last time we talked about this, I looked into
>
> push_rt_task() + find_lowest_rq()
>
> IIRC, with how
>
> find_lowest_rq() + cpupri_find_fitness()
>
> currently work, find_lowest_rq() should return -1 in push_rt_task() if
> rq->curr is DL (CPUPRI_INVALID). IOW, Migration-Disabled RT tasks shouldn't

[...]

> If you look closely, this is exactly the same as the previous spread
> modulo CPU numbers. IOW, this is (again) a CPU renumbering exercise.

I don't see it a re-numbering exercise. The way I understand it a system
designer doesn't expect their DL task to move because of an RT task. I think we
should try to keep it this way, that's why I asked.

To be fair, I need to look at the code again and understand where I missed that
3rd condition Peter mentioned.

Thanks

--
Qais Yousef

2021-03-10 14:48:39

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing

On 03/05/21 15:41, Valentin Schneider wrote:
> On 05/03/21 15:56, Peter Zijlstra wrote:
> > On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
> >>
> >> > +static inline struct task_struct *get_push_task(struct rq *rq)
> >> > +{
> >> > + struct task_struct *p = rq->curr;
> >>
> >> Shouldn't we verify the class of the task here? The RT task in migration
> >> disabled could have been preempted by a dl or stopper task. Similarly, the dl
> >> task could have been preempted by a stopper task.
> >>
> >> I don't think an RT task should be allowed to push a dl task under any
> >> circumstances?
> >
> > Hmm, quite. Fancy doing a patch?
>
> Last time we talked about this, I looked into
>
> push_rt_task() + find_lowest_rq()
>
> IIRC, with how
>
> find_lowest_rq() + cpupri_find_fitness()
>
> currently work, find_lowest_rq() should return -1 in push_rt_task() if
> rq->curr is DL (CPUPRI_INVALID). IOW, Migration-Disabled RT tasks shouldn't
> actually interfere with DL tasks (unless a DL task gets scheduled after we
> drop the rq lock and kick the stopper, but we have that problem everywhere
> including CFS active balance).

This makes it less of a problem true, but AFAICT this can still happen in the
pull path.

Anyways, here's the patch as extra bolts and braces to be considered.

Thanks

--
Qais Yousef

--->8----

From 2df733d381f636cc185944c7eda86c824a9a785e Mon Sep 17 00:00:00 2001
From: Qais Yousef <[email protected]>
Date: Tue, 12 Jan 2021 11:54:16 +0000
Subject: [PATCH] sched: Don't push a higher priority class in get_push_task()

Commit a7c81556ec4d ("sched: Fix migrate_disable() vs rt/dl balancing")
will attempt to push/pull a higher priority task if the candidate task
is in migrate_disable() section. This is an attempt to prevent
starvation of these lower priority task that, in theory at least, could
end up in a situation where they're forever in migrate disable section
with no CPU time to run.

One issue with that is get_push_task() assumes rq->curr is of the same
sched_class, which AFAICT is not guaranteed to be true.

This patch adds extra bolts and braces to ensure that this voluntary
push operation is performed on a task of the same scheduling class only.

Otherwise an RT task could end up causing a DL task to be pushed away.
Which breaks the strict priority between sched classes.

We could also end up trying to push the migration task. Which I think is
harmless and is nothing but a wasted effort.

Fixes: a7c81556ec4d ("sched: Fix migrate_disable() vs rt/dl balancing")
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/deadline.c | 2 +-
kernel/sched/rt.c | 4 ++--
kernel/sched/sched.h | 17 ++++++++++++++++-
3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aac3539aa0fe..afadc7e1f968 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2276,7 +2276,7 @@ static void pull_dl_task(struct rq *this_rq)
goto skip;

if (is_migration_disabled(p)) {
- push_task = get_push_task(src_rq);
+ push_task = get_push_task(src_rq, SCHED_DEADLINE);
} else {
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8f720b71d13d..c2c5c08e3030 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1892,7 +1892,7 @@ static int push_rt_task(struct rq *rq, bool pull)
* to this other CPU, instead attempt to push the current
* running task on this CPU away.
*/
- push_task = get_push_task(rq);
+ push_task = get_push_task(rq, SCHED_FIFO);
if (push_task) {
raw_spin_unlock(&rq->lock);
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
@@ -2225,7 +2225,7 @@ static void pull_rt_task(struct rq *this_rq)
goto skip;

if (is_migration_disabled(p)) {
- push_task = get_push_task(src_rq);
+ push_task = get_push_task(src_rq, SCHED_FIFO);
} else {
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..4e156f008d22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1954,12 +1954,27 @@ extern void trigger_load_balance(struct rq *rq);

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);

-static inline struct task_struct *get_push_task(struct rq *rq)
+static inline struct task_struct *get_push_task(struct rq *rq, int policy)
{
struct task_struct *p = rq->curr;

lockdep_assert_held(&rq->lock);

+ switch(policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ if (!rt_task(p))
+ return NULL;
+ break;
+ case SCHED_DEADLINE:
+ if (!dl_task(p))
+ return NULL;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
if (rq->push_busy)
return NULL;

--
2.25.1