2020-03-04 17:01:40

by Vineeth Remanan Pillai

Subject: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

From: Peter Zijlstra <[email protected]>

Instead of only selecting a local task, select a task for all SMT
siblings for every reschedule on the core (irrespective of which logical
CPU does the reschedule).

There could be races in core scheduler where a CPU is trying to pick
a task for its sibling in core scheduler, when that CPU has just been
offlined. We should not schedule any tasks on the CPU in this case.
Return an idle task in pick_next_task for this situation.

NOTE: there is still potential for siblings rivalry.
NOTE: this is far too complicated; but thus far I've failed to
simplify it further.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Julien Desfossez <[email protected]>
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Aaron Lu <[email protected]>
Signed-off-by: Tim Chen <[email protected]>
---
kernel/sched/core.c | 274 ++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 40 +++++++
kernel/sched/sched.h | 6 +-
3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 445f0d519336..9a1bd236044e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4253,7 +4253,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -4309,6 +4309,273 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
BUG();
}

+#ifdef CONFIG_SCHED_CORE
+
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+ return is_idle_task(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+ if (is_idle_task(a) || is_idle_task(b))
+ return true;
+
+ return a->core_cookie == b->core_cookie;
+}
+
+// XXX fairness/fwd progress conditions
+/*
+ * Returns
+ * - NULL if there is no runnable task for this class.
+ * - the highest priority task for this runqueue if it matches
+ * rq->core->core_cookie or its priority is greater than max.
+ * - Else returns idle_task.
+ */
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
+{
+ struct task_struct *class_pick, *cookie_pick;
+ unsigned long cookie = rq->core->core_cookie;
+
+ class_pick = class->pick_task(rq);
+ if (!class_pick)
+ return NULL;
+
+ if (!cookie) {
+ /*
+ * If class_pick is tagged, return it only if it has
+ * higher priority than max.
+ */
+ if (max && class_pick->core_cookie &&
+ prio_less(class_pick, max))
+ return idle_sched_class.pick_task(rq);
+
+ return class_pick;
+ }
+
+ /*
+ * If class_pick is idle or matches cookie, return early.
+ */
+ if (cookie_equals(class_pick, cookie))
+ return class_pick;
+
+ cookie_pick = sched_core_find(rq, cookie);
+
+ /*
+ * If class > max && class > cookie, it is the highest priority task on
+ * the core (so far) and it must be selected, otherwise we must go with
+ * the cookie pick in order to satisfy the constraint.
+ */
+ if (prio_less(cookie_pick, class_pick) &&
+ (!max || prio_less(max, class_pick)))
+ return class_pick;
+
+ return cookie_pick;
+}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next, *max = NULL;
+ const struct sched_class *class;
+ const struct cpumask *smt_mask;
+ int i, j, cpu;
+ bool need_sync = false;
+
+ cpu = cpu_of(rq);
+ if (cpu_is_offline(cpu))
+ return idle_sched_class.pick_next_task(rq);
+
+ if (!sched_core_enabled(rq))
+ return __pick_next_task(rq, prev, rf);
+
+ /*
+ * If there were no {en,de}queues since we picked (IOW, the task
+ * pointers are all still valid), and we haven't scheduled the last
+ * pick yet, do so now.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ next = rq->core_pick;
+ if (next != prev) {
+ put_prev_task(rq, prev);
+ set_next_task(rq, next);
+ }
+ return next;
+ }
+
+ prev->sched_class->put_prev_task(rq, prev);
+ if (!rq->nr_running)
+ newidle_balance(rq, rf);
+
+ smt_mask = cpu_smt_mask(cpu);
+
+ /*
+ * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+ *
+ * @task_seq guards the task state ({en,de}queues)
+ * @pick_seq is the @task_seq we did a selection on
+ * @sched_seq is the @pick_seq we scheduled
+ *
+ * However, preemptions can cause multiple picks on the same task set.
+ * 'Fix' this by also increasing @task_seq for every pick.
+ */
+ rq->core->core_task_seq++;
+ need_sync = !!rq->core->core_cookie;
+
+ /* reset state */
+ rq->core->core_cookie = 0UL;
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ rq_i->core_pick = NULL;
+
+ if (rq_i->core_forceidle) {
+ need_sync = true;
+ rq_i->core_forceidle = false;
+ }
+
+ if (i != cpu)
+ update_rq_clock(rq_i);
+ }
+
+ /*
+ * Try and select tasks for each sibling in descending sched_class
+ * order.
+ */
+ for_each_class(class) {
+again:
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ struct rq *rq_i = cpu_rq(i);
+ struct task_struct *p;
+
+ if (cpu_is_offline(i)) {
+ rq_i->core_pick = rq_i->idle;
+ continue;
+ }
+
+ if (rq_i->core_pick)
+ continue;
+
+ /*
+ * If this sibling doesn't yet have a suitable task to
+ * run; ask for the most eligible task, given the
+ * highest priority task already selected for this
+ * core.
+ */
+ p = pick_task(rq_i, class, max);
+ if (!p) {
+ /*
+ * If there weren't any cookies; we don't need
+ * to bother with the other siblings.
+ */
+ if (i == cpu && !need_sync)
+ goto next_class;
+
+ continue;
+ }
+
+ /*
+ * Optimize the 'normal' case where there aren't any
+ * cookies and we don't need to sync up.
+ */
+ if (i == cpu && !need_sync && !p->core_cookie) {
+ next = p;
+ goto done;
+ }
+
+ rq_i->core_pick = p;
+
+ /*
+ * If this new candidate is of higher priority than the
+ * previous; and they're incompatible; we need to wipe
+ * the slate and start over. pick_task makes sure that
+ * p's priority is more than max if it doesn't match
+ * max's cookie.
+ *
+ * NOTE: this is a linear max-filter and is thus bounded
+ * in execution time.
+ */
+ if (!max || !cookie_match(max, p)) {
+ struct task_struct *old_max = max;
+
+ rq->core->core_cookie = p->core_cookie;
+ max = p;
+
+ if (old_max) {
+ for_each_cpu(j, smt_mask) {
+ if (j == i)
+ continue;
+
+ cpu_rq(j)->core_pick = NULL;
+ }
+ goto again;
+ } else {
+ /*
+ * Once we select a task for a cpu, we
+ * should not be doing an unconstrained
+ * pick because it might starve a task
+ * on a forced idle cpu.
+ */
+ need_sync = true;
+ }
+
+ }
+ }
+next_class:;
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+ next = rq->core_pick;
+ rq->core_sched_seq = rq->core->core_pick_seq;
+
+ /*
+ * Reschedule siblings
+ *
+ * NOTE: L1TF -- at this point we're no longer running the old task and
+ * sending an IPI (below) ensures the sibling will no longer be running
+ * their task. This ensures there is no inter-sibling overlap between
+ * non-matching user state.
+ */
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ if (cpu_is_offline(i))
+ continue;
+
+ WARN_ON_ONCE(!rq_i->core_pick);
+
+ if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
+ rq_i->core_forceidle = true;
+
+ if (i == cpu)
+ continue;
+
+ if (rq_i->curr != rq_i->core_pick)
+ resched_curr(rq_i);
+
+ /* Did we break L1TF mitigation requirements? */
+ WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+ }
+
+done:
+ set_next_task(rq, next);
+ return next;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* __schedule() is the main scheduler function.
*
@@ -7074,7 +7341,12 @@ void __init sched_init(void)

#ifdef CONFIG_SCHED_CORE
rq->core = NULL;
+ rq->core_pick = NULL;
rq->core_enabled = 0;
+ rq->core_tree = RB_ROOT;
+ rq->core_forceidle = false;
+
+ rq->core_cookie = 0UL;
#endif
}

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a9eeef896c78..8432de767730 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4080,6 +4080,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_min_vruntime(cfs_rq);
}

+static inline bool
+__entity_slice_used(struct sched_entity *se)
+{
+ return (se->sum_exec_runtime - se->prev_sum_exec_runtime) >
+ sched_slice(cfs_rq_of(se), se);
+}
+
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -10285,6 +10292,34 @@ static void core_sched_deactivate_fair(struct rq *rq)
#endif
#endif /* CONFIG_SMP */

+#ifdef CONFIG_SCHED_CORE
+/*
+ * If runqueue has only one task which used up its slice and
+ * if the sibling is forced idle, then trigger schedule
+ * to give forced idle task a chance.
+ */
+static void resched_forceidle_sibling(struct rq *rq, struct sched_entity *se)
+{
+ int cpu = cpu_of(rq), sibling_cpu;
+ if (rq->cfs.nr_running > 1 || !__entity_slice_used(se))
+ return;
+
+ for_each_cpu(sibling_cpu, cpu_smt_mask(cpu)) {
+ struct rq *sibling_rq;
+ if (sibling_cpu == cpu)
+ continue;
+ if (cpu_is_offline(sibling_cpu))
+ continue;
+
+ sibling_rq = cpu_rq(sibling_cpu);
+ if (sibling_rq->core_forceidle) {
+ resched_curr(sibling_rq);
+ }
+ }
+}
+#endif
+
+
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -10308,6 +10343,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
+
+#ifdef CONFIG_SCHED_CORE
+ if (sched_core_enabled(rq))
+ resched_forceidle_sibling(rq, &curr->se);
+#endif
}

/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 03d502357599..a829e26fa43a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1003,11 +1003,16 @@ struct rq {
#ifdef CONFIG_SCHED_CORE
/* per rq */
struct rq *core;
+ struct task_struct *core_pick;
unsigned int core_enabled;
+ unsigned int core_sched_seq;
struct rb_root core_tree;
+ bool core_forceidle;

/* shared state */
unsigned int core_task_seq;
+ unsigned int core_pick_seq;
+ unsigned long core_cookie;
#endif
};

@@ -1867,7 +1872,6 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)

static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next, false);
}

--
2.17.1
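
For anyone trying to keep the three sequence counters above straight, the
fast-path check at the top of pick_next_task() boils down to the sketch
below; the helper name is hypothetical and not part of the patch:

/*
 * Sketch only, restating the reuse condition from pick_next_task():
 *
 *   core_task_seq  - bumped on every enqueue/dequeue (and on every pick)
 *   core_pick_seq  - the core_task_seq a core-wide selection was made on
 *   core_sched_seq - the core_pick_seq this CPU has already scheduled
 */
static inline bool core_pick_is_reusable(struct rq *rq)
{
	/*
	 * True when no task state has changed since the last core-wide
	 * selection and this CPU has not yet scheduled that selection.
	 */
	return rq->core->core_pick_seq == rq->core->core_task_seq &&
	       rq->core->core_pick_seq != rq->core_sched_seq;
}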


2020-04-15 21:35:39

by Peter Zijlstra

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Wed, Mar 04, 2020 at 04:59:57PM +0000, vpillai wrote:
> +static struct task_struct *
> +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> +{
> + struct task_struct *next, *max = NULL;
> + const struct sched_class *class;
> + const struct cpumask *smt_mask;
> + int i, j, cpu;
> + bool need_sync = false;

AFAICT that assignment is superfluous. Also, you violated the inverse
x-mas tree.

> +
> + cpu = cpu_of(rq);
> + if (cpu_is_offline(cpu))
> + return idle_sched_class.pick_next_task(rq);

Are we actually hitting this one?

> + if (!sched_core_enabled(rq))
> + return __pick_next_task(rq, prev, rf);
> +
> + /*
> + * If there were no {en,de}queues since we picked (IOW, the task
> + * pointers are all still valid), and we haven't scheduled the last
> + * pick yet, do so now.
> + */
> + if (rq->core->core_pick_seq == rq->core->core_task_seq &&
> + rq->core->core_pick_seq != rq->core_sched_seq) {
> + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
> +
> + next = rq->core_pick;
> + if (next != prev) {
> + put_prev_task(rq, prev);
> + set_next_task(rq, next);
> + }
> + return next;
> + }
> +
> + prev->sched_class->put_prev_task(rq, prev);
> + if (!rq->nr_running)
> + newidle_balance(rq, rf);

This is wrong per commit:

6e2df0581f56 ("sched: Fix pick_next_task() vs 'change' pattern race")

> + smt_mask = cpu_smt_mask(cpu);
> +
> + /*
> + * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
> + *
> + * @task_seq guards the task state ({en,de}queues)
> + * @pick_seq is the @task_seq we did a selection on
> + * @sched_seq is the @pick_seq we scheduled
> + *
> + * However, preemptions can cause multiple picks on the same task set.
> + * 'Fix' this by also increasing @task_seq for every pick.
> + */
> + rq->core->core_task_seq++;
> + need_sync = !!rq->core->core_cookie;
> +
> + /* reset state */
> + rq->core->core_cookie = 0UL;
> + for_each_cpu(i, smt_mask) {
> + struct rq *rq_i = cpu_rq(i);
> +
> + rq_i->core_pick = NULL;
> +
> + if (rq_i->core_forceidle) {
> + need_sync = true;
> + rq_i->core_forceidle = false;
> + }
> +
> + if (i != cpu)
> + update_rq_clock(rq_i);
> + }
> +
> + /*
> + * Try and select tasks for each sibling in descending sched_class
> + * order.
> + */
> + for_each_class(class) {
> +again:
> + for_each_cpu_wrap(i, smt_mask, cpu) {
> + struct rq *rq_i = cpu_rq(i);
> + struct task_struct *p;
> +
> + if (cpu_is_offline(i)) {
> + rq_i->core_pick = rq_i->idle;
> + continue;
> + }

Why are we polluting the 'fast' path with offline crud? Why isn't this
the natural result of running pick_task() on an empty runqueue?

> +
> + if (rq_i->core_pick)
> + continue;
> +
> + /*
> + * If this sibling doesn't yet have a suitable task to
> + * run; ask for the most eligible task, given the
> + * highest priority task already selected for this
> + * core.
> + */
> + p = pick_task(rq_i, class, max);
> + if (!p) {
> + /*
> + * If there weren't any cookies; we don't need
> + * to bother with the other siblings.
> + */
> + if (i == cpu && !need_sync)
> + goto next_class;
> +
> + continue;
> + }
> +
> + /*
> + * Optimize the 'normal' case where there aren't any
> + * cookies and we don't need to sync up.
> + */
> + if (i == cpu && !need_sync && !p->core_cookie) {
> + next = p;
> + goto done;
> + }
> +
> + rq_i->core_pick = p;
> +
> + /*
> + * If this new candidate is of higher priority than the
> + * previous; and they're incompatible; we need to wipe
> + * the slate and start over. pick_task makes sure that
> + * p's priority is more than max if it doesn't match
> + * max's cookie.
> + *
> + * NOTE: this is a linear max-filter and is thus bounded
> + * in execution time.
> + */
> + if (!max || !cookie_match(max, p)) {
> + struct task_struct *old_max = max;
> +
> + rq->core->core_cookie = p->core_cookie;
> + max = p;
> +
> + if (old_max) {
> + for_each_cpu(j, smt_mask) {
> + if (j == i)
> + continue;
> +
> + cpu_rq(j)->core_pick = NULL;
> + }
> + goto again;
> + } else {
> + /*
> + * Once we select a task for a cpu, we
> + * should not be doing an unconstrained
> + * pick because it might starve a task
> + * on a forced idle cpu.
> + */
> + need_sync = true;
> + }
> +
> + }
> + }
> +next_class:;
> + }
> +
> + rq->core->core_pick_seq = rq->core->core_task_seq;
> + next = rq->core_pick;
> + rq->core_sched_seq = rq->core->core_pick_seq;
> +
> + /*
> + * Reschedule siblings
> + *
> + * NOTE: L1TF -- at this point we're no longer running the old task and
> + * sending an IPI (below) ensures the sibling will no longer be running
> + * their task. This ensures there is no inter-sibling overlap between
> + * non-matching user state.
> + */
> + for_each_cpu(i, smt_mask) {
> + struct rq *rq_i = cpu_rq(i);
> +
> + if (cpu_is_offline(i))
> + continue;

Another one; please explain how an offline cpu can be part of the
smt_mask. Last time I checked it got cleared in stop-machine.

> +
> + WARN_ON_ONCE(!rq_i->core_pick);
> +
> + if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
> + rq_i->core_forceidle = true;
> +
> + if (i == cpu)
> + continue;
> +
> + if (rq_i->curr != rq_i->core_pick)
> + resched_curr(rq_i);
> +
> + /* Did we break L1TF mitigation requirements? */
> + WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));

That comment is misleading...

> + }
> +
> +done:
> + set_next_task(rq, next);
> + return next;
> +}

----8<----

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a9eeef896c78..8432de767730 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4080,6 +4080,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> update_min_vruntime(cfs_rq);
> }
>
> +static inline bool
> +__entity_slice_used(struct sched_entity *se)
> +{
> + return (se->sum_exec_runtime - se->prev_sum_exec_runtime) >
> + sched_slice(cfs_rq_of(se), se);
> +}
> +
> /*
> * Preempt the current task with a newly woken task if needed:
> */
> @@ -10285,6 +10292,34 @@ static void core_sched_deactivate_fair(struct rq *rq)
> #endif
> #endif /* CONFIG_SMP */
>
> +#ifdef CONFIG_SCHED_CORE
> +/*
> + * If runqueue has only one task which used up its slice and
> + * if the sibling is forced idle, then trigger schedule
> + * to give forced idle task a chance.
> + */
> +static void resched_forceidle_sibling(struct rq *rq, struct sched_entity *se)
> +{
> + int cpu = cpu_of(rq), sibling_cpu;
> + if (rq->cfs.nr_running > 1 || !__entity_slice_used(se))
> + return;
> +
> + for_each_cpu(sibling_cpu, cpu_smt_mask(cpu)) {
> + struct rq *sibling_rq;
> + if (sibling_cpu == cpu)
> + continue;
> + if (cpu_is_offline(sibling_cpu))
> + continue;
> +
> + sibling_rq = cpu_rq(sibling_cpu);
> + if (sibling_rq->core_forceidle) {
> + resched_curr(sibling_rq);
> + }
> + }
> +}
> +#endif
> +
> +
> /*
> * scheduler tick hitting a task of our scheduling class.
> *
> @@ -10308,6 +10343,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>
> update_misfit_status(curr, rq);
> update_overutilized_status(task_rq(curr));
> +
> +#ifdef CONFIG_SCHED_CORE
> + if (sched_core_enabled(rq))
> + resched_forceidle_sibling(rq, &curr->se);
> +#endif
> }
>
> /*

This ^ seems like it should be in its own patch.

> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 03d502357599..a829e26fa43a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1003,11 +1003,16 @@ struct rq {
> #ifdef CONFIG_SCHED_CORE
> /* per rq */
> struct rq *core;
> + struct task_struct *core_pick;
> unsigned int core_enabled;
> + unsigned int core_sched_seq;
> struct rb_root core_tree;
> + bool core_forceidle;

Someone forgot that _Bool shouldn't be part of composite types?

> /* shared state */
> unsigned int core_task_seq;
> + unsigned int core_pick_seq;
> + unsigned long core_cookie;
> #endif
> };
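
For what it's worth, one conventional way to address the _Bool remark above
is simply to give the flag an integer type inside struct rq; the exact
replacement type below is an assumption, nothing in this thread spells it
out:

-	bool			core_forceidle;
+	unsigned int		core_forceidle;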

2020-04-16 03:42:03

by Yu Chen

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Wed, Mar 04, 2020 at 04:59:57PM +0000, vpillai wrote:
> From: Peter Zijlstra <[email protected]>
>
> Instead of only selecting a local task, select a task for all SMT
> siblings for every reschedule on the core (irrespective of which logical
> CPU does the reschedule).
>
> There could be races in core scheduler where a CPU is trying to pick
> a task for its sibling in core scheduler, when that CPU has just been
> offlined. We should not schedule any tasks on the CPU in this case.
> Return an idle task in pick_next_task for this situation.
>
> NOTE: there is still potential for siblings rivalry.
> NOTE: this is far too complicated; but thus far I've failed to
> simplify it further.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Julien Desfossez <[email protected]>
> Signed-off-by: Vineeth Remanan Pillai <[email protected]>
> Signed-off-by: Aaron Lu <[email protected]>
> Signed-off-by: Tim Chen <[email protected]>
> ---
[cut]

Hi Vineeth,
A NULL pointer exception was found when testing V5 on top of stable
v5.6.2. We tried the patch Peter suggested, and the NULL pointer has
not been seen since. We don't know whether this change would help
mitigate the symptom, but it should do no harm to test with this fix applied.

Thanks,
Chenyu

From 6828eaf4611eeb3e1bad3b9a0d4ec53c6fa01fe3 Mon Sep 17 00:00:00 2001
From: Chen Yu <[email protected]>
Date: Thu, 16 Apr 2020 10:51:07 +0800
Subject: [PATCH] sched: Fix pick_next_task() race condition in core scheduling

As Peter mentioned, commit 6e2df0581f56 ("sched: Fix pick_next_task()
vs 'change' pattern race") fixed a race condition due to rq->lock being
improperly released after put_prev_task(); backport this fix to core
scheduling's pick_next_task() as well.

Without this fix, Aubrey, Long and I hit a NULL pointer exception within
one hour when running RDT MBA (Intel Resource Director Technology Memory
Bandwidth Allocation) benchmarks on a 36-core (72 HT) platform; the
offending code dereferences a NULL sched_entity:

[ 3618.429053] BUG: kernel NULL pointer dereference, address: 0000000000000160
[ 3618.429039] RIP: 0010:pick_task_fair+0x2e/0xa0
[ 3618.429042] RSP: 0018:ffffc90000317da8 EFLAGS: 00010046
[ 3618.429044] RAX: 0000000000000000 RBX: ffff88afdf4ad100 RCX: 0000000000000001
[ 3618.429045] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88afdf4ad100
[ 3618.429045] RBP: ffffc90000317dc0 R08: 0000000000000048 R09: 0100000000100000
[ 3618.429046] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
[ 3618.429047] R13: 000000000002d080 R14: ffff88afdf4ad080 R15: 0000000000000014
[ 3618.429048] ? pick_task_fair+0x48/0xa0
[ 3618.429048] pick_next_task+0x34c/0x7e0
[ 3618.429049] ? tick_program_event+0x44/0x70
[ 3618.429049] __schedule+0xee/0x5d0
[ 3618.429050] schedule_idle+0x2c/0x40
[ 3618.429051] do_idle+0x175/0x280
[ 3618.429051] cpu_startup_entry+0x1d/0x30
[ 3618.429052] start_secondary+0x169/0x1c0
[ 3618.429052] secondary_startup_64+0xa4/0xb0

With this patch applied, no NULL pointer exception has been seen within
14 hours of testing so far. Although there is no direct evidence that this
fix solves the issue, it does fix a potential race condition.

Signed-off-by: Chen Yu <[email protected]>
---
kernel/sched/core.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 02495d44870f..ef101a3ef583 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4477,9 +4477,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return next;
}

- prev->sched_class->put_prev_task(rq, prev);
- if (!rq->nr_running)
- newidle_balance(rq, rf);
+
+#ifdef CONFIG_SMP
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
+ if (class->balance(rq, prev, rf))
+ break;
+ }
+#endif
+ put_prev_task(rq, prev);

smt_mask = cpu_smt_mask(cpu);

--
2.20.1

2020-04-16 20:46:27

by Vineeth Remanan Pillai

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

> Hi Vineeth,
> A NULL pointer exception was found when testing V5 on top of stable
> v5.6.2. We tried the patch Peter suggested, and the NULL pointer has
> not been seen since. We don't know whether this change would help
> mitigate the symptom, but it should do no harm to test with this fix applied.
>

Thanks for the patch Chenyu!

I was also looking into this as part of addressing Peter's comments. I
shall include this patch in v6 along with addressing all the issues
that Peter pointed out in this thread.

Thanks,
Vineeth

2020-04-16 23:36:46

by Tim Chen

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.



On 4/14/20 6:35 AM, Peter Zijlstra wrote:
> On Wed, Mar 04, 2020 at 04:59:57PM +0000, vpillai wrote:
>> +static struct task_struct *
>> +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>> +{
>> + struct task_struct *next, *max = NULL;
>> + const struct sched_class *class;
>> + const struct cpumask *smt_mask;
>> + int i, j, cpu;
>> + bool need_sync = false;
>
> AFAICT that assignment is superfluous. Also, you violated the inverse
> x-mas tree.
>
>> +
>> + cpu = cpu_of(rq);
>> + if (cpu_is_offline(cpu))
>> + return idle_sched_class.pick_next_task(rq);
>
> Are we actually hitting this one?
>

I did hit this race when I was testing taking cpu offline and online,
which prompted the check of cpu being offline.

Tim

2020-04-17 11:01:36

by Peter Zijlstra

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Thu, Apr 16, 2020 at 04:32:28PM -0700, Tim Chen wrote:
>
>
> On 4/14/20 6:35 AM, Peter Zijlstra wrote:
> > On Wed, Mar 04, 2020 at 04:59:57PM +0000, vpillai wrote:
> >> +static struct task_struct *
> >> +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> >> +{
> >> + struct task_struct *next, *max = NULL;
> >> + const struct sched_class *class;
> >> + const struct cpumask *smt_mask;
> >> + int i, j, cpu;
> >> + bool need_sync = false;
> >
> > AFAICT that assignment is superfluous. Also, you violated the inverse
> > x-mas tree.
> >
> >> +
> >> + cpu = cpu_of(rq);
> >> + if (cpu_is_offline(cpu))
> >> + return idle_sched_class.pick_next_task(rq);
> >
> > Are we actually hitting this one?
> >
>
> I did hit this race when I was testing taking cpu offline and online,
> which prompted the check of cpu being offline.

This is the schedule from the stop task to the idle task I presume,
there should really not be any other. And at that point the rq had
better be empty, so why didn't the normal task selection work?

2020-04-17 11:20:34

by Peter Zijlstra

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Thu, Apr 16, 2020 at 11:39:05AM +0800, Chen Yu wrote:

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 02495d44870f..ef101a3ef583 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4477,9 +4477,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> return next;
> }
>
> - prev->sched_class->put_prev_task(rq, prev);
> - if (!rq->nr_running)
> - newidle_balance(rq, rf);
> +
> +#ifdef CONFIG_SMP
> + for_class_range(class, prev->sched_class, &idle_sched_class) {
> + if (class->balance(rq, prev, rf))
> + break;
> + }
> +#endif
> + put_prev_task(rq, prev);
>
> smt_mask = cpu_smt_mask(cpu);

Instead of duplicating that, how about you put the existing copy in a
function to share? finish_prev_task() perhaps?

Also, can you please make newidle_balance() static again; I forgot doing
that in 6e2df0581f56, which would've made you notice this sooner I
suppose.
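
A rough sketch of what that shared helper could look like, lifted from the
balance loop in Chen Yu's patch quoted above; the finish_prev_task() name is
only the suggestion made here, not a final form:

/*
 * Sketch: factor the balance pass + put_prev_task() sequence out of
 * __pick_next_task() so the core scheduling path can call it too.
 */
static void finish_prev_task(struct rq *rq, struct task_struct *prev,
			     struct rq_flags *rf)
{
#ifdef CONFIG_SMP
	const struct sched_class *class;

	/*
	 * Give each class, from prev's class on down, a chance to run its
	 * balance callback before prev is put; stop at the first class
	 * that reports having done something.
	 */
	for_class_range(class, prev->sched_class, &idle_sched_class) {
		if (class->balance(rq, prev, rf))
			break;
	}
#endif

	put_prev_task(rq, prev);
}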

2020-04-19 15:33:53

by Yu Chen

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Fri, Apr 17, 2020 at 7:18 PM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Apr 16, 2020 at 11:39:05AM +0800, Chen Yu wrote:
>
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 02495d44870f..ef101a3ef583 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -4477,9 +4477,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > return next;
> > }
> >
> > - prev->sched_class->put_prev_task(rq, prev);
> > - if (!rq->nr_running)
> > - newidle_balance(rq, rf);
> > +
> > +#ifdef CONFIG_SMP
> > + for_class_range(class, prev->sched_class, &idle_sched_class) {
> > + if (class->balance(rq, prev, rf))
> > + break;
> > + }
> > +#endif
> > + put_prev_task(rq, prev);
> >
> > smt_mask = cpu_smt_mask(cpu);
>
> Instead of duplicating that, how about you put the existing copy in a
> function to share? finish_prev_task() perhaps?
>
> Also, can you please make newidle_balance() static again; I forgot doing
> that in 6e2df0581f56, which would've made you notice this sooner I
> suppose.
Okay, I'll do that,

Thanks,
Chenyu

2020-05-21 23:16:48

by Joel Fernandes

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Wed, Mar 04, 2020 at 04:59:57PM +0000, vpillai wrote:
> From: Peter Zijlstra <[email protected]>
>
> Instead of only selecting a local task, select a task for all SMT
> siblings for every reschedule on the core (irrespective of which logical
> CPU does the reschedule).
>
> There could be races in core scheduler where a CPU is trying to pick
> a task for its sibling in core scheduler, when that CPU has just been
> offlined. We should not schedule any tasks on the CPU in this case.
> Return an idle task in pick_next_task for this situation.
>
> NOTE: there is still potential for siblings rivalry.
> NOTE: this is far too complicated; but thus far I've failed to
> simplify it further.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Julien Desfossez <[email protected]>
> Signed-off-by: Vineeth Remanan Pillai <[email protected]>
> Signed-off-by: Aaron Lu <[email protected]>
> Signed-off-by: Tim Chen <[email protected]>
> ---
> kernel/sched/core.c | 274 ++++++++++++++++++++++++++++++++++++++++++-
> kernel/sched/fair.c | 40 +++++++
> kernel/sched/sched.h | 6 +-
> 3 files changed, 318 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 445f0d519336..9a1bd236044e 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4253,7 +4253,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
> * Pick up the highest-prio task:
> */
> static inline struct task_struct *
> -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> const struct sched_class *class;
> struct task_struct *p;
> @@ -4309,6 +4309,273 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> BUG();
> }
>
> +#ifdef CONFIG_SCHED_CORE
> +
> +static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
> +{
> + return is_idle_task(a) || (a->core_cookie == cookie);
> +}
> +
> +static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> +{
> + if (is_idle_task(a) || is_idle_task(b))
> + return true;
> +
> + return a->core_cookie == b->core_cookie;
> +}
> +
> +// XXX fairness/fwd progress conditions
> +/*
> + * Returns
> + * - NULL if there is no runnable task for this class.
> + * - the highest priority task for this runqueue if it matches
> + * rq->core->core_cookie or its priority is greater than max.
> + * - Else returns idle_task.
> + */
> +static struct task_struct *
> +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
> +{
> + struct task_struct *class_pick, *cookie_pick;
> + unsigned long cookie = rq->core->core_cookie;
> +
> + class_pick = class->pick_task(rq);
> + if (!class_pick)
> + return NULL;
> +
> + if (!cookie) {
> + /*
> + * If class_pick is tagged, return it only if it has
> + * higher priority than max.
> + */
> + if (max && class_pick->core_cookie &&
> + prio_less(class_pick, max))
> + return idle_sched_class.pick_task(rq);
> +
> + return class_pick;
> + }
> +
> + /*
> + * If class_pick is idle or matches cookie, return early.
> + */
> + if (cookie_equals(class_pick, cookie))
> + return class_pick;
> +
> + cookie_pick = sched_core_find(rq, cookie);
> +
> + /*
> + * If class > max && class > cookie, it is the highest priority task on
> + * the core (so far) and it must be selected, otherwise we must go with
> + * the cookie pick in order to satisfy the constraint.
> + */
> + if (prio_less(cookie_pick, class_pick) &&
> + (!max || prio_less(max, class_pick)))
> + return class_pick;
> +
> + return cookie_pick;
> +}

I've been hating on this pick_task() routine for a while now :-). If we add
the task to the tag tree as Peter suggested at OSPM for that other issue
Vineeth found, it seems it could be simpler.

This has just been near a compiler so far but how about:

---8<-----------------------

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 005d7f7323e2d..81e23252b6c99 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -182,9 +182,6 @@ static void sched_core_enqueue(struct rq *rq, struct task_struct *p)

rq->core->core_task_seq++;

- if (!p->core_cookie)
- return;
-
node = &rq->core_tree.rb_node;
parent = *node;

@@ -215,7 +212,7 @@ static void sched_core_dequeue(struct rq *rq, struct task_struct *p)

void sched_core_add(struct rq *rq, struct task_struct *p)
{
- if (p->core_cookie && task_on_rq_queued(p))
+ if (task_on_rq_queued(p))
sched_core_enqueue(rq, p);
}

@@ -4563,36 +4560,32 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
if (!class_pick)
return NULL;

- if (!cookie) {
- /*
- * If class_pick is tagged, return it only if it has
- * higher priority than max.
- */
- if (max && class_pick->core_cookie &&
- prio_less(class_pick, max))
- return idle_sched_class.pick_task(rq);
-
+ if (!max)
return class_pick;
- }

- /*
- * If class_pick is idle or matches cookie, return early.
- */
+ /* Make sure the current max's cookie is core->core_cookie */
+ WARN_ON_ONCE(max->core_cookie != cookie);
+
+ /* If class_pick is idle or matches cookie, play nice. */
if (cookie_equals(class_pick, cookie))
return class_pick;

- cookie_pick = sched_core_find(rq, cookie);
+ /* If class_pick is highest prio, trump max. */
+ if (prio_less(max, class_pick)) {
+
+ /* .. but not before checking if cookie trumps class. */
+ cookie_pick = sched_core_find(rq, cookie);
+ if (prio_less(class_pick, cookie_pick))
+ return cookie_pick;

- /*
- * If class > max && class > cookie, it is the highest priority task on
- * the core (so far) and it must be selected, otherwise we must go with
- * the cookie pick in order to satisfy the constraint.
- */
- if (prio_less(cookie_pick, class_pick) &&
- (!max || prio_less(max, class_pick)))
return class_pick;
+ }

- return cookie_pick;
+ /*
+ * We get here if class_pick was incompatible with max
+ * and lower prio than max. So we have nothing.
+ */
+ return idle_sched_class.pick_task(rq);
}

static struct task_struct *

2020-05-21 23:20:19

by Joel Fernandes

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Thu, May 21, 2020 at 07:14:26PM -0400, Joel Fernandes wrote:
> On Wed, Mar 04, 2020 at 04:59:57PM +0000, vpillai wrote:
[snip]
> > + /*
> > + * If class_pick is idle or matches cookie, return early.
> > + */
> > + if (cookie_equals(class_pick, cookie))
> > + return class_pick;
> > +
> > + cookie_pick = sched_core_find(rq, cookie);
> > +
> > + /*
> > + * If class > max && class > cookie, it is the highest priority task on
> > + * the core (so far) and it must be selected, otherwise we must go with
> > + * the cookie pick in order to satisfy the constraint.
> > + */
> > + if (prio_less(cookie_pick, class_pick) &&
> > + (!max || prio_less(max, class_pick)))
> > + return class_pick;
> > +
> > + return cookie_pick;
> > +}
>
> I've been hating on this pick_task() routine for a while now :-). If we add
> the task to the tag tree as Peter suggested at OSPM for that other issue
> Vineeth found, it seems it could be simpler.

Sorry, I meant adding a 0-tagged (no cookie) task to the tag tree.

thanks,

- Joel

2020-05-22 02:38:44

by Joel Fernandes

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Thu, May 21, 2020 at 07:14:26PM -0400, Joel Fernandes wrote:
> On Wed, Mar 04, 2020 at 04:59:57PM +0000, vpillai wrote:
> > From: Peter Zijlstra <[email protected]>
> >
> > Instead of only selecting a local task, select a task for all SMT
> > siblings for every reschedule on the core (irrespective of which logical
> > CPU does the reschedule).
> >
> > There could be races in core scheduler where a CPU is trying to pick
> > a task for its sibling in core scheduler, when that CPU has just been
> > offlined. We should not schedule any tasks on the CPU in this case.
> > Return an idle task in pick_next_task for this situation.
> >
> > NOTE: there is still potential for siblings rivalry.
> > NOTE: this is far too complicated; but thus far I've failed to
> > simplify it further.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > Signed-off-by: Julien Desfossez <[email protected]>
> > Signed-off-by: Vineeth Remanan Pillai <[email protected]>
> > Signed-off-by: Aaron Lu <[email protected]>
> > Signed-off-by: Tim Chen <[email protected]>
> > ---
> > kernel/sched/core.c | 274 ++++++++++++++++++++++++++++++++++++++++++-
> > kernel/sched/fair.c | 40 +++++++
> > kernel/sched/sched.h | 6 +-
> > 3 files changed, 318 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 445f0d519336..9a1bd236044e 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -4253,7 +4253,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
> > * Pick up the highest-prio task:
> > */
> > static inline struct task_struct *
> > -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > {
> > const struct sched_class *class;
> > struct task_struct *p;
> > @@ -4309,6 +4309,273 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > BUG();
> > }
> >
> > +#ifdef CONFIG_SCHED_CORE
> > +
> > +static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
> > +{
> > + return is_idle_task(a) || (a->core_cookie == cookie);
> > +}
> > +
> > +static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> > +{
> > + if (is_idle_task(a) || is_idle_task(b))
> > + return true;
> > +
> > + return a->core_cookie == b->core_cookie;
> > +}
> > +
> > +// XXX fairness/fwd progress conditions
> > +/*
> > + * Returns
> > + * - NULL if there is no runnable task for this class.
> > + * - the highest priority task for this runqueue if it matches
> > + * rq->core->core_cookie or its priority is greater than max.
> > + * - Else returns idle_task.
> > + */
> > +static struct task_struct *
> > +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
> > +{
> > + struct task_struct *class_pick, *cookie_pick;
> > + unsigned long cookie = rq->core->core_cookie;
> > +
> > + class_pick = class->pick_task(rq);
> > + if (!class_pick)
> > + return NULL;
> > +
> > + if (!cookie) {
> > + /*
> > + * If class_pick is tagged, return it only if it has
> > + * higher priority than max.
> > + */
> > + if (max && class_pick->core_cookie &&
> > + prio_less(class_pick, max))
> > + return idle_sched_class.pick_task(rq);
> > +
> > + return class_pick;
> > + }
> > +
> > + /*
> > + * If class_pick is idle or matches cookie, return early.
> > + */
> > + if (cookie_equals(class_pick, cookie))
> > + return class_pick;
> > +
> > + cookie_pick = sched_core_find(rq, cookie);
> > +
> > + /*
> > + * If class > max && class > cookie, it is the highest priority task on
> > + * the core (so far) and it must be selected, otherwise we must go with
> > + * the cookie pick in order to satisfy the constraint.
> > + */
> > + if (prio_less(cookie_pick, class_pick) &&
> > + (!max || prio_less(max, class_pick)))
> > + return class_pick;
> > +
> > + return cookie_pick;
> > +}
>
> I've been hating on this pick_task() routine for a while now :-). If we add
> the task to the tag tree as Peter suggested at OSPM for that other issue
> Vineeth found, it seems it could be simpler.
>
> This has just been near a compiler so far but how about:

Discussed a lot with Vineeth. Below is an improved version of the pick_task()
simplification.

It also handles the following "bug" in the existing code that Vineeth
brought up in OSPM: Suppose 2 siblings of a core: rq 1 and rq 2.

In priority order (high to low), say we have the tasks:
A - untagged (rq 1)
B - tagged (rq 2)
C - untagged (rq 2)

Say, B and C are in the same scheduling class.

When the pick_next_task() loop runs, it looks at rq 1 and max is A, A is
tentatively selected for rq 1. Then it looks at rq 2 and the class_pick is B.
But that's not compatible with A. So rq 2 gets forced idle.

In reality, rq 2 could have run C instead of idle. The fix is to add C to the
tag tree as Peter suggested in OSPM.

Updated diff below:

---8<-----------------------

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 005d7f7323e2d..625377f393ed3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -182,9 +182,6 @@ static void sched_core_enqueue(struct rq *rq, struct task_struct *p)

rq->core->core_task_seq++;

- if (!p->core_cookie)
- return;
-
node = &rq->core_tree.rb_node;
parent = *node;

@@ -215,7 +212,7 @@ static void sched_core_dequeue(struct rq *rq, struct task_struct *p)

void sched_core_add(struct rq *rq, struct task_struct *p)
{
- if (p->core_cookie && task_on_rq_queued(p))
+ if (task_on_rq_queued(p))
sched_core_enqueue(rq, p);
}

@@ -4556,43 +4553,57 @@ void sched_core_irq_exit(void)
static struct task_struct *
pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
{
- struct task_struct *class_pick, *cookie_pick;
+ struct task_struct *class_pick, *cookie_pick, *rq_pick;
unsigned long cookie = rq->core->core_cookie;

class_pick = class->pick_task(rq);
if (!class_pick)
return NULL;

- if (!cookie) {
- /*
- * If class_pick is tagged, return it only if it has
- * higher priority than max.
- */
- if (max && class_pick->core_cookie &&
- prio_less(class_pick, max))
- return idle_sched_class.pick_task(rq);
+ if (!max)
+ return class_pick;
+
+ /* Make sure the current max's cookie is core->core_cookie */
+ WARN_ON_ONCE(max->core_cookie != cookie);

+ /* Try to play really nice: see if the class's cookie works. */
+ if (cookie_equals(class_pick, cookie))
return class_pick;
- }

/*
- * If class_pick is idle or matches cookie, return early.
+ * From here on, we must return class_pick, cookie_pick or idle.
+ * Following are the cases:
+ * 1 - lowest prio.
+ * 3 - highest prio.
+ *
+ *   max   class   cookie   outcome
+ *    1      2       3      cookie
+ *    1      3       2      class
+ *    2      1       3      cookie
+ *    2      3       1      class
+ *    3      1       2      cookie
+ *    3      2       1      cookie
+ *    3      2       -      return idle (when no cookie task).
*/
- if (cookie_equals(class_pick, cookie))
- return class_pick;

+ /* First try to find the highest prio of (cookie, class and max). */
cookie_pick = sched_core_find(rq, cookie);
+ if (cookie_pick && prio_less(class_pick, cookie_pick))
+ rq_pick = cookie_pick;
+ else
+ rq_pick = class_pick;
+ if (prio_less(max, rq_pick))
+ return rq_pick;
+
+ /* If we max was greatest, then see if there was a cookie. */
+ if (cookie_pick)
+ return cookie_pick;

/*
- * If class > max && class > cookie, it is the highest priority task on
- * the core (so far) and it must be selected, otherwise we must go with
- * the cookie pick in order to satisfy the constraint.
+ * We get here if class_pick was incompatible with max
+ * and lower prio than max. So we have nothing.
*/
- if (prio_less(cookie_pick, class_pick) &&
- (!max || prio_less(max, class_pick)))
- return class_pick;
-
- return cookie_pick;
+ return idle_sched_class.pick_task(rq);
}

static struct task_struct *

2020-05-22 03:46:40

by Aaron Lu

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Thu, May 21, 2020 at 10:35:56PM -0400, Joel Fernandes wrote:
> Discussed a lot with Vineeth. Below is an improved version of the pick_task()
> simplification.
>
> It also handles the following "bug" in the existing code that Vineeth
> brought up in OSPM: Suppose 2 siblings of a core: rq 1 and rq 2.
>
> In priority order (high to low), say we have the tasks:
> A - untagged (rq 1)
> B - tagged (rq 2)
> C - untagged (rq 2)
>
> Say, B and C are in the same scheduling class.
>
> When the pick_next_task() loop runs, it looks at rq 1 and max is A, A is
> tentatively selected for rq 1. Then it looks at rq 2 and the class_pick is B.
> But that's not compatible with A. So rq 2 gets forced idle.
>
> In reality, rq 2 could have run C instead of idle. The fix is to add C to the
> tag tree as Peter suggested in OSPM.

I like the idea of adding untagged task to the core tree.

> Updated diff below:
>
> ---8<-----------------------
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 005d7f7323e2d..625377f393ed3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -182,9 +182,6 @@ static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
>
> rq->core->core_task_seq++;
>
> - if (!p->core_cookie)
> - return;
> -
> node = &rq->core_tree.rb_node;
> parent = *node;
>
> @@ -215,7 +212,7 @@ static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
>
> void sched_core_add(struct rq *rq, struct task_struct *p)
> {
> - if (p->core_cookie && task_on_rq_queued(p))
> + if (task_on_rq_queued(p))
> sched_core_enqueue(rq, p);
> }

It appears there are other call sites of sched_core_enqueue() where
core_cookie is checked: cpu_cgroup_fork() and __sched_write_tag().

2020-05-22 20:15:28

by Joel Fernandes

Subject: Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

On Fri, May 22, 2020 at 11:44:06AM +0800, Aaron Lu wrote:
[...]
> > Updated diff below:
> >
> > ---8<-----------------------
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 005d7f7323e2d..625377f393ed3 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -182,9 +182,6 @@ static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
> >
> > rq->core->core_task_seq++;
> >
> > - if (!p->core_cookie)
> > - return;
> > -
> > node = &rq->core_tree.rb_node;
> > parent = *node;
> >
> > @@ -215,7 +212,7 @@ static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> >
> > void sched_core_add(struct rq *rq, struct task_struct *p)
> > {
> > - if (p->core_cookie && task_on_rq_queued(p))
> > + if (task_on_rq_queued(p))
> > sched_core_enqueue(rq, p);
> > }
>
> It appears there are other call sites of sched_core_enqueue() where
> core_cookie is checked: cpu_cgroup_fork() and __sched_write_tag().

Thanks, but looks like pick_task()'s caller also makes various assumptions
about cookie == 0 so all that needs to be vetted again I think.

- Joel