2023-05-31 13:10:31

by Peter Zijlstra

Subject: [PATCH 05/15] sched/fair: Implement an EEVDF like policy

Where CFS is currently a WFQ based scheduler with only a single knob,
the weight, the addition of a second, latency oriented parameter makes
something like WF2Q or EEVDF a much better fit.

Specifically, EEVDF does EDF like scheduling in the left half of the
tree -- those entities that are owed service. Except because this is a
virtual time scheduler, the deadlines are in virtual time as well,
which is what allows over-subscription.

EEVDF has two parameters:

- weight, or time-slope; which is mapped to nice just as before
- request size, or slice length; which is used to compute
the virtual deadline as: vd_i = ve_i + r_i/w_i

Basically, by setting a smaller slice, the deadline will be earlier
and the task will be more eligible and run earlier.
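
For instance (illustrative numbers only, not taken from the patch): two
nice-0 tasks that are both at ve_i = 100ms of virtual time, one requesting
a 3ms slice and the other a 0.75ms slice, get virtual deadlines of 103ms
and 100.75ms respectively, so once both are eligible the smaller request
is served first.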

Tick driven preemption is driven by request/slice completion; while
wakeup preemption is driven by the deadline.

Because the tree is now effectively an interval tree, and the
selection is no longer 'leftmost', over-scheduling is less of a
problem.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched.h | 4
kernel/sched/core.c | 1
kernel/sched/debug.c | 6
kernel/sched/fair.c | 338 +++++++++++++++++++++++++++++++++++++++++-------
kernel/sched/features.h | 3
kernel/sched/sched.h | 4
6 files changed, 308 insertions(+), 48 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -550,6 +550,9 @@ struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
+ u64 deadline;
+ u64 min_deadline;
+
struct list_head group_node;
unsigned int on_rq;

@@ -558,6 +561,7 @@ struct sched_entity {
u64 prev_sum_exec_runtime;
u64 vruntime;
s64 vlag;
+ u64 slice;

u64 nr_migrations;

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4464,6 +4464,7 @@ static void __sched_fork(unsigned long c
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
+ p->se.slice = sysctl_sched_min_granularity;
INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -581,9 +581,13 @@ print_task(struct seq_file *m, struct rq
else
SEQ_printf(m, " %c", task_state_to_char(p));

- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>

#include <asm/switch_to.h>

@@ -347,6 +348,16 @@ static u64 __calc_delta(u64 delta_exec,
return mul_u64_u32_shr(delta_exec, fact, shift);
}

+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}

const struct sched_class fair_sched_class;

@@ -717,11 +728,62 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)

/*
* lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
*/
void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ s64 lag, limit;
+
SCHED_WARN_ON(!se->on_rq);
- se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ * \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= entity_key(cfs_rq, se) * load;
}

static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -740,8 +802,8 @@ static u64 __update_min_vruntime(struct

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

u64 vruntime = cfs_rq->min_vruntime;

@@ -752,9 +814,7 @@ static void update_min_vruntime(struct c
curr = NULL;
}

- if (leftmost) { /* non-empty tree */
- struct sched_entity *se = __node_2_se(leftmost);
-
+ if (se) {
if (!curr)
vruntime = se->vruntime;
else
@@ -771,18 +831,50 @@ static inline bool __entity_less(struct
return entity_before(__node_2_se(a), __node_2_se(b));
}

+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (deadline_gt(min_deadline, se, rse))
+ se->min_deadline = rse->min_deadline;
+ }
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+ u64 old_min_deadline = se->min_deadline;
+ struct rb_node *node = &se->run_node;
+
+ se->min_deadline = se->deadline;
+ __update_min_deadline(se, node->rb_right);
+ __update_min_deadline(se, node->rb_left);
+
+ return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+ run_node, min_deadline, min_deadline_update);
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ se->min_deadline = se->deadline;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_deadline_cb);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_deadline_cb);
avg_vruntime_sub(cfs_rq, se);
}

@@ -806,6 +898,97 @@ static struct sched_entity *__pick_next_
return __node_2_se(next);
}

+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ return left;
+}
+
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
+
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /*
+ * If this entity is not eligible, try the left subtree.
+ */
+ if (!entity_eligible(cfs_rq, se)) {
+ node = node->rb_left;
+ continue;
+ }
+
+ /*
+ * If this entity has an earlier deadline than the previous
+ * best, take this one. If it also has the earliest deadline
+ * of its subtree, we're done.
+ */
+ if (!best || deadline_gt(deadline, best, se)) {
+ best = se;
+ if (best->deadline == best->min_deadline)
+ break;
+ }
+
+ /*
+ * If the earlest deadline in this subtree is in the fully
+ * eligible left half of our space, go there.
+ */
+ if (node->rb_left &&
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+ node = node->rb_left;
+ continue;
+ }
+
+ node = node->rb_right;
+ }
+
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
+ best = curr;
+
+ if (unlikely(!best)) {
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ if (left) {
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
+ return left;
+ }
+ }
+
+ return best;
+}
+
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
@@ -840,17 +1023,6 @@ int sched_update_scaling(void)
#endif

/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
-
-/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sched_nr_latency) we have to stretch
@@ -915,6 +1087,48 @@ static u64 sched_slice(struct cfs_rq *cf
return slice;
}

+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
+/*
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
+ */
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return;
+
+ if (sched_feat(EEVDF)) {
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_min_granularity.
+ */
+ se->slice = sysctl_sched_min_granularity;
+
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ if (cfs_rq->nr_running > 1) {
+ resched_curr(rq_of(cfs_rq));
+ clear_buddies(cfs_rq, se);
+ }
+ } else {
+ /*
+ * When many tasks blow up the sched_period; it is possible
+ * that sched_slice() reports unusually large results (when
+ * many tasks are very light for example). Therefore impose a
+ * maximum.
+ */
+ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
+ }
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
+}
+
#include "pelt.h"
#ifdef CONFIG_SMP

@@ -1047,6 +1261,7 @@ static void update_curr(struct cfs_rq *c
schedstat_add(cfs_rq->exec_clock, delta_exec);

curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);

if (entity_is_task(curr)) {
@@ -3521,6 +3736,14 @@ static void reweight_entity(struct cfs_r
* we need to scale se->vlag when w_i changes.
*/
se->vlag = div_s64(se->vlag * old_weight, weight);
+ } else {
+ s64 deadline = se->deadline - se->vruntime;
+ /*
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ */
+ deadline = div_s64(deadline * old_weight, weight);
+ se->deadline = se->vruntime + deadline;
}

#ifdef CONFIG_SMP
@@ -4871,6 +5094,7 @@ static inline bool entity_is_long_sleepe
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
+ u64 vslice = calc_delta_fair(se->slice, se);
u64 vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;

@@ -4942,9 +5166,9 @@ place_entity(struct cfs_rq *cfs_rq, stru
*/
load = cfs_rq->avg_load;
if (curr && curr->on_rq)
- load += curr->load.weight;
+ load += scale_load_down(curr->load.weight);

- lag *= load + se->load.weight;
+ lag *= load + scale_load_down(se->load.weight);
if (WARN_ON_ONCE(!load))
load = 1;
lag = div_s64(lag, load);
@@ -4985,6 +5209,19 @@ place_entity(struct cfs_rq *cfs_rq, stru
}

se->vruntime = vruntime;
+
+ /*
+ * When joining the competition; the exisiting tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
+ */
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
+ vslice /= 2;
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -5196,19 +5433,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- unsigned long ideal_runtime, delta_exec;
+ unsigned long delta_exec;
struct sched_entity *se;
s64 delta;

- /*
- * When many tasks blow up the sched_period; it is possible that
- * sched_slice() reports unusually large results (when many tasks are
- * very light for example). Therefore impose a maximum.
- */
- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
-
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
+ if (delta_exec > curr->slice) {
resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
@@ -5232,7 +5462,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq
if (delta < 0)
return;

- if (delta > ideal_runtime)
+ if (delta > curr->slice)
resched_curr(rq_of(cfs_rq));
}

@@ -5287,17 +5517,20 @@ wakeup_preempt_entity(struct sched_entit
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- struct sched_entity *se;
+ struct sched_entity *left, *se;

- /*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
- */
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
+ if (sched_feat(EEVDF)) {
+ /*
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
+ */
+ if (sched_feat(NEXT_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ return cfs_rq->next;
+
+ return pick_eevdf(cfs_rq);
+ }

- se = left; /* ideally we run the leftmost entity */
+ se = left = pick_cfs(cfs_rq, curr);

/*
* Avoid running the skip buddy, if running something else can
@@ -5390,7 +5623,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
return;
#endif

- if (cfs_rq->nr_running > 1)
+ if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1)
check_preempt_tick(cfs_rq, curr);
}

@@ -6396,13 +6629,12 @@ static inline void unthrottle_offline_cf
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);

SCHED_WARN_ON(task_rq(p) != rq);

if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;

if (delta < 0) {
@@ -8122,7 +8354,19 @@ static void check_preempt_wakeup(struct
if (cse_is_idle != pse_is_idle)
return;

- update_curr(cfs_rq_of(se));
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+
+ if (sched_feat(EEVDF)) {
+ /*
+ * XXX pick_eevdf(cfs_rq) != se ?
+ */
+ if (pick_eevdf(cfs_rq) == pse)
+ goto preempt;
+
+ return;
+ }
+
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
@@ -8368,7 +8612,7 @@ static void yield_task_fair(struct rq *r

clear_buddies(cfs_rq, se);

- if (curr->policy != SCHED_BATCH) {
+ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
* Update run-time statistics of the 'current'.
@@ -8381,6 +8625,8 @@ static void yield_task_fair(struct rq *r
*/
rq_clock_skip_update(rq);
}
+ if (sched_feat(EEVDF))
+ se->deadline += calc_delta_fair(se->slice, se);

set_skip_buddy(se);
}
@@ -12136,8 +12382,8 @@ static void rq_offline_fair(struct rq *r
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
{
- u64 slice = sched_slice(cfs_rq_of(se), se);
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;

return (rtime * min_nr_tasks > slice);
}
@@ -12832,7 +13078,7 @@ static unsigned int get_rr_interval_fair
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);

return rr_interval;
}
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -13,6 +13,7 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)

/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -103,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false)

SCHED_FEAT(ALT_PERIOD, true)
SCHED_FEAT(BASE_SLICE, true)
+
+SCHED_FEAT(EEVDF, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2481,9 +2481,10 @@ extern void check_preempt_curr(struct rq
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;

+extern unsigned int sysctl_sched_min_granularity;
+
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_idle_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern int sysctl_resched_latency_warn_ms;
@@ -3507,5 +3508,6 @@ static inline void init_sched_mm_cid(str
#endif

extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);

#endif /* _KERNEL_SCHED_SCHED_H */




Subject: [tip: sched/core] sched/fair: Implement an EEVDF-like scheduling policy

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 147f3efaa24182a21706bca15eab2f3f4630b5fe
Gitweb: https://git.kernel.org/tip/147f3efaa24182a21706bca15eab2f3f4630b5fe
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 31 May 2023 13:58:44 +02:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 19 Jul 2023 09:43:58 +02:00

sched/fair: Implement an EEVDF-like scheduling policy

Where CFS is currently a WFQ based scheduler with only a single knob,
the weight, the addition of a second, latency oriented parameter makes
something like WF2Q or EEVDF a much better fit.

Specifically, EEVDF does EDF like scheduling in the left half of the
tree -- those entities that are owed service. Except because this is a
virtual time scheduler, the deadlines are in virtual time as well,
which is what allows over-subscription.

EEVDF has two parameters:

- weight, or time-slope: which is mapped to nice just as before

- request size, or slice length: which is used to compute
the virtual deadline as: vd_i = ve_i + r_i/w_i

Basically, by setting a smaller slice, the deadline will be earlier
and the task will be more eligible and run earlier.

Tick driven preemption is driven by request/slice completion; while
wakeup preemption is driven by the deadline.

Because the tree is now effectively an interval tree, and the
selection is no longer 'leftmost', over-scheduling is less of a
problem.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
include/linux/sched.h | 4 +-
kernel/sched/core.c | 1 +-
kernel/sched/debug.c | 6 +-
kernel/sched/fair.c | 338 +++++++++++++++++++++++++++++++++------
kernel/sched/features.h | 3 +-
kernel/sched/sched.h | 4 +-
6 files changed, 308 insertions(+), 48 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba1828b..177b3f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -549,6 +549,9 @@ struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
+ u64 deadline;
+ u64 min_deadline;
+
struct list_head group_node;
unsigned int on_rq;

@@ -557,6 +560,7 @@ struct sched_entity {
u64 prev_sum_exec_runtime;
u64 vruntime;
s64 vlag;
+ u64 slice;

u64 nr_migrations;

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84b0d47..e85a2fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4502,6 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
+ p->se.slice = sysctl_sched_min_granularity;
INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e48d2b2..18efc6d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -582,9 +582,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));

- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd12ada..4d3505d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>

#include <asm/switch_to.h>

@@ -347,6 +348,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}

+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}

const struct sched_class fair_sched_class;

@@ -717,11 +728,62 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)

/*
* lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
*/
void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ s64 lag, limit;
+
SCHED_WARN_ON(!se->on_rq);
- se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ * \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= entity_key(cfs_rq, se) * load;
}

static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -740,8 +802,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

u64 vruntime = cfs_rq->min_vruntime;

@@ -752,9 +814,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}

- if (leftmost) { /* non-empty tree */
- struct sched_entity *se = __node_2_se(leftmost);
-
+ if (se) {
if (!curr)
vruntime = se->vruntime;
else
@@ -771,18 +831,50 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}

+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (deadline_gt(min_deadline, se, rse))
+ se->min_deadline = rse->min_deadline;
+ }
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+ u64 old_min_deadline = se->min_deadline;
+ struct rb_node *node = &se->run_node;
+
+ se->min_deadline = se->deadline;
+ __update_min_deadline(se, node->rb_right);
+ __update_min_deadline(se, node->rb_left);
+
+ return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+ run_node, min_deadline, min_deadline_update);
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ se->min_deadline = se->deadline;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_deadline_cb);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_deadline_cb);
avg_vruntime_sub(cfs_rq, se);
}

@@ -806,6 +898,97 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
return __node_2_se(next);
}

+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ return left;
+}
+
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
+
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /*
+ * If this entity is not eligible, try the left subtree.
+ */
+ if (!entity_eligible(cfs_rq, se)) {
+ node = node->rb_left;
+ continue;
+ }
+
+ /*
+ * If this entity has an earlier deadline than the previous
+ * best, take this one. If it also has the earliest deadline
+ * of its subtree, we're done.
+ */
+ if (!best || deadline_gt(deadline, best, se)) {
+ best = se;
+ if (best->deadline == best->min_deadline)
+ break;
+ }
+
+ /*
+ * If the earlest deadline in this subtree is in the fully
+ * eligible left half of our space, go there.
+ */
+ if (node->rb_left &&
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+ node = node->rb_left;
+ continue;
+ }
+
+ node = node->rb_right;
+ }
+
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
+ best = curr;
+
+ if (unlikely(!best)) {
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ if (left) {
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
+ return left;
+ }
+ }
+
+ return best;
+}
+
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
@@ -840,17 +1023,6 @@ int sched_update_scaling(void)
#endif

/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
-
-/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sched_nr_latency) we have to stretch
@@ -915,6 +1087,48 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return slice;
}

+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
+/*
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
+ */
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return;
+
+ if (sched_feat(EEVDF)) {
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_min_granularity.
+ */
+ se->slice = sysctl_sched_min_granularity;
+
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ if (cfs_rq->nr_running > 1) {
+ resched_curr(rq_of(cfs_rq));
+ clear_buddies(cfs_rq, se);
+ }
+ } else {
+ /*
+ * When many tasks blow up the sched_period; it is possible
+ * that sched_slice() reports unusually large results (when
+ * many tasks are very light for example). Therefore impose a
+ * maximum.
+ */
+ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
+ }
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
+}
+
#include "pelt.h"
#ifdef CONFIG_SMP

@@ -1047,6 +1261,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
schedstat_add(cfs_rq->exec_clock, delta_exec);

curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);

if (entity_is_task(curr)) {
@@ -3521,6 +3736,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
* we need to scale se->vlag when w_i changes.
*/
se->vlag = div_s64(se->vlag * old_weight, weight);
+ } else {
+ s64 deadline = se->deadline - se->vruntime;
+ /*
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ */
+ deadline = div_s64(deadline * old_weight, weight);
+ se->deadline = se->vruntime + deadline;
}

#ifdef CONFIG_SMP
@@ -4871,6 +5094,7 @@ static inline bool entity_is_long_sleeper(struct sched_entity *se)
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
+ u64 vslice = calc_delta_fair(se->slice, se);
u64 vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;

@@ -4942,9 +5166,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
*/
load = cfs_rq->avg_load;
if (curr && curr->on_rq)
- load += curr->load.weight;
+ load += scale_load_down(curr->load.weight);

- lag *= load + se->load.weight;
+ lag *= load + scale_load_down(se->load.weight);
if (WARN_ON_ONCE(!load))
load = 1;
lag = div_s64(lag, load);
@@ -4985,6 +5209,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
}

se->vruntime = vruntime;
+
+ /*
+ * When joining the competition; the exisiting tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
+ */
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
+ vslice /= 2;
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -5207,19 +5444,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- unsigned long ideal_runtime, delta_exec;
+ unsigned long delta_exec;
struct sched_entity *se;
s64 delta;

- /*
- * When many tasks blow up the sched_period; it is possible that
- * sched_slice() reports unusually large results (when many tasks are
- * very light for example). Therefore impose a maximum.
- */
- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
-
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
+ if (delta_exec > curr->slice) {
resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
@@ -5243,7 +5473,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
if (delta < 0)
return;

- if (delta > ideal_runtime)
+ if (delta > curr->slice)
resched_curr(rq_of(cfs_rq));
}

@@ -5298,17 +5528,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- struct sched_entity *se;
+ struct sched_entity *left, *se;

- /*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
- */
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
+ if (sched_feat(EEVDF)) {
+ /*
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
+ */
+ if (sched_feat(NEXT_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ return cfs_rq->next;
+
+ return pick_eevdf(cfs_rq);
+ }

- se = left; /* ideally we run the leftmost entity */
+ se = left = pick_cfs(cfs_rq, curr);

/*
* Avoid running the skip buddy, if running something else can
@@ -5401,7 +5634,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
return;
#endif

- if (cfs_rq->nr_running > 1)
+ if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1)
check_preempt_tick(cfs_rq, curr);
}

@@ -6445,13 +6678,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);

SCHED_WARN_ON(task_rq(p) != rq);

if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;

if (delta < 0) {
@@ -8228,7 +8460,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (cse_is_idle != pse_is_idle)
return;

- update_curr(cfs_rq_of(se));
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+
+ if (sched_feat(EEVDF)) {
+ /*
+ * XXX pick_eevdf(cfs_rq) != se ?
+ */
+ if (pick_eevdf(cfs_rq) == pse)
+ goto preempt;
+
+ return;
+ }
+
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
@@ -8474,7 +8718,7 @@ static void yield_task_fair(struct rq *rq)

clear_buddies(cfs_rq, se);

- if (curr->policy != SCHED_BATCH) {
+ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
* Update run-time statistics of the 'current'.
@@ -8487,6 +8731,8 @@ static void yield_task_fair(struct rq *rq)
*/
rq_clock_skip_update(rq);
}
+ if (sched_feat(EEVDF))
+ se->deadline += calc_delta_fair(se->slice, se);

set_skip_buddy(se);
}
@@ -12363,8 +12609,8 @@ static void rq_offline_fair(struct rq *rq)
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
{
- u64 slice = sched_slice(cfs_rq_of(se), se);
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;

return (rtime * min_nr_tasks > slice);
}
@@ -13059,7 +13305,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);

return rr_interval;
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7958a10..60cce1e 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -13,6 +13,7 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)

/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -103,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false)

SCHED_FEAT(ALT_PERIOD, true)
SCHED_FEAT(BASE_SLICE, true)
+
+SCHED_FEAT(EEVDF, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 52a0a4b..aa5b293 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2505,9 +2505,10 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;

+extern unsigned int sysctl_sched_min_granularity;
+
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_idle_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern int sysctl_resched_latency_warn_ms;
@@ -3487,5 +3488,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif

extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);

#endif /* _KERNEL_SCHED_SCHED_H */

2023-09-29 21:43:57

by Benjamin Segall

Subject: Re: [PATCH 05/15] sched/fair: Implement an EEVDF like policy

Peter Zijlstra <[email protected]> writes:

> +
> +/*
> + * Earliest Eligible Virtual Deadline First
> + *
> + * In order to provide latency guarantees for different request sizes
> + * EEVDF selects the best runnable task from two criteria:
> + *
> + * 1) the task must be eligible (must be owed service)
> + *
> + * 2) from those tasks that meet 1), we select the one
> + * with the earliest virtual deadline.
> + *
> + * We can do this in O(log n) time due to an augmented RB-tree. The
> + * tree keeps the entries sorted on service, but also functions as a
> + * heap based on the deadline by keeping:
> + *
> + * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
> + *
> + * Which allows an EDF like search on (sub)trees.
> + */
> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> +{
> + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
> + struct sched_entity *curr = cfs_rq->curr;
> + struct sched_entity *best = NULL;
> +
> + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
> + curr = NULL;
> +
> + while (node) {
> + struct sched_entity *se = __node_2_se(node);
> +
> + /*
> + * If this entity is not eligible, try the left subtree.
> + */
> + if (!entity_eligible(cfs_rq, se)) {
> + node = node->rb_left;
> + continue;
> + }
> +
> + /*
> + * If this entity has an earlier deadline than the previous
> + * best, take this one. If it also has the earliest deadline
> + * of its subtree, we're done.
> + */
> + if (!best || deadline_gt(deadline, best, se)) {
> + best = se;
> + if (best->deadline == best->min_deadline)
> + break;
> + }
> +
> + /*
> + * If the earlest deadline in this subtree is in the fully
> + * eligible left half of our space, go there.
> + */
> + if (node->rb_left &&
> + __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
> + node = node->rb_left;
> + continue;
> + }
> +
> + node = node->rb_right;
> + }

I believe that this can fail to actually find the earliest eligible
deadline, because the earliest deadline (min_deadline) can be in the
right branch, but that se isn't eligible, and the actual target se is in
the left branch. A trivial 3-se example with the nodes represented by
(vruntime, deadline, min_deadline):

        (5,9,7)
       /       \
  (4,8,8)    (6,7,7)

AIUI, here the EEVDF pick should be (4,8,8), but pick_eevdf() will
instead pick (5,9,7), because it goes into the right branch and then
fails eligibility.
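
(Walking the posted loop over that tree: the root (5,9,7) is eligible and
becomes best, but its deadline 9 != min_deadline 7 so we don't stop; the
left child's min_deadline is 8, not 7, so we don't take the left branch
and descend right instead; (6,7,7) then fails entity_eligible(), we step
to its empty left subtree and the loop terminates, returning (5,9,7)
without ever visiting the eligible (4,8,8).)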

I'm not sure how much of a problem this is in practice, either in
frequency or severity, but it probably should be mentioned if it's
an intentional tradeoff.



Thinking out loud, I think that it would be sufficient to recheck via something like

for_each_sched_entity(best) {
	check __node_2_se(best->rb_left)->min_deadline, store in actual_best
}

for the best min_deadline, and then go do a heap lookup in actual_best
to find the se matching that min_deadline.

I think this pass could then be combined with our initial descent for
better cache behavior by keeping track of the best rb_left->min_deadline
each time we take a right branch. We still have to look at up to ~2x the
nodes, but I don't think that's avoidable? I'll expand my quick hack I
used to test my simple case into something of a stress tester and try
some implementations.
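
For reference, the oracle such a stress tester needs is tiny; a minimal
userspace sketch (simplified to equal weights so eligibility is just
v_i <= V; the names here are made up for illustration, not kernel code):

struct ent { unsigned long long vruntime, deadline; };

/* O(n) reference pick: eligible (vruntime <= V) entity with the earliest deadline */
static struct ent *pick_ref(struct ent *e, int n, unsigned long long V)
{
	struct ent *best = NULL;

	for (int i = 0; i < n; i++) {
		if (e[i].vruntime > V)		/* not eligible, owed no service */
			continue;
		if (!best || e[i].deadline < best->deadline)
			best = &e[i];
	}
	return best;
}

Any tree-based pick must agree with this on every randomly generated set.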

2023-09-30 16:18:43

by Benjamin Segall

Subject: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

The old pick_eevdf could fail to find the actual earliest eligible
deadline when it descended to the right looking for min_deadline, but it
turned out that that min_deadline wasn't actually eligible. In that case
we need to go back and search through any left branches we skipped
looking for the actual best _eligible_ min_deadline.

This is more expensive, but still O(log n), and at worst should only
involve descending two branches of the rbtree.
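
Concretely, the first loop now only walks eligible nodes, tracking the
best eligible deadline seen on the path as well as, for each left child
it skips over, the left subtree with the best min_deadline (everything
in such a left branch is eligible); when that loop stops, if the best
skipped subtree has an earlier min_deadline than the best node found on
the path, a second loop descends into it, following min_deadline until
it reaches the entity whose deadline matches it.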

I've run this through a userspace stress test (thank you
tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
corner cases.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Ben Segall <[email protected]>
---
kernel/sched/fair.c | 72 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 58 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c31cda0712f..77e9440b8ab3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -864,18 +864,20 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
*
* se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
*
* Which allows an EDF like search on (sub)trees.
*/
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *curr = cfs_rq->curr;
struct sched_entity *best = NULL;
+ struct sched_entity *best_left = NULL;

if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
+ best = curr;

/*
* Once selected, run a task until it either becomes non-eligible or
* until it gets a new slice. See the HACK in set_next_entity().
*/
@@ -892,45 +894,87 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
node = node->rb_left;
continue;
}

/*
- * If this entity has an earlier deadline than the previous
- * best, take this one. If it also has the earliest deadline
- * of its subtree, we're done.
+ * Now we heap search eligible trees for the best (min_)deadline
*/
- if (!best || deadline_gt(deadline, best, se)) {
+ if (!best || deadline_gt(deadline, best, se))
best = se;
- if (best->deadline == best->min_deadline)
- break;
- }

/*
- * If the earlest deadline in this subtree is in the fully
- * eligible left half of our space, go there.
+ * Every se in a left branch is eligible, keep track of the
+ * branch with the best min_deadline
*/
+ if (node->rb_left) {
+ struct sched_entity *left = __node_2_se(node->rb_left);
+
+ if (!best_left || deadline_gt(min_deadline, best_left, left))
+ best_left = left;
+
+ /*
+ * min_deadline is in the left branch. rb_left and all
+ * descendants are eligible, so immediately switch to the second
+ * loop.
+ */
+ if (left->min_deadline == se->min_deadline)
+ break;
+ }
+
+ /* min_deadline is at this node, no need to look right */
+ if (se->deadline == se->min_deadline)
+ break;
+
+ /* else min_deadline is in the right branch. */
+ node = node->rb_right;
+ }
+
+ /*
+ * We ran into an eligible node which is itself the best.
+ * (Or nr_running == 0 and both are NULL)
+ */
+ if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
+ return best;
+
+ /*
+ * Now best_left and all of its children are eligible, and we are just
+ * looking for deadline == min_deadline
+ */
+ node = &best_left->run_node;
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /* min_deadline is the current node */
+ if (se->deadline == se->min_deadline)
+ return se;
+
+ /* min_deadline is in the left branch */
if (node->rb_left &&
__node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
node = node->rb_left;
continue;
}

+ /* else min_deadline is in the right branch */
node = node->rb_right;
}
+ return NULL;
+}

- if (!best || (curr && deadline_gt(deadline, best, curr)))
- best = curr;
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = __pick_eevdf(cfs_rq);

- if (unlikely(!best)) {
+ if (!se) {
struct sched_entity *left = __pick_first_entity(cfs_rq);
if (left) {
pr_err("EEVDF scheduling fail, picking leftmost\n");
return left;
}
}

- return best;
+ return se;
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
--
2.42.0.582.g8ccd20d70d-goog

2023-10-02 20:55:15

by Peter Zijlstra

Subject: Re: [PATCH 05/15] sched/fair: Implement an EEVDF like policy

On Fri, Sep 29, 2023 at 02:40:31PM -0700, Benjamin Segall wrote:

> > +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> > +{
> > + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
> > + struct sched_entity *curr = cfs_rq->curr;
> > + struct sched_entity *best = NULL;
> > +
> > + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
> > + curr = NULL;
> > +
> > + while (node) {
> > + struct sched_entity *se = __node_2_se(node);
> > +
> > + /*
> > + * If this entity is not eligible, try the left subtree.
> > + */
> > + if (!entity_eligible(cfs_rq, se)) {
> > + node = node->rb_left;
> > + continue;
> > + }
> > +
> > + /*
> > + * If this entity has an earlier deadline than the previous
> > + * best, take this one. If it also has the earliest deadline
> > + * of its subtree, we're done.
> > + */
> > + if (!best || deadline_gt(deadline, best, se)) {
> > + best = se;
> > + if (best->deadline == best->min_deadline)
> > + break;
> > + }
> > +
> > + /*
> > + * If the earlest deadline in this subtree is in the fully
> > + * eligible left half of our space, go there.
> > + */
> > + if (node->rb_left &&
> > + __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
> > + node = node->rb_left;
> > + continue;
> > + }
> > +
> > + node = node->rb_right;
> > + }
>
> I believe that this can fail to actually find the earliest eligible
> deadline, because the earliest deadline (min_deadline) can be in the
> right branch, but that se isn't eligible, and the actual target se is in
> the left branch. A trivial 3-se example with the nodes represented by
> (vruntime, deadline, min_deadline):
>
>         (5,9,7)
>        /       \
>   (4,8,8)    (6,7,7)
>
> AIUI, here the EEVDF pick should be (4,8,8), but pick_eevdf() will
> instead pick (5,9,7), because it goes into the right branch and then
> fails eligibility.
>
> I'm not sure how much of a problem this is in practice, either in
> frequency or severity, but it probably should be mentioned if it's
> an intentional tradeoff.

Well, that is embarrassing :-(

You're quite right -- and I *SHOULD* have double checked my decade old
patches, but alas.

Re-reading the paper, your proposal is fairly close to what they have.
Let me go stare at your patch in more detail.


Subject: [tip: sched/urgent] sched/fair: Fix pick_eevdf()

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID: 561c58efd2394d76a32254d91e4b1de8ecdeb5c8
Gitweb: https://git.kernel.org/tip/561c58efd2394d76a32254d91e4b1de8ecdeb5c8
Author: Benjamin Segall <[email protected]>
AuthorDate: Fri, 29 Sep 2023 17:09:30 -07:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 03 Oct 2023 12:32:30 +02:00

sched/fair: Fix pick_eevdf()

The old pick_eevdf() could fail to find the actual earliest eligible
deadline when it descended to the right looking for min_deadline, but
it turned out that that min_deadline wasn't actually eligible. In that
case we need to go back and search through any left branches we
skipped looking for the actual best _eligible_ min_deadline.

This is more expensive, but still O(log n), and at worst should only
involve descending two branches of the rbtree.

I've run this through a userspace stress test (thank you
tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
corner cases.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Ben Segall <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 72 +++++++++++++++++++++++++++++++++++---------
1 file changed, 58 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef7490c..929d21d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,14 +872,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
*
* Which allows an EDF like search on (sub)trees.
*/
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *curr = cfs_rq->curr;
struct sched_entity *best = NULL;
+ struct sched_entity *best_left = NULL;

if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
+ best = curr;

/*
* Once selected, run a task until it either becomes non-eligible or
@@ -900,33 +902,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
}

/*
- * If this entity has an earlier deadline than the previous
- * best, take this one. If it also has the earliest deadline
- * of its subtree, we're done.
+ * Now we heap search eligible trees for the best (min_)deadline
*/
- if (!best || deadline_gt(deadline, best, se)) {
+ if (!best || deadline_gt(deadline, best, se))
best = se;
- if (best->deadline == best->min_deadline)
- break;
- }

/*
- * If the earlest deadline in this subtree is in the fully
- * eligible left half of our space, go there.
+ * Every se in a left branch is eligible, keep track of the
+ * branch with the best min_deadline
*/
+ if (node->rb_left) {
+ struct sched_entity *left = __node_2_se(node->rb_left);
+
+ if (!best_left || deadline_gt(min_deadline, best_left, left))
+ best_left = left;
+
+ /*
+ * min_deadline is in the left branch. rb_left and all
+ * descendants are eligible, so immediately switch to the second
+ * loop.
+ */
+ if (left->min_deadline == se->min_deadline)
+ break;
+ }
+
+ /* min_deadline is at this node, no need to look right */
+ if (se->deadline == se->min_deadline)
+ break;
+
+ /* else min_deadline is in the right branch. */
+ node = node->rb_right;
+ }
+
+ /*
+ * We ran into an eligible node which is itself the best.
+ * (Or nr_running == 0 and both are NULL)
+ */
+ if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
+ return best;
+
+ /*
+ * Now best_left and all of its children are eligible, and we are just
+ * looking for deadline == min_deadline
+ */
+ node = &best_left->run_node;
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /* min_deadline is the current node */
+ if (se->deadline == se->min_deadline)
+ return se;
+
+ /* min_deadline is in the left branch */
if (node->rb_left &&
__node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
node = node->rb_left;
continue;
}

+ /* else min_deadline is in the right branch */
node = node->rb_right;
}
+ return NULL;
+}

- if (!best || (curr && deadline_gt(deadline, best, curr)))
- best = curr;
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = __pick_eevdf(cfs_rq);

- if (unlikely(!best)) {
+ if (!se) {
struct sched_entity *left = __pick_first_entity(cfs_rq);
if (left) {
pr_err("EEVDF scheduling fail, picking leftmost\n");
@@ -934,7 +978,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
}
}

- return best;
+ return se;
}

#ifdef CONFIG_SCHED_DEBUG

2023-10-04 20:39:55

by Marek Szyprowski

Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Hi,

On 30.09.2023 02:09, Benjamin Segall wrote:
> The old pick_eevdf could fail to find the actual earliest eligible
> deadline when it descended to the right looking for min_deadline, but it
> turned out that that min_deadline wasn't actually eligible. In that case
> we need to go back and search through any left branches we skipped
> looking for the actual best _eligible_ min_deadline.
>
> This is more expensive, but still O(log n), and at worst should only
> involve descending two branches of the rbtree.
>
> I've run this through a userspace stress test (thank you
> tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
> corner cases.
>
> Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
> Signed-off-by: Ben Segall <[email protected]>

This patch landed in today's linux-next as commit 561c58efd239
("sched/fair: Fix pick_eevdf()"). Surprisingly it introduced a warning
about a circular locking dependency. It can be easily observed during boot
from time to time on a qemu/arm64 'virt' machine:

======================================================
WARNING: possible circular locking dependency detected
6.6.0-rc4+ #7222 Not tainted
------------------------------------------------------
systemd-udevd/1187 is trying to acquire lock:
ffffbcc2be0c4de0 (console_owner){..-.}-{0:0}, at:
console_flush_all+0x1b0/0x500

but task is already holding lock:
ffff5535ffdd2b18 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0xe0/0xc40

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #4 (&rq->__lock){-.-.}-{2:2}:
       _raw_spin_lock_nested+0x44/0x5c
       raw_spin_rq_lock_nested+0x24/0x40
       task_fork_fair+0x3c/0xac
       sched_cgroup_fork+0xe8/0x14c
       copy_process+0x11c4/0x1a14
       kernel_clone+0x8c/0x400
       user_mode_thread+0x70/0x98
       rest_init+0x28/0x190
       arch_post_acpi_subsys_init+0x0/0x8
       start_kernel+0x594/0x684
       __primary_switched+0xbc/0xc4

-> #3 (&p->pi_lock){-.-.}-{2:2}:
       _raw_spin_lock_irqsave+0x60/0x88
       try_to_wake_up+0x58/0x468
       default_wake_function+0x14/0x20
       woken_wake_function+0x20/0x2c
       __wake_up_common+0x94/0x170
       __wake_up_common_lock+0x7c/0xcc
       __wake_up+0x18/0x24
       tty_wakeup+0x34/0x70
       tty_port_default_wakeup+0x20/0x38
       tty_port_tty_wakeup+0x18/0x24
       uart_write_wakeup+0x18/0x28
       pl011_tx_chars+0x140/0x2b4
       pl011_start_tx+0xe8/0x190
       serial_port_runtime_resume+0x90/0xc0
       __rpm_callback+0x48/0x1a8
       rpm_callback+0x6c/0x78
       rpm_resume+0x438/0x6d8
       pm_runtime_work+0x84/0xc8
       process_one_work+0x1ec/0x53c
       worker_thread+0x298/0x408
       kthread+0x124/0x128
       ret_from_fork+0x10/0x20

-> #2 (&tty->write_wait){....}-{2:2}:
       _raw_spin_lock_irqsave+0x60/0x88
       __wake_up_common_lock+0x5c/0xcc
       __wake_up+0x18/0x24
       tty_wakeup+0x34/0x70
       tty_port_default_wakeup+0x20/0x38
       tty_port_tty_wakeup+0x18/0x24
       uart_write_wakeup+0x18/0x28
       pl011_tx_chars+0x140/0x2b4
       pl011_start_tx+0xe8/0x190
       serial_port_runtime_resume+0x90/0xc0
       __rpm_callback+0x48/0x1a8
       rpm_callback+0x6c/0x78
       rpm_resume+0x438/0x6d8
       pm_runtime_work+0x84/0xc8
       process_one_work+0x1ec/0x53c
       worker_thread+0x298/0x408
       kthread+0x124/0x128
       ret_from_fork+0x10/0x20

-> #1 (&port_lock_key){..-.}-{2:2}:
       _raw_spin_lock+0x48/0x60
       pl011_console_write+0x13c/0x1b0
       console_flush_all+0x20c/0x500
       console_unlock+0x6c/0x130
       vprintk_emit+0x228/0x3a0
       vprintk_default+0x38/0x44
       vprintk+0xa4/0xc0
       _printk+0x5c/0x84
       register_console+0x1f4/0x420
       serial_core_register_port+0x5a4/0x5d8
       serial_ctrl_register_port+0x10/0x1c
       uart_add_one_port+0x10/0x1c
       pl011_register_port+0x70/0x12c
       pl011_probe+0x1bc/0x1fc
       amba_probe+0x110/0x1c8
       really_probe+0x148/0x2b4
       __driver_probe_device+0x78/0x12c
       driver_probe_device+0xd8/0x160
       __device_attach_driver+0xb8/0x138
       bus_for_each_drv+0x84/0xe0
       __device_attach+0xa8/0x1b0
       device_initial_probe+0x14/0x20
       bus_probe_device+0xb0/0xb4
       device_add+0x574/0x738
       amba_device_add+0x40/0xac
       of_platform_bus_create+0x2b4/0x378
       of_platform_populate+0x50/0xfc
       of_platform_default_populate_init+0xd0/0xf0
       do_one_initcall+0x74/0x2f0
       kernel_init_freeable+0x28c/0x4dc
       kernel_init+0x24/0x1dc
       ret_from_fork+0x10/0x20

-> #0 (console_owner){..-.}-{0:0}:
       __lock_acquire+0x1318/0x20c4
       lock_acquire+0x1e8/0x318
       console_flush_all+0x1f8/0x500
       console_unlock+0x6c/0x130
       vprintk_emit+0x228/0x3a0
       vprintk_default+0x38/0x44
       vprintk+0xa4/0xc0
       _printk+0x5c/0x84
       pick_next_task_fair+0x28c/0x498
       __schedule+0x164/0xc40
       do_task_dead+0x54/0x58
       do_exit+0x61c/0x9e8
       do_group_exit+0x34/0x90
       __wake_up_parent+0x0/0x30
       invoke_syscall+0x48/0x114
       el0_svc_common.constprop.0+0x40/0xe0
       do_el0_svc_compat+0x1c/0x38
       el0_svc_compat+0x48/0xb4
       el0t_32_sync_handler+0x90/0x140
       el0t_32_sync+0x194/0x198

other info that might help us debug this:

Chain exists of:
  console_owner --> &p->pi_lock --> &rq->__lock

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&rq->__lock);
                               lock(&p->pi_lock);
                               lock(&rq->__lock);
  lock(console_owner);

 *** DEADLOCK ***

3 locks held by systemd-udevd/1187:
 #0: ffff5535ffdd2b18 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0xe0/0xc40
 #1: ffffbcc2be0c4c30 (console_lock){+.+.}-{0:0}, at:
vprintk_emit+0x11c/0x3a0
 #2: ffffbcc2be0c4c88 (console_srcu){....}-{0:0}, at:
console_flush_all+0x7c/0x500

stack backtrace:
CPU: 1 PID: 1187 Comm: systemd-udevd Not tainted 6.6.0-rc4+ #7222
Hardware name: linux,dummy-virt (DT)
Call trace:
 dump_backtrace+0x98/0xf0
 show_stack+0x18/0x24
 dump_stack_lvl+0x60/0xac
 dump_stack+0x18/0x24
 print_circular_bug+0x290/0x370
 check_noncircular+0x15c/0x170
 __lock_acquire+0x1318/0x20c4
 lock_acquire+0x1e8/0x318
 console_flush_all+0x1f8/0x500
 console_unlock+0x6c/0x130
 vprintk_emit+0x228/0x3a0
 vprintk_default+0x38/0x44
 vprintk+0xa4/0xc0
 _printk+0x5c/0x84
 pick_next_task_fair+0x28c/0x498
 __schedule+0x164/0xc40
 do_task_dead+0x54/0x58
 do_exit+0x61c/0x9e8
 do_group_exit+0x34/0x90
 __wake_up_parent+0x0/0x30
 invoke_syscall+0x48/0x114
 el0_svc_common.constprop.0+0x40/0xe0
 do_el0_svc_compat+0x1c/0x38
 el0_svc_compat+0x48/0xb4
 el0t_32_sync_handler+0x90/0x140
 el0t_32_sync+0x194/0x198

The problem is probably elsewhere, but this scheduler change only
revealed it in a fully reproducible way. Reverting $subject on top of
linux-next hides the problem deep enough that I was not able to
reproduce it. Let me know if there is anything I can do to help fix
this issue.

> ---
> kernel/sched/fair.c | 72 ++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 58 insertions(+), 14 deletions(-)
>
> ...

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2023-10-05 14:17:03

by Biju Das

[permalink] [raw]
Subject: RE: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Hi all,

> -----Original Message-----
> From: Biju Das
> Sent: Thursday, October 5, 2023 8:32 AM
> Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> se
>
> Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> se
> Date: Wed, 4 Oct 2023 22:39:39 +0200 [thread overview]
> Message-ID: <[email protected]> (raw)
> In-Reply-To: <[email protected]>
>
> Hi,
>
> On 30.09.2023 02:09, Benjamin Segall wrote:
> > The old pick_eevdf could fail to find the actual earliest eligible
> > deadline when it descended to the right looking for min_deadline, but
> > it turned out that that min_deadline wasn't actually eligible. In that
> > case we need to go back and search through any left branches we
> > skipped looking for the actual best _eligible_ min_deadline.
> >
> > This is more expensive, but still O(log n), and at worst should only
> > involve descending two branches of the rbtree.
> >
> > I've run this through a userspace stress test (thank you
> > tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
> > corner cases.
> >
> > Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling
> > policy")
> > Signed-off-by: Ben Segall <[email protected]>
>
> This patch causing issues [1] in Renesas RZ/G2L SMARC EVK platform.
> Reverting the patch fixes the warning messages
>
> [1]
> [ 25.550898] EEVDF scheduling fail, picking leftmost
>
> [ 15.109634] ======================================================
> [ 15.109636] WARNING: possible circular locking dependency detected
> [ 15.109641] 6.6.0-rc4-next-20231005-arm64-renesas-ga03f9ebbbb4c #1165
> Not tainted
> [ 15.109648] ------------------------------------------------------
> [ 15.109649] migration/0/16 is trying to acquire lock:
> [ 15.109654] ffff800081713460 (console_owner){..-.}-{0:0}, at:
> console_flush_all.constprop.0+0x1a0/0x438
> [ 15.109694]
> [ 15.109694] but task is already holding lock:
> [ 15.109697] ffff00007fbd2298 (&rq->__lock){-.-.}-{2:2}, at:
> __schedule+0xd0/0xbe0
> [ 15.109718]
> [ 15.109718] which lock already depends on the new lock.
> [ 15.109718]
> [ 15.109720]
> [ 15.109720] the existing dependency chain (in reverse order) is:
>
> 25.551560] __down_trylock_console_sem+0x34/0xb8
> [ 25.551567] console_trylock+0x24/0x74
> [ 25.551574] vprintk_emit+0x114/0x388
> [ 25.551581] vprintk_default+0x34/0x3c
> [ 25.551588] vprintk+0x9c/0xb4
> [ 25.551594] _printk+0x58/0x7c
> [ 25.551600] pick_next_task_fair+0x274/0x480
> [ 25.551608] __schedule+0x154/0xbe0
> [ 25.551616] schedule+0x48/0x110
> [ 25.551623] worker_thread+0x1b8/0x3f8
> [ 25.551630] kthread+0x114/0x118
> [ 25.551635] ret_from_fork+0x10/0x20
> [ OK ] Started System Logging Service.
> [ 26.099203] EEVDF scheduling fail, picking leftmost

Complete log can be found here

https://pastebin.com/zZkRFiSf

Cheers,
Biju

2023-10-05 15:52:26

by Biju Das

[permalink] [raw]
Subject: RE: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Hi Peter Zijlstra,

> Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> se
>
> On Thu, Oct 05, 2023 at 07:31:34AM +0000, Biju Das wrote:
>
> > [ 26.099203] EEVDF scheduling fail, picking leftmost
>
> This, that the problem.. the rest is just noise because printk stinks.
>
> Weirdly have not seen that trigger, and I've been running with this patch
> on for a few days now :/

I agree, the original issue is "EEVDF scheduling fail, picking leftmost",
which is triggering the noisy lock warning messages during boot.

Two platforms are affected, both ARM (Renesas and Samsung).
Maybe other platforms are affected too.

I am happy to test if there is a fix available for this issue.

Cheers,
Biju

2023-10-05 16:14:06

by Biju Das

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se
Date: Wed, 4 Oct 2023 22:39:39 +0200 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

Hi,

On 30.09.2023 02:09, Benjamin Segall wrote:
> The old pick_eevdf could fail to find the actual earliest eligible
> deadline when it descended to the right looking for min_deadline, but it
> turned out that that min_deadline wasn't actually eligible. In that case
> we need to go back and search through any left branches we skipped
> looking for the actual best _eligible_ min_deadline.
>
> This is more expensive, but still O(log n), and at worst should only
> involve descending two branches of the rbtree.
>
> I've run this through a userspace stress test (thank you
> tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
> corner cases.
>
> Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
> Signed-off-by: Ben Segall <[email protected]>

This patch is causing issues [1] on the Renesas RZ/G2L SMARC EVK platform. Reverting the patch fixes the warning messages.

[1]
[ 25.550898] EEVDF scheduling fail, picking leftmost

[ 15.109634] ======================================================
[ 15.109636] WARNING: possible circular locking dependency detected
[ 15.109641] 6.6.0-rc4-next-20231005-arm64-renesas-ga03f9ebbbb4c #1165 Not tainted
[ 15.109648] ------------------------------------------------------
[ 15.109649] migration/0/16 is trying to acquire lock:
[ 15.109654] ffff800081713460 (console_owner){..-.}-{0:0}, at: console_flush_all.constprop.0+0x1a0/0x438
[ 15.109694]
[ 15.109694] but task is already holding lock:
[ 15.109697] ffff00007fbd2298 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0xd0/0xbe0
[ 15.109718]
[ 15.109718] which lock already depends on the new lock.
[ 15.109718]
[ 15.109720]
[ 15.109720] the existing dependency chain (in reverse order) is:

[   25.551560] __down_trylock_console_sem+0x34/0xb8
[ 25.551567] console_trylock+0x24/0x74
[ 25.551574] vprintk_emit+0x114/0x388
[ 25.551581] vprintk_default+0x34/0x3c
[ 25.551588] vprintk+0x9c/0xb4
[ 25.551594] _printk+0x58/0x7c
[ 25.551600] pick_next_task_fair+0x274/0x480
[ 25.551608] __schedule+0x154/0xbe0
[ 25.551616] schedule+0x48/0x110
[ 25.551623] worker_thread+0x1b8/0x3f8
[ 25.551630] kthread+0x114/0x118
[ 25.551635] ret_from_fork+0x10/0x20
[ OK ] Started System Logging Service.
[ 26.099203] EEVDF scheduling fail, picking leftmost

Cheers,
Biju

2023-10-06 08:36:50

by Marek Szyprowski

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On 05.10.2023 17:08, Biju Das wrote:
> Hi Peter Zijlstra,
>
>> Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
>> se
>>
>> On Thu, Oct 05, 2023 at 07:31:34AM +0000, Biju Das wrote:
>>
>>> [ 26.099203] EEVDF scheduling fail, picking leftmost
>> This, that the problem.. the rest is just noise because printk stinks.
>>
>> Weirdly have not seen that trigger, and I've been running with this patch
>> on for a few days now :/
> I agree Original issue is "EEVDF scheduling fail, picking leftmost"
> Which is triggering noisy lock warning messages during boot.
>
> 2 platforms are affected both ARM platforms(Renesas and Samsung)
> Maybe other platforms affected too.


Just to note, I've run into this issue on QEMU's 'arm64/virt'
platform, not on Samsung-specific hardware.


You can easily reproduce it with the following steps:

# make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- defconfig

# ./scripts/config -e PROVE_LOCKING -e DEBUG_ATOMIC_SLEEP -e PM_DEBUG -e
PM_ADVANCED_DEBUG

# make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig Image.gz

# wget
https://cloud.debian.org/images/cloud/buster/20230802-1460/debian-10-nocloud-arm64-20230802-1460.tar.xz

# tar xJfv debian-10-nocloud-arm64-20230802-1460.tar.xz


Then run QEmu a few times until you see the lockdep splat:

# qemu-system-aarch64 -serial stdio -kernel arch/arm64/boot/Image
-append "console=ttyAMA0 root=/dev/vda1 rootwait" -M virt -cpu
cortex-a57 -smp 2 -m 1024 -netdev user,id=user -device
virtio-net-device,netdev=user -display none -device
virtio-blk-device,drive=virtio-blk0 -drive
file=disk.raw,id=virtio-blk0,if=none,format=raw


I have no idea whether this is ARM64-specific at all, but this is how I
reproduced it. I hope this guide helps with fixing the bug!

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2023-10-06 09:38:27

by Biju Das

[permalink] [raw]
Subject: RE: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

> Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> se
>
> On 05.10.2023 17:08, Biju Das wrote:
> > Hi Peter Zijlstra,
> >
> >> Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the
> >> correct se
> >>
> >> On Thu, Oct 05, 2023 at 07:31:34AM +0000, Biju Das wrote:
> >>
> >>> [ 26.099203] EEVDF scheduling fail, picking leftmost
> >> This, that the problem.. the rest is just noise because printk stinks.
> >>
> >> Weirdly have not seen that trigger, and I've been running with this
> >> patch on for a few days now :/
> > I agree Original issue is "EEVDF scheduling fail, picking leftmost"
> > Which is triggering noisy lock warning messages during boot.
> >
> > 2 platforms are affected both ARM platforms(Renesas and Samsung) Maybe
> > other platforms affected too.
>
>
> Just to note, I've run into this issue on the QEmu's 'arm64/virt'
> platform, not on the Samsung specific hardware.

OK, if needed I am happy to test on the real hardware once a fix is found for this issue. It is 100% reproducible on the Renesas RZ/G2L SMARC EVK platform.

Cheers,
Biju

2023-10-06 10:33:09

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On Fri, 2023-10-06 at 10:35 +0200, Marek Szyprowski wrote:
>
>
> Just to note, I've run into this issue on the QEmu's 'arm64/virt'
> platform, not on the Samsung specific hardware. 

It doesn't appear to be arch specific; all I have to do is enable
autogroup on a raspberry or x86_64 desktop box, and the occasional
failure trips over task groups, leaving both best and best_left with
an identical ->min_deadline that is not the one we're looking for.

-Mike

2023-10-06 14:40:19

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On Fri, 2023-10-06 at 16:00 +0200, Peter Zijlstra wrote:
> On Fri, Oct 06, 2023 at 12:31:28PM +0200, Mike Galbraith wrote:
> > On Fri, 2023-10-06 at 10:35 +0200, Marek Szyprowski wrote:
> > >
> > >
> > > Just to note, I've run into this issue on the QEmu's 'arm64/virt'
> > > platform, not on the Samsung specific hardware. 
> >
> > It doesn't appear to be arch specific, all I have to do is enable
> > autogroup, raspberry or x86_64 desktop box, the occasional failure
> > tripping over task groups, leaving both best and best_left with
> > identical but not what we're looking for ->min_deadline.
>
> OK, autogroups enabled and booted, /debug/sched/debug shows autogroups
> are indeed in existence.
>
> I've ran hackbench and a kernel build, but no whoopsie yet :-(
>
> I suppose I'll kick some benchmarks and go make a cup 'o tea or
> something.

Hm, just booting gets me a handful, and generic desktop activity
produces a fairly regular supply. This is virgin 6.6.0.ga9e6eb3-tip.

[ 340.473123] EEVDF scheduling fail, picking leftmost
[ 340.473132] EEVDF scheduling fail, picking leftmost
[ 343.656549] EEVDF scheduling fail, picking leftmost
[ 343.656557] EEVDF scheduling fail, picking leftmost
[ 343.690463] EEVDF scheduling fail, picking leftmost
[ 343.690472] EEVDF scheduling fail, picking leftmost
[ 426.224039] EEVDF scheduling fail, picking leftmost
[ 426.224080] EEVDF scheduling fail, picking leftmost
[ 426.224084] EEVDF scheduling fail, picking leftmost
[ 426.363323] EEVDF scheduling fail, picking leftmost
[ 426.759244] EEVDF scheduling fail, picking leftmost
[ 426.759256] EEVDF scheduling fail, picking leftmost
[ 441.872524] EEVDF scheduling fail, picking leftmost
[ 441.872556] EEVDF scheduling fail, picking leftmost

2023-10-06 16:55:57

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On Fri, 2023-10-06 at 17:55 +0200, Peter Zijlstra wrote:


> If I create two groups (or two users with autogroup on) and have them
> both build a kernel I do indeed get splat.

Ah, my desktop setup is two wide konsole instances with 14 tabs each
for old testament /me (dang dinky universe) and internet stuff running
as a mere mortal, so lots of groups.

-Mike

2023-10-06 20:15:49

by Marek Szyprowski

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On 06.10.2023 21:24, Peter Zijlstra wrote:
> On Fri, Oct 06, 2023 at 05:55:01PM +0200, Peter Zijlstra wrote:
>> And yeah, min_deadline is hosed somehow.
>>
>> migration/28-185 [028] d..2. 70.264274: validate_cfs_rq: --- /
>> migration/28-185 [028] d..2. 70.264277: __print_se: ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -66302085 E (11372/tr)
>> migration/28-185 [028] d..2. 70.264280: __print_se: ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -66302085 E (-1//autogroup-31)
>> migration/28-185 [028] d..2. 70.264282: __print_se: ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N (-1//autogroup-33)
>> migration/28-185 [028] d..2. 70.264283: validate_cfs_rq: min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
>>
>> I need to go make dinner (kids hungry), but I'll see if I can figure out
>> how this happens...
> *sigh*, does the below help?

It looks like this fixed the issue. At least I was no longer able to
reproduce it.

Feel free to add:

Reported-by: Marek Szyprowski <[email protected]>

Tested-by: Marek Szyprowski <[email protected]>

> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 04fbcbda97d5..6a670f119efa 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3632,6 +3747,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
> */
> deadline = div_s64(deadline * old_weight, weight);
> se->deadline = se->vruntime + deadline;
> + min_deadline_cb_propagate(&se->run_node, NULL);
> }
>
> #ifdef CONFIG_SMP
>
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2023-10-06 23:29:33

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On Fri, 2023-10-06 at 21:24 +0200, Peter Zijlstra wrote:
>
> *sigh*, does the below help?

<invisible ink goggles> ah, so there you... weren't.

Yup, all better.

> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 04fbcbda97d5..6a670f119efa 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3632,6 +3747,7 @@ static void reweight_entity(struct cfs_rq
> *cfs_rq, struct sched_entity *se,
>                  */
>                 deadline = div_s64(deadline * old_weight, weight);
>                 se->deadline = se->vruntime + deadline;
> +               min_deadline_cb_propagate(&se->run_node, NULL);
>         }
>  
>  #ifdef CONFIG_SMP

2023-10-07 00:38:10

by Chen Yu

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On 2023-10-06 at 21:24:45 +0200, Peter Zijlstra wrote:
> On Fri, Oct 06, 2023 at 05:55:01PM +0200, Peter Zijlstra wrote:
>
> > And yeah, min_deadline is hosed somehow.
> >
> > migration/28-185 [028] d..2. 70.264274: validate_cfs_rq: --- /
> > migration/28-185 [028] d..2. 70.264277: __print_se: ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -66302085 E (11372/tr)
> > migration/28-185 [028] d..2. 70.264280: __print_se: ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -66302085 E (-1//autogroup-31)
> > migration/28-185 [028] d..2. 70.264282: __print_se: ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N (-1//autogroup-33)
> > migration/28-185 [028] d..2. 70.264283: validate_cfs_rq: min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
> >
> > I need to go make dinner (kids hungry), but I'll see if I can figure out
> > how this happens...
>
> *sigh*, does the below help?
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 04fbcbda97d5..6a670f119efa 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3632,6 +3747,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
> */
> deadline = div_s64(deadline * old_weight, weight);
> se->deadline = se->vruntime + deadline;
> + min_deadline_cb_propagate(&se->run_node, NULL);
> }
>
> #ifdef CONFIG_SMP

Is it because, without this patch, se->deadline is not always in sync with se->min_deadline,
so in pick_eevdf() the following condition could not be met, and thus we get a NULL candidate
and have to pick the leftmost one?
if (se->deadline == se->min_deadline)

Regarding the circular locking warning triggered by printk, does it mean we should not get a
NULL candidate from __pick_eevdf() in theory? And besides, should we not use printk with the
rq lock held?
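
For context: min_deadline is the augmented rbtree value from the EEVDF
patch, defined as min(se->deadline, se->{left,right}->min_deadline).
reweight_entity() rewrites se->deadline, so without the added
min_deadline_cb_propagate() call the cached min_deadline values along the
path to the root go stale and the '==' checks in the search stop matching.
A toy, self-contained restatement of the per-node invariant (not the
kernel's actual callback) could look like this:

/* Toy model, not kernel code: one node of the deadline-augmented tree. */
struct toy_se {
	unsigned long long deadline;
	unsigned long long min_deadline;
	struct toy_se *left, *right;
};

static unsigned long long subtree_min(struct toy_se *se)
{
	return se ? se->min_deadline : ~0ULL;
}

/*
 * Invariant per node: min_deadline = min(own deadline, children's minima).
 * When the deadline of a node changes, this value and every ancestor's
 * value must be refreshed, which is what the propagate call does for
 * the real tree.
 */
static void toy_update_min_deadline(struct toy_se *se)
{
	unsigned long long m = se->deadline;

	if (subtree_min(se->left) < m)
		m = subtree_min(se->left);
	if (subtree_min(se->right) < m)
		m = subtree_min(se->right);
	se->min_deadline = m;
}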

thanks,
Chenyu

2023-10-07 06:25:23

by Biju Das

[permalink] [raw]
Subject: RE: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Hi Peter Zijlstra,


> Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> se
>
> On Fri, Oct 06, 2023 at 05:55:01PM +0200, Peter Zijlstra wrote:
>
> > And yeah, min_deadline is hosed somehow.
> >
> > migration/28-185 [028] d..2. 70.264274: validate_cfs_rq: --- /
> > migration/28-185 [028] d..2. 70.264277: __print_se:
> ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -
> 66302085 E (11372/tr)
> > migration/28-185 [028] d..2. 70.264280: __print_se:
> ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -
> 66302085 E (-1//autogroup-31)
> > migration/28-185 [028] d..2. 70.264282: __print_se:
> ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N
> (-1//autogroup-33)
> > migration/28-185 [028] d..2. 70.264283: validate_cfs_rq:
> min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
> >
> > I need to go make dinner (kids hungry), but I'll see if I can figure
> > out how this happens...
>
> *sigh*, does the below help?

I confirm the message "EEVDF scheduling fail, picking leftmost" is not appearing with this patch.

Cheers,
Biju
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index
> 04fbcbda97d5..6a670f119efa 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3632,6 +3747,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq,
> struct sched_entity *se,
> */
> deadline = div_s64(deadline * old_weight, weight);
> se->deadline = se->vruntime + deadline;
> + min_deadline_cb_propagate(&se->run_node, NULL);
> }
>
> #ifdef CONFIG_SMP

2023-10-07 06:27:59

by Biju Das

[permalink] [raw]
Subject: RE: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Hi Chen Yu,

> Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> se
>
> On 2023-10-06 at 21:24:45 +0200, Peter Zijlstra wrote:
> > On Fri, Oct 06, 2023 at 05:55:01PM +0200, Peter Zijlstra wrote:
> >
> > > And yeah, min_deadline is hosed somehow.
> > >
> > > migration/28-185 [028] d..2. 70.264274: validate_cfs_rq: --- /
> > > migration/28-185 [028] d..2. 70.264277: __print_se:
> ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -
> 66302085 E (11372/tr)
> > > migration/28-185 [028] d..2. 70.264280: __print_se:
> ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -
> 66302085 E (-1//autogroup-31)
> > > migration/28-185 [028] d..2. 70.264282: __print_se:
> ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N
> (-1//autogroup-33)
> > > migration/28-185 [028] d..2. 70.264283: validate_cfs_rq:
> min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
> > >
> > > I need to go make dinner (kids hungry), but I'll see if I can figure
> > > out how this happens...
> >
> > *sigh*, does the below help?
> >
> > ---
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index
> > 04fbcbda97d5..6a670f119efa 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -3632,6 +3747,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq,
> struct sched_entity *se,
> > */
> > deadline = div_s64(deadline * old_weight, weight);
> > se->deadline = se->vruntime + deadline;
> > + min_deadline_cb_propagate(&se->run_node, NULL);
> > }
> >
> > #ifdef CONFIG_SMP
>
> Is it because without this patch, the se->deadline is not always synced
> with se->min_deadline, then in pick_eevdf() the following condition could
> not be met, thus we get a NULL candidate and has to pick the leftmost one?
> if (se->deadline == se->min_deadline)
>
> Regarding the circular locking warning triggered by printk, does it mean we
> should not get a NULL candidate from __pick_eevdf() in theory? And besides,
> we should not use printk with rq lock hold?

Is it not a useful error log? At least in the initial report Marek Szyprowski doesn't see "EEVDF scheduling fail, picking leftmost" but only the warning triggered by it in the logs.

Cheers,
Biju

2023-10-07 06:43:11

by Chen Yu

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Hi Biju,

On 2023-10-07 at 06:26:05 +0000, Biju Das wrote:
> Hi Chen Yu,
>
> > Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> > se
> >
> > On 2023-10-06 at 21:24:45 +0200, Peter Zijlstra wrote:
> > > On Fri, Oct 06, 2023 at 05:55:01PM +0200, Peter Zijlstra wrote:
> > >
> > > > And yeah, min_deadline is hosed somehow.
> > > >
> > > > migration/28-185 [028] d..2. 70.264274: validate_cfs_rq: --- /
> > > > migration/28-185 [028] d..2. 70.264277: __print_se:
> > ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -
> > 66302085 E (11372/tr)
> > > > migration/28-185 [028] d..2. 70.264280: __print_se:
> > ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -
> > 66302085 E (-1//autogroup-31)
> > > > migration/28-185 [028] d..2. 70.264282: __print_se:
> > ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N
> > (-1//autogroup-33)
> > > > migration/28-185 [028] d..2. 70.264283: validate_cfs_rq:
> > min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
> > > >
> > > > I need to go make dinner (kids hungry), but I'll see if I can figure
> > > > out how this happens...
> > >
> > > *sigh*, does the below help?
> > >
> > > ---
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index
> > > 04fbcbda97d5..6a670f119efa 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -3632,6 +3747,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq,
> > struct sched_entity *se,
> > > */
> > > deadline = div_s64(deadline * old_weight, weight);
> > > se->deadline = se->vruntime + deadline;
> > > + min_deadline_cb_propagate(&se->run_node, NULL);
> > > }
> > >
> > > #ifdef CONFIG_SMP
> >
> > Is it because without this patch, the se->deadline is not always synced
> > with se->min_deadline, then in pick_eevdf() the following condition could
> > not be met, thus we get a NULL candidate and has to pick the leftmost one?
> > if (se->deadline == se->min_deadline)
> >
> > Regarding the circular locking warning triggered by printk, does it mean we
> > should not get a NULL candidate from __pick_eevdf() in theory? And besides,
> > we should not use printk with rq lock hold?
>
> Is it not a useful error log? At least from the initial report Marek Szyprowski doesn't see "EEVDF scheduling fail, picking leftmost" but seen only warning triggered by this in the logs.
>

Yes, it is a useful log. I was trying to figure out the scenarios in which
it is safe to use printk.

thanks,
Chenyu

Subject: [tip: sched/urgent] sched/eevdf: Fix pick_eevdf()

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID: b01db23d5923a35023540edc4f0c5f019e11ac7d
Gitweb: https://git.kernel.org/tip/b01db23d5923a35023540edc4f0c5f019e11ac7d
Author: Benjamin Segall <[email protected]>
AuthorDate: Fri, 29 Sep 2023 17:09:30 -07:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Mon, 09 Oct 2023 09:48:33 +02:00

sched/eevdf: Fix pick_eevdf()

The old pick_eevdf() could fail to find the actual earliest eligible
deadline when it descended to the right looking for min_deadline, but
it turned out that that min_deadline wasn't actually eligible. In that
case we need to go back and search through any left branches we
skipped looking for the actual best _eligible_ min_deadline.

This is more expensive, but still O(log n), and at worst should only
involve descending two branches of the rbtree.

I've run this through a userspace stress test (thank you
tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
corner cases.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Ben Segall <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 72 +++++++++++++++++++++++++++++++++++---------
1 file changed, 58 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a4b904a..061a30a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,14 +872,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
*
* Which allows an EDF like search on (sub)trees.
*/
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *curr = cfs_rq->curr;
struct sched_entity *best = NULL;
+ struct sched_entity *best_left = NULL;

if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
+ best = curr;

/*
* Once selected, run a task until it either becomes non-eligible or
@@ -900,33 +902,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
}

/*
- * If this entity has an earlier deadline than the previous
- * best, take this one. If it also has the earliest deadline
- * of its subtree, we're done.
+ * Now we heap search eligible trees for the best (min_)deadline
*/
- if (!best || deadline_gt(deadline, best, se)) {
+ if (!best || deadline_gt(deadline, best, se))
best = se;
- if (best->deadline == best->min_deadline)
- break;
- }

/*
- * If the earlest deadline in this subtree is in the fully
- * eligible left half of our space, go there.
+ * Every se in a left branch is eligible, keep track of the
+ * branch with the best min_deadline
*/
+ if (node->rb_left) {
+ struct sched_entity *left = __node_2_se(node->rb_left);
+
+ if (!best_left || deadline_gt(min_deadline, best_left, left))
+ best_left = left;
+
+ /*
+ * min_deadline is in the left branch. rb_left and all
+ * descendants are eligible, so immediately switch to the second
+ * loop.
+ */
+ if (left->min_deadline == se->min_deadline)
+ break;
+ }
+
+ /* min_deadline is at this node, no need to look right */
+ if (se->deadline == se->min_deadline)
+ break;
+
+ /* else min_deadline is in the right branch. */
+ node = node->rb_right;
+ }
+
+ /*
+ * We ran into an eligible node which is itself the best.
+ * (Or nr_running == 0 and both are NULL)
+ */
+ if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
+ return best;
+
+ /*
+ * Now best_left and all of its children are eligible, and we are just
+ * looking for deadline == min_deadline
+ */
+ node = &best_left->run_node;
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /* min_deadline is the current node */
+ if (se->deadline == se->min_deadline)
+ return se;
+
+ /* min_deadline is in the left branch */
if (node->rb_left &&
__node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
node = node->rb_left;
continue;
}

+ /* else min_deadline is in the right branch */
node = node->rb_right;
}
+ return NULL;
+}

- if (!best || (curr && deadline_gt(deadline, best, curr)))
- best = curr;
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = __pick_eevdf(cfs_rq);

- if (unlikely(!best)) {
+ if (!se) {
struct sched_entity *left = __pick_first_entity(cfs_rq);
if (left) {
pr_err("EEVDF scheduling fail, picking leftmost\n");
@@ -934,7 +978,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
}
}

- return best;
+ return se;
}

#ifdef CONFIG_SCHED_DEBUG

2023-10-09 14:28:43

by Phil Auld

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On Sat, Oct 07, 2023 at 02:42:10PM +0800 Chen Yu wrote:
> Hi Biju,
>
> On 2023-10-07 at 06:26:05 +0000, Biju Das wrote:
> > Hi Chen Yu,
> >
> > > Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> > > se
> > >
> > > On 2023-10-06 at 21:24:45 +0200, Peter Zijlstra wrote:
> > > > On Fri, Oct 06, 2023 at 05:55:01PM +0200, Peter Zijlstra wrote:
> > > >
> > > > > And yeah, min_deadline is hosed somehow.
> > > > >
> > > > > migration/28-185 [028] d..2. 70.264274: validate_cfs_rq: --- /
> > > > > migration/28-185 [028] d..2. 70.264277: __print_se:
> > > ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -
> > > 66302085 E (11372/tr)
> > > > > migration/28-185 [028] d..2. 70.264280: __print_se:
> > > ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -
> > > 66302085 E (-1//autogroup-31)
> > > > > migration/28-185 [028] d..2. 70.264282: __print_se:
> > > ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N
> > > (-1//autogroup-33)
> > > > > migration/28-185 [028] d..2. 70.264283: validate_cfs_rq:
> > > min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
> > > > >
> > > > > I need to go make dinner (kids hungry), but I'll see if I can figure
> > > > > out how this happens...
> > > >
> > > > *sigh*, does the below help?
> > > >
> > > > ---
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index
> > > > 04fbcbda97d5..6a670f119efa 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -3632,6 +3747,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq,
> > > struct sched_entity *se,
> > > > */
> > > > deadline = div_s64(deadline * old_weight, weight);
> > > > se->deadline = se->vruntime + deadline;
> > > > + min_deadline_cb_propagate(&se->run_node, NULL);
> > > > }
> > > >
> > > > #ifdef CONFIG_SMP
> > >
> > > Is it because without this patch, the se->deadline is not always synced
> > > with se->min_deadline, then in pick_eevdf() the following condition could
> > > not be met, thus we get a NULL candidate and has to pick the leftmost one?
> > > if (se->deadline == se->min_deadline)
> > >
> > > Regarding the circular locking warning triggered by printk, does it mean we
> > > should not get a NULL candidate from __pick_eevdf() in theory? And besides,
> > > we should not use printk with rq lock hold?
> >
> > Is it not a useful error log? At least from the initial report Marek Szyprowski doesn't see "EEVDF scheduling fail, picking leftmost" but seen only warning triggered by this in the logs.
> >
>
> Yes, it is a useful log. I was trying to figure out the safe scenario to use
> printk.
>

You should not use printk with a rq lock held as some console implementations
(framebuffer etc) call schedule_work() which loops back into the scheduler
and bad things happen.

Some places in the scheduler use printk_deferred() when needed.

I think this will be better when the RT printk stuff is fully merged.
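
A minimal sketch of that pattern, assuming the fallback message itself is
kept: the deferred variant queues the text instead of calling into the
console drivers while rq->__lock is held.

	/* Sketch only: emit the fallback message without re-entering
	 * the console path under rq->__lock. */
	printk_deferred(KERN_ERR "EEVDF scheduling fail, picking leftmost\n");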

Cheers,
Phil

> thanks,
> Chenyu
>

--

2023-10-10 02:09:08

by Chen Yu

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Hi Phil,

On 2023-10-09 at 10:27:40 -0400, Phil Auld wrote:
> On Sat, Oct 07, 2023 at 02:42:10PM +0800 Chen Yu wrote:
> > Hi Biju,
> >
> > On 2023-10-07 at 06:26:05 +0000, Biju Das wrote:
> > > Hi Chen Yu,
> > >
> > > > Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct
> > > > se
> > > >
> > > > On 2023-10-06 at 21:24:45 +0200, Peter Zijlstra wrote:
> > > > > On Fri, Oct 06, 2023 at 05:55:01PM +0200, Peter Zijlstra wrote:
> > > > >
> > > > > > And yeah, min_deadline is hosed somehow.
> > > > > >
> > > > > > migration/28-185 [028] d..2. 70.264274: validate_cfs_rq: --- /
> > > > > > migration/28-185 [028] d..2. 70.264277: __print_se:
> > > > ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -
> > > > 66302085 E (11372/tr)
> > > > > > migration/28-185 [028] d..2. 70.264280: __print_se:
> > > > ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -
> > > > 66302085 E (-1//autogroup-31)
> > > > > > migration/28-185 [028] d..2. 70.264282: __print_se:
> > > > ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N
> > > > (-1//autogroup-33)
> > > > > > migration/28-185 [028] d..2. 70.264283: validate_cfs_rq:
> > > > min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
> > > > > >
> > > > > > I need to go make dinner (kids hungry), but I'll see if I can figure
> > > > > > out how this happens...
> > > > >
> > > > > *sigh*, does the below help?
> > > > >
> > > > > ---
> > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index
> > > > > 04fbcbda97d5..6a670f119efa 100644
> > > > > --- a/kernel/sched/fair.c
> > > > > +++ b/kernel/sched/fair.c
> > > > > @@ -3632,6 +3747,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq,
> > > > struct sched_entity *se,
> > > > > */
> > > > > deadline = div_s64(deadline * old_weight, weight);
> > > > > se->deadline = se->vruntime + deadline;
> > > > > + min_deadline_cb_propagate(&se->run_node, NULL);
> > > > > }
> > > > >
> > > > > #ifdef CONFIG_SMP
> > > >
> > > > Is it because without this patch, the se->deadline is not always synced
> > > > with se->min_deadline, then in pick_eevdf() the following condition could
> > > > not be met, thus we get a NULL candidate and has to pick the leftmost one?
> > > > if (se->deadline == se->min_deadline)
> > > >
> > > > Regarding the circular locking warning triggered by printk, does it mean we
> > > > should not get a NULL candidate from __pick_eevdf() in theory? And besides,
> > > > we should not use printk with rq lock hold?
> > >
> > > Is it not a useful error log? At least from the initial report Marek Szyprowski doesn't see "EEVDF scheduling fail, picking leftmost" but seen only warning triggered by this in the logs.
> > >
> >
> > Yes, it is a useful log. I was trying to figure out the safe scenario to use
> > printk.
> >
>
> You should not use printk with a rq lock held as some console implementations
> (framebuffer etc) call schedule_work() which loops back into the scheduler
> and bad things happen.
>
> Some places in the scheduler use printk_deferred() when needed.
>
> I think this will be better when the RT printk stuff is fully merged.
>

I see, got it! Thanks for the education.

thanks,
Chenyu

2023-10-11 04:15:50

by Abel Wu

[permalink] [raw]
Subject: Re: Re: [PATCH 05/15] sched/fair: Implement an EEVDF like policy

在 9/30/23 5:40 AM, Benjamin Segall Wrote:
> Peter Zijlstra <[email protected]> writes:
>
>> +
>> +/*
>> + * Earliest Eligible Virtual Deadline First
>> + *
>> + * In order to provide latency guarantees for different request sizes
>> + * EEVDF selects the best runnable task from two criteria:
>> + *
>> + * 1) the task must be eligible (must be owed service)
>> + *
>> + * 2) from those tasks that meet 1), we select the one
>> + * with the earliest virtual deadline.
>> + *
>> + * We can do this in O(log n) time due to an augmented RB-tree. The
>> + * tree keeps the entries sorted on service, but also functions as a
>> + * heap based on the deadline by keeping:
>> + *
>> + * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
>> + *
>> + * Which allows an EDF like search on (sub)trees.
>> + */
>> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>> +{
>> + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
>> + struct sched_entity *curr = cfs_rq->curr;
>> + struct sched_entity *best = NULL;
>> +
>> + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
>> + curr = NULL;
>> +
>> + while (node) {
>> + struct sched_entity *se = __node_2_se(node);
>> +
>> + /*
>> + * If this entity is not eligible, try the left subtree.
>> + */
>> + if (!entity_eligible(cfs_rq, se)) {
>> + node = node->rb_left;
>> + continue;
>> + }
>> +
>> + /*
>> + * If this entity has an earlier deadline than the previous
>> + * best, take this one. If it also has the earliest deadline
>> + * of its subtree, we're done.
>> + */
>> + if (!best || deadline_gt(deadline, best, se)) {
>> + best = se;
>> + if (best->deadline == best->min_deadline)
>> + break;
>> + }
>> +
>> + /*
>> + * If the earlest deadline in this subtree is in the fully
>> + * eligible left half of our space, go there.
>> + */
>> + if (node->rb_left &&
>> + __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
>> + node = node->rb_left;
>> + continue;
>> + }
>> +
>> + node = node->rb_right;
>> + }
>
> I believe that this can fail to actually find the earliest eligible
> deadline, because the earliest deadline (min_deadline) can be in the
> right branch, but that se isn't eligible, and the actual target se is in
> the left branch. A trivial 3-se example with the nodes represented by
> (vruntime, deadline, min_deadline):
>
> (5,9,7)
> / \
> (4,8,8) (6,7,7)
>
> AIUI, here the EEVDF pick should be (4,8,8), but pick_eevdf() will
> instead pick (5,9,7), because it goes into the right branch and then
> fails eligibility.
>
> I'm not sure how much of a problem this is in practice, either in
> frequency or severity, but it probably should be mentioned if it's
> an intentional tradeoff.

Assume entity i satisfies (d_i == min_deadline) && (v_i > V); then there
must be an eligible entity j with (d_j >= d_i) && (v_j < V). Given
how the deadline is calculated, it can be inferred that:

vslice_i < vslice_j
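
(Spelling the step out: with vd = ve + vslice, vslice_i = d_i - v_i and
vslice_j = d_j - v_j; since d_i <= d_j and v_i > V > v_j, it follows
that vslice_i = d_i - v_i < d_j - v_j = vslice_j.)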

IOW a more batch-like entity with a looser deadline will beat entities
that are more interactive-like even with a tighter deadline, only because
the former is eligible while the latter isn't.

With Benjamin's fix, the semantics of 'Earliest Eligible' is preserved.
But since all this is about latency rather than fairness, I wonder if
there are cases worthy of breaking the 'eligible' rule.

Thanks & Best,
Abel

>
>
>
> Thinking out loud, I think that it would be sufficient to recheck via something like
>
> for_each_sched_entity(best) {
> check __node_2_se(best->rb_left)->min_deadline, store in actual_best
> }
>
> for the best min_deadline, and then go do a heap lookup in actual_best
> to find the se matching that min_deadline.
>
> I think this pass could then be combined with our initial descent for
> better cache behavior by keeping track of the best rb_left->min_deadline
> each time we take a right branch. We still have to look at up to ~2x the
> nodes, but I don't think that's avoidable? I'll expand my quick hack I
> used to test my simple case into a something of a stress tester and try
> some implementations.
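
For anyone reproducing the failure mode, the three-node example quoted
above can be checked with a small self-contained userspace sketch of the
pre-fix descent; the struct, the hard-coded average V = 5, and old_pick()
below are simplified illustrations, not the kernel code.

#include <stdio.h>

struct node {
	int vruntime, deadline, min_deadline;
	struct node *left, *right;
};

/* Equal weights, so the average vruntime V = (5 + 4 + 6) / 3 = 5. */
static int eligible(const struct node *n)
{
	return n->vruntime <= 5;
}

/* The pre-fix descent: go left when the node is not eligible, otherwise
 * follow min_deadline, preferring the left subtree only when it holds
 * the subtree minimum. */
static struct node *old_pick(struct node *n)
{
	struct node *best = NULL;

	while (n) {
		if (!eligible(n)) {
			n = n->left;
			continue;
		}
		if (!best || n->deadline < best->deadline) {
			best = n;
			if (best->deadline == best->min_deadline)
				break;
		}
		if (n->left && n->left->min_deadline == n->min_deadline) {
			n = n->left;
			continue;
		}
		n = n->right;
	}
	return best;
}

int main(void)
{
	struct node l = { 4, 8, 8, NULL, NULL };
	struct node r = { 6, 7, 7, NULL, NULL };
	struct node root = { 5, 9, 7, &l, &r };
	struct node *p = old_pick(&root);

	/* Prints "picked v=5 d=9": the eligible (4,8,8) entity is missed,
	 * which is exactly the case the fix addresses. */
	printf("picked v=%d d=%d\n", p->vruntime, p->deadline);
	return 0;
}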

2023-10-11 07:34:22

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Re: [PATCH 05/15] sched/fair: Implement an EEVDF like policy

On Wed, Oct 11, 2023 at 12:14:30PM +0800, Abel Wu wrote:

> With Benjamin's fix, the semantics of 'Earliest Eligible' preserved.

Yeah, my bad.

> But since all this is about latency rather than fairness, I wonder if

It is about both, fairness is absolutely important.

> there are cases worthy of breaking the 'eligible' rule.

See the discussion with Youssef, if we weaken the eligible rule you get
horrific interference because you end up placing new tasks around the
0-lag point.

2023-10-11 11:50:55

by Abel Wu

[permalink] [raw]
Subject: Re: Re: [PATCH 05/15] sched/fair: Implement an EEVDF like policy

On 10/11/23 3:33 PM, Peter Zijlstra Wrote:
> On Wed, Oct 11, 2023 at 12:14:30PM +0800, Abel Wu wrote:
>
>> there are cases worthy of breaking the 'eligible' rule.
>
> See the discussion with Youssef, if we weaken the eligible rule you get
> horrific interference because you end up placing new tasks around the
> 0-lag point.

I have just begun studying the EEVDF scheduler, and obviously there
are lots of things to catch up with :)

At a quick glance at Youssef's first reply, I'm sure that's exactly
what I had in mind, the EVDF. The intention behind it is that w/o
eligibility the task_timeline can be organized by deadline rather
than vruntime, hence task selection can be done in O(1) while
min_vruntime can be updated through the augmented rbtree.

Anyway, I will learn from your discussion with Youssef first, thanks
for providing the info!

Best,
Abel

2023-10-11 12:15:52

by Abel Wu

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On 9/30/23 8:09 AM, Benjamin Segall Wrote:
> The old pick_eevdf could fail to find the actual earliest eligible
> deadline when it descended to the right looking for min_deadline, but it
> turned out that that min_deadline wasn't actually eligible. In that case
> we need to go back and search through any left branches we skipped
> looking for the actual best _eligible_ min_deadline.
>
> This is more expensive, but still O(log n), and at worst should only
> involve descending two branches of the rbtree.
>
> I've run this through a userspace stress test (thank you
> tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
> corner cases.
>
> Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
> Signed-off-by: Ben Segall <[email protected]>
> ---
> kernel/sched/fair.c | 72 ++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 58 insertions(+), 14 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0c31cda0712f..77e9440b8ab3 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -864,18 +864,20 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
> *
> * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
> *
> * Which allows an EDF like search on (sub)trees.
> */
> -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> +static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
> {
> struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
> struct sched_entity *curr = cfs_rq->curr;
> struct sched_entity *best = NULL;
> + struct sched_entity *best_left = NULL;
>
> if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
> curr = NULL;
> + best = curr;
>
> /*
> * Once selected, run a task until it either becomes non-eligible or
> * until it gets a new slice. See the HACK in set_next_entity().
> */
> @@ -892,45 +894,87 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> node = node->rb_left;
> continue;
> }
>
> /*
> - * If this entity has an earlier deadline than the previous
> - * best, take this one. If it also has the earliest deadline
> - * of its subtree, we're done.
> + * Now we heap search eligible trees for the best (min_)deadline
> */
> - if (!best || deadline_gt(deadline, best, se)) {
> + if (!best || deadline_gt(deadline, best, se))
> best = se;
> - if (best->deadline == best->min_deadline)
> - break;
> - }
>
> /*
> - * If the earlest deadline in this subtree is in the fully
> - * eligible left half of our space, go there.
> + * Every se in a left branch is eligible, keep track of the
> + * branch with the best min_deadline
> */
> + if (node->rb_left) {
> + struct sched_entity *left = __node_2_se(node->rb_left);
> +
> + if (!best_left || deadline_gt(min_deadline, best_left, left))
> + best_left = left;
> +
> + /*
> + * min_deadline is in the left branch. rb_left and all
> + * descendants are eligible, so immediately switch to the second
> + * loop.
> + */
> + if (left->min_deadline == se->min_deadline)
> + break;
> + }
> +
> + /* min_deadline is at this node, no need to look right */
> + if (se->deadline == se->min_deadline)
> + break;
> +
> + /* else min_deadline is in the right branch. */
> + node = node->rb_right;
> + }
> +
> + /*
> + * We ran into an eligible node which is itself the best.
> + * (Or nr_running == 0 and both are NULL)
> + */
> + if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
> + return best;
> +
> + /*
> + * Now best_left and all of its children are eligible, and we are just
> + * looking for deadline == min_deadline
> + */
> + node = &best_left->run_node;
> + while (node) {
> + struct sched_entity *se = __node_2_se(node);
> +
> + /* min_deadline is the current node */
> + if (se->deadline == se->min_deadline)
> + return se;

IMHO it would be better to tiebreak on vruntime by moving this hunk to ..

> +
> + /* min_deadline is in the left branch */
> if (node->rb_left &&
> __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
> node = node->rb_left;
> continue;
> }

.. here, thoughts?

>
> + /* else min_deadline is in the right branch */
> node = node->rb_right;
> }
> + return NULL;

Why not 'best'? Since ..

> +}
>
> - if (!best || (curr && deadline_gt(deadline, best, curr)))
> - best = curr;
> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> +{
> + struct sched_entity *se = __pick_eevdf(cfs_rq);
>
> - if (unlikely(!best)) {
> + if (!se) {
> struct sched_entity *left = __pick_first_entity(cfs_rq);

.. cfs_rq->curr isn't considered here.

> if (left) {
> pr_err("EEVDF scheduling fail, picking leftmost\n");
> return left;
> }
> }
>
> - return best;
> + return se;
> }
>
> #ifdef CONFIG_SCHED_DEBUG
> struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
> {

The implementation of __pick_eevdf() is now quite complicated, which
makes it really hard to maintain. I'm trying my best to refactor this
part, hopefully it can be of some help. Below is only for illustration;
I need to test more.

static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *curr = cfs_rq->curr;
struct sched_entity *best = NULL;
struct sched_entity *cand = NULL;
bool all_eligible = false;

if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;

/*
* Once selected, run a task until it either becomes non-eligible or
* until it gets a new slice. See the HACK in set_next_entity().
*/
if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
return curr;

while (node) {
struct sched_entity *se = __node_2_se(node);

/*
* If this entity is not eligible, try the left subtree.
*/
if (!all_eligible && !entity_eligible(cfs_rq, se)) {
node = node->rb_left;
continue;
}

if (node->rb_left) {
struct sched_entity *left = __node_2_se(node->rb_left);

BUG_ON(left->min_deadline < se->min_deadline);

/* Tiebreak on vruntime */
if (left->min_deadline == se->min_deadline) {
node = node->rb_left;
all_eligible = true;
continue;
}

if (!all_eligible) {
/*
* We're going to search right subtree and the one
* with min_deadline can be non-eligible, so record
* the left subtree as a candidate.
*/
if (!cand || deadline_gt(min_deadline, cand, left))
cand = left;
}
}

/* min_deadline is at this node, no need to look right */
if (se->deadline == se->min_deadline) {
best = se;
break;
}

node = node->rb_right;

if (!node && cand) {
node = cand;
all_eligible = true;
cand = NULL;
}
}

if (!best || (curr && deadline_gt(deadline, best, curr)))
best = curr;

return best;
}

2023-10-11 13:16:34

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On Wed, Oct 11, 2023 at 08:12:09PM +0800, Abel Wu wrote:

> The implementation of __pick_eevdf() now is quite complicated which
> makes it really hard to maintain. I'm trying my best to refactor this
> part, hopefully can do some help. Below is only for illustration, I
> need to test more.

Well, the form it has now is somewhat close to the one in the paper. I
think Ben added one early break it doesn't have or something.

As the paper explains, you get two walks, one down the eligibility path,
and then one down the heap. I think the current code structure
represents that fairly well.

Very vague memories seem to suggest I thought I was being clever some
10+ years ago when I wrote that other descent loop that Ben showed to be
broken (and woe on me for not verifying it when I resurrected the
patches; I rewrote pretty much everything else, except this lookup :/).

2023-10-11 21:01:48

by Benjamin Segall

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Abel Wu <[email protected]> writes:

> On 9/30/23 8:09 AM, Benjamin Segall Wrote:
>> + /*
>> + * Now best_left and all of its children are eligible, and we are just
>> + * looking for deadline == min_deadline
>> + */
>> + node = &best_left->run_node;
>> + while (node) {
>> + struct sched_entity *se = __node_2_se(node);
>> +
>> + /* min_deadline is the current node */
>> + if (se->deadline == se->min_deadline)
>> + return se;
>
> IMHO it would be better tiebreak on vruntime by moving this hunk to ..
>
>> +
>> + /* min_deadline is in the left branch */
>> if (node->rb_left &&
>> __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
>> node = node->rb_left;
>> continue;
>> }
>
> .. here, thoughts?

Yeah, that should work and be better on the tiebreak (and my test code
agrees). There's an argument that the tiebreak will never really come up
and it's better to avoid the potential one extra cache line from
"__node_2_se(node->rb_left)->min_deadline" though.

>
>> + /* else min_deadline is in the right branch */
>> node = node->rb_right;
>> }
>> + return NULL;
>
> Why not 'best'? Since ..

The only time this can happen is if the tree is corrupt. We only reach
this case if best_left is set at all (and best_left's min_deadline is
better than "best"'s, which includes curr). In that case getting an
error message is good, and in general I wasn't worrying about it much.

>
>> +}
>> - if (!best || (curr && deadline_gt(deadline, best, curr)))
>> - best = curr;
>> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>> +{
>> + struct sched_entity *se = __pick_eevdf(cfs_rq);
>> - if (unlikely(!best)) {
>> + if (!se) {
>> struct sched_entity *left = __pick_first_entity(cfs_rq);
>
> .. cfs_rq->curr isn't considered here.

That said, we should probably consider curr here in the error-case
fallback, if only as an "if (!left) left = cfs_rq->curr;"

>
>> if (left) {
>> pr_err("EEVDF scheduling fail, picking leftmost\n");
>> return left;
>> }
>> }
>> - return best;
>> + return se;
>> }
>> #ifdef CONFIG_SCHED_DEBUG
>> struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
>> {
>
> The implementation of __pick_eevdf() now is quite complicated which
> makes it really hard to maintain. I'm trying my best to refactor this
> part, hopefully can do some help. Below is only for illustration, I
> need to test more.
>

A passing version of that single-loop code minus the cfs_rq->curr
handling is here (just pasting the curr handling back on the start/end
should do):

static struct sched_entity *pick_eevdf_abel(struct cfs_rq *cfs_rq)
{
	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
	struct sched_entity *best = NULL;
	struct sched_entity *cand = NULL;
	bool all_eligible = false;

	while (node || cand) {
		struct sched_entity *se;

		if (!node) {
			BUG_ON(!cand);
			node = &cand->run_node;
			se = __node_2_se(node);
			all_eligible = true;
			cand = NULL;

			/*
			 * Our initial pass ran into an eligible node which is
			 * itself the best
			 */
			if (best && (s64)(se->min_deadline - best->deadline) > 0)
				break;
		} else {
			se = __node_2_se(node);
		}

		/*
		 * If this entity is not eligible, try the left subtree.
		 */
		if (!all_eligible && !entity_eligible(cfs_rq, se)) {
			node = node->rb_left;
			continue;
		}

		if (node->rb_left) {
			struct sched_entity *left = __node_2_se(node->rb_left);

			BUG_ON(left->min_deadline < se->min_deadline);

			/* Tiebreak on vruntime */
			if (left->min_deadline == se->min_deadline) {
				node = node->rb_left;
				all_eligible = true;
				continue;
			}

			if (!all_eligible) {
				/*
				 * We're going to search the right subtree and
				 * the one with min_deadline can be non-eligible,
				 * so record the left subtree as a candidate.
				 */
				if (!cand || deadline_gt(min_deadline, cand, left))
					cand = left;
			}
		}

		if (!all_eligible && (!best || deadline_gt(deadline, best, se)))
			best = se;

		/* min_deadline is at this node, no need to look right */
		if (se->deadline == se->min_deadline) {
			if (all_eligible && (!best || deadline_gte(deadline, best, se)))
				best = se;
			if (!all_eligible && (!best || deadline_gt(deadline, best, se)))
				best = se;
			node = NULL;
			continue;
		}

		node = node->rb_right;
	}

	return best;
}

This does exactly as well on the tiebreak condition of vruntime as the
improved two-loop version you suggested. If we don't care about the
tiebreak we can replace the ugly if(all_eligible) if(!all_eligible) in
the last section with a single version that just uses deadline_gt (or
gte) throughout.
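
That simplified last section would be roughly (losing the vruntime
tiebreak; untested):

		/* min_deadline is at this node, no need to look right */
		if (se->deadline == se->min_deadline) {
			if (!best || deadline_gt(deadline, best, se))
				best = se;
			node = NULL;
			continue;
		}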

Personally with all the extra bits I added for correctness this winds up
definitely uglier than the two-loop version, but it might be possible to
clean it up further. (And a bunch of it is just personal style
preference in the first place)

I've also attached my ugly userspace EEVDF tester as an attachment,
hopefully I attached it in a correct mode to go through lkml.


Attachments:
eevdf-tester.patch (18.40 kB)
EEVDF tester

2023-10-12 10:04:44

by Abel Wu

[permalink] [raw]
Subject: Re: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On 10/11/23 9:14 PM, Peter Zijlstra Wrote:
> On Wed, Oct 11, 2023 at 08:12:09PM +0800, Abel Wu wrote:
>
> As the paper explains, you get two walks, one down the eligibility path,
> and then one down the heap. I think the current code structure
> represents that fairly well.

Yes, it does. I just wonder if the 2-step search is necessary, since
both walks obey the same heap-search rule:

1) node->min_deadline > node->left->min_deadline
1.1) BUG

2) node->min_deadline = node->left->min_deadline
2.1) go left if tiebreak on vruntime

3) node->min_deadline < node->left->min_deadline
3.1) return @node if it has min deadline, or
3.2) go right

which gives:

	while (node) {
		if ((left = node->left)) {
			/* 1.1 */
			BUG_ON(left->min < node->min);

			/* 2.1 */
			if (left->min == node->min) {
				node = left;
				continue;
			}
		}

		/* 3.1 */
		if (node->deadline == node->min)
			return node;

		/* 3.2 */
		node = node->right;
	}

The above returns the entity with the earliest deadline (and with the
smallest vruntime when deadlines are equal). Then eligibility comes in:

0) it helps prune the tree, since the right subtree of a
non-eligible node can't contain any eligible node.

3.2.1) record left as a fallback iff the eligibility check
is still active; saving the best one is enough since
such a left subtree contains no non-eligible node, IOW
the one with min deadline in that subtree must be
eligible.

4) the eligibility check ends immediately once we go left
from an eligible node, including the switch to the
fallback, which is essentially the 'left' of an eligible
node.

5) fall back to the candidate (if any) when we fail to find
an eligible entity with the earliest deadline.

which makes:

	candidate = NULL;
	need_check = true;

	while (node) {
		/* 0 */
		if (need_check && !eligible(node)) {
			node = node->left;
			goto next;
		}

		if ((left = node->left)) {
			/* 1.1 */
			BUG_ON(left->min < node->min);

			/* 2.1 */
			if (left->min == node->min) {
				node = left;
				/* 4 */
				need_check = false;
				continue;
			}

			/* 3.2.1 */
			if (need_check)
				candidate = better(candidate, left);
		}

		/* 3.1 */
		if (node->deadline == node->min)
			return node;

		/* 3.2 */
		node = node->right;
next:
		/* 5 */
		if (!node && candidate) {
			node = candidate;
			need_check = false;	/* 4 */
			candidate = NULL;
		}
	}

The search ends with a 'best' entity from the tree; comparing it with
curr, which is not in the tree, completes the pick.
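
E.g. with the existing curr handling kept around the tree search
(sketch only):

	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
		curr = NULL;

	/* ... the tree search above, yielding 'best' ... */

	if (!best || (curr && deadline_gt(deadline, best, curr)))
		best = curr;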

But it's absolutely fine by me to honor the 2-step search given in the
paper if you think that is already clear enough :)

Best,
Abel

2023-10-12 10:25:55

by Abel Wu

[permalink] [raw]
Subject: Re: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On 10/12/23 5:01 AM, Benjamin Segall Wrote:
> Abel Wu <[email protected]> writes:
>
>> On 9/30/23 8:09 AM, Benjamin Segall Wrote:
>>> + /*
>>> + * Now best_left and all of its children are eligible, and we are just
>>> + * looking for deadline == min_deadline
>>> + */
>>> + node = &best_left->run_node;
>>> + while (node) {
>>> + struct sched_entity *se = __node_2_se(node);
>>> +
>>> + /* min_deadline is the current node */
>>> + if (se->deadline == se->min_deadline)
>>> + return se;
>>
>> IMHO it would be better tiebreak on vruntime by moving this hunk to ..
>>
>>> +
>>> + /* min_deadline is in the left branch */
>>> if (node->rb_left &&
>>> __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
>>> node = node->rb_left;
>>> continue;
>>> }
>>
>> .. here, thoughts?
>
> Yeah, that should work and be better on the tiebreak (and my test code
> agrees). There's an argument that the tiebreak will never really come up
> and it's better to avoid the potential one extra cache line from
> "__node_2_se(node->rb_left)->min_deadline" though.

I see. Then probably do the same thing in the first loop?

>
>>
>>> + /* else min_deadline is in the right branch */
>>> node = node->rb_right;
>>> }
>>> + return NULL;
>>
>> Why not 'best'? Since ..
>
> The only time this can happen is if the tree is corrupt. We only reach
> this case if best_left is set at all (and best_left's min_deadline is
> better than "best"'s, which includes curr). In that case getting an
> error message is good, and in general I wasn't worrying about it much.

Right.

>
>>
>>> +}
>>> - if (!best || (curr && deadline_gt(deadline, best, curr)))
>>> - best = curr;
>>> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>>> +{
>>> + struct sched_entity *se = __pick_eevdf(cfs_rq);
>>> - if (unlikely(!best)) {
>>> + if (!se) {
>>> struct sched_entity *left = __pick_first_entity(cfs_rq);
>>
>> .. cfs_rq->curr isn't considered here.
>
> That said, we should probably consider curr here in the error-case
> fallback, if just as a "if (!left) left = cfs_rq->curr;"

I don't think so; if we ever get here there must be a bug in the
scheduler, so replacing 'pr_err' with 'BUG()' would be more appropriate.

>
>
> I've also attached my ugly userspace EEVDF tester as an attachment,
> hopefully I attached it in a correct mode to go through lkml.

Received. Thanks, Ben.

2023-10-12 17:51:39

by Benjamin Segall

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Abel Wu <[email protected]> writes:

> On 10/12/23 5:01 AM, Benjamin Segall Wrote:
>> Abel Wu <[email protected]> writes:
>>
>>> On 9/30/23 8:09 AM, Benjamin Segall Wrote:
>>>> + /*
>>>> + * Now best_left and all of its children are eligible, and we are just
>>>> + * looking for deadline == min_deadline
>>>> + */
>>>> + node = &best_left->run_node;
>>>> + while (node) {
>>>> + struct sched_entity *se = __node_2_se(node);
>>>> +
>>>> + /* min_deadline is the current node */
>>>> + if (se->deadline == se->min_deadline)
>>>> + return se;
>>>
>>> IMHO it would be better tiebreak on vruntime by moving this hunk to ..
>>>
>>>> +
>>>> + /* min_deadline is in the left branch */
>>>> if (node->rb_left &&
>>>> __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
>>>> node = node->rb_left;
>>>> continue;
>>>> }
>>>
>>> .. here, thoughts?
>> Yeah, that should work and be better on the tiebreak (and my test code
>> agrees). There's an argument that the tiebreak will never really come up
>> and it's better to avoid the potential one extra cache line from
>> "__node_2_se(node->rb_left)->min_deadline" though.
>
> I see. Then probably do the same thing in the first loop?
>

We effectively do that already sorta by accident almost always -
computing best and best_left via deadline_gt rather than gte prioritizes
earlier elements, which always have a better vruntime.

Then when we do the best_left->min_deadline vs best->deadline
comparison, we prioritize best_left, which is the one case it can be
wrong; we'd need an additional
"if (se->min_deadline == best->deadline &&
(s64)(se->vruntime - best->vruntime) > 0) return best;" check at the end
of the second loop.
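
Concretely, that check would slot in where the second loop currently
returns, something like (untested):

	/* min_deadline is the current node */
	if (se->deadline == se->min_deadline) {
		/* on a deadline tie, prefer best if it has the smaller vruntime */
		if (best && se->min_deadline == best->deadline &&
		    (s64)(se->vruntime - best->vruntime) > 0)
			return best;
		return se;
	}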

(Though again I don't know how much this sort of never-going-to-happen
slight fairness improvement is worth compared to the extra bit of
overhead)

2023-10-13 03:47:27

by Abel Wu

[permalink] [raw]
Subject: Re: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

On 10/13/23 1:51 AM, Benjamin Segall Wrote:
> Abel Wu <[email protected]> writes:
>
>> On 10/12/23 5:01 AM, Benjamin Segall Wrote:
>>> Abel Wu <[email protected]> writes:
>>>
>>>> On 9/30/23 8:09 AM, Benjamin Segall Wrote:
>>>>> + /*
>>>>> + * Now best_left and all of its children are eligible, and we are just
>>>>> + * looking for deadline == min_deadline
>>>>> + */
>>>>> + node = &best_left->run_node;
>>>>> + while (node) {
>>>>> + struct sched_entity *se = __node_2_se(node);
>>>>> +
>>>>> + /* min_deadline is the current node */
>>>>> + if (se->deadline == se->min_deadline)
>>>>> + return se;
>>>>
>>>> IMHO it would be better tiebreak on vruntime by moving this hunk to ..
>>>>
>>>>> +
>>>>> + /* min_deadline is in the left branch */
>>>>> if (node->rb_left &&
>>>>> __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
>>>>> node = node->rb_left;
>>>>> continue;
>>>>> }
>>>>
>>>> .. here, thoughts?
>>> Yeah, that should work and be better on the tiebreak (and my test code
>>> agrees). There's an argument that the tiebreak will never really come up
>>> and it's better to avoid the potential one extra cache line from
>>> "__node_2_se(node->rb_left)->min_deadline" though.
>>
>> I see. Then probably do the same thing in the first loop?
>>
>
> We effectively do that already sorta by accident almost always -
> computing best and best_left via deadline_gt rather than gte prioritizes
> earlier elements, which always have a better vruntime.

Sorry for not being clear about the 'same thing'. What I meant was to
avoid touching the left child if the node itself has the min deadline.

@@ -894,6 +894,9 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
if (!best || deadline_gt(deadline, best, se))
best = se;

+ if (se->deadline == se->min_deadline)
+ break;
+
/*
* Every se in a left branch is eligible, keep track of the
* branch with the best min_deadline
@@ -913,10 +916,6 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
break;
}

- /* min_deadline is at this node, no need to look right */
- if (se->deadline == se->min_deadline)
- break;
-
/* else min_deadline is in the right branch. */
node = node->rb_right;
}

(But still thanks for the convincing explanation on fairness.)

Best,
Abel

>
> Then when we do the best_left->min_deadline vs best->deadline
> computation, we prioritize best_left, which is the one case it can be
> wrong, we'd need an additional
> "if (se->min_deadline == best->deadline &&
> (s64)(se->vruntime - best->vruntime) > 0) return best;" check at the end
> of the second loop.
>
> (Though again I don't know how much this sort of never-going-to-happen
> slight fairness improvement is worth compared to the extra bit of
> overhead)

2023-10-13 16:52:05

by Benjamin Segall

[permalink] [raw]
Subject: Re: [PATCH] sched/fair: fix pick_eevdf to always find the correct se

Abel Wu <[email protected]> writes:

> On 10/13/23 1:51 AM, Benjamin Segall Wrote:
>> Abel Wu <[email protected]> writes:
>>
>>> On 10/12/23 5:01 AM, Benjamin Segall Wrote:
>>>> Abel Wu <[email protected]> writes:
>>>>
>>>>> On 9/30/23 8:09 AM, Benjamin Segall Wrote:
>>>>>> + /*
>>>>>> + * Now best_left and all of its children are eligible, and we are just
>>>>>> + * looking for deadline == min_deadline
>>>>>> + */
>>>>>> + node = &best_left->run_node;
>>>>>> + while (node) {
>>>>>> + struct sched_entity *se = __node_2_se(node);
>>>>>> +
>>>>>> + /* min_deadline is the current node */
>>>>>> + if (se->deadline == se->min_deadline)
>>>>>> + return se;
>>>>>
>>>>> IMHO it would be better tiebreak on vruntime by moving this hunk to ..
>>>>>
>>>>>> +
>>>>>> + /* min_deadline is in the left branch */
>>>>>> if (node->rb_left &&
>>>>>> __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
>>>>>> node = node->rb_left;
>>>>>> continue;
>>>>>> }
>>>>>
>>>>> .. here, thoughts?
>>>> Yeah, that should work and be better on the tiebreak (and my test code
>>>> agrees). There's an argument that the tiebreak will never really come up
>>>> and it's better to avoid the potential one extra cache line from
>>>> "__node_2_se(node->rb_left)->min_deadline" though.
>>>
>>> I see. Then probably do the same thing in the first loop?
>>>
>> We effectively do that already sorta by accident almost always -
>> computing best and best_left via deadline_gt rather than gte prioritizes
>> earlier elements, which always have a better vruntime.
>
> Sorry for not clarifying clearly about the 'same thing'. What I meant
> was to avoid touch left if the node itself has the min deadline.
>
> @@ -894,6 +894,9 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
> if (!best || deadline_gt(deadline, best, se))
> best = se;
>
> + if (se->deadline == se->min_deadline)
> + break;
> +
> /*
> * Every se in a left branch is eligible, keep track of the
> * branch with the best min_deadline
> @@ -913,10 +916,6 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
> break;
> }
>
> - /* min_deadline is at this node, no need to look right */
> - if (se->deadline == se->min_deadline)
> - break;
> -
> /* else min_deadline is in the right branch. */
> node = node->rb_right;
> }
>
> (But still thanks for the convincing explanation on fairness.)
>

Ah, yes, in terms of optimizing performance rather than marginal
fairness, that would help.