2023-03-28 11:09:48

by Peter Zijlstra

Subject: [PATCH 04/17] sched/fair: Add avg_vruntime

In order to move to an eligibility based scheduling policy, we need a
better approximation of the ideal scheduler.

Specifically, for a virtual time weighted fair queueing based
scheduler, the virtual time of the ideal scheduler is the weighted
average of the individual virtual runtimes (math in the comment).

As such, compute the weighted average to approximate the ideal
scheduler -- note that the approximation error is in the individual
task behaviour, which isn't strictly conformant.

Consider, for example, adding a task with a vruntime left of center;
in this case the average will move backwards in time -- something the
ideal scheduler would of course never do. Concretely: three
equal-weight tasks at vruntimes 10, 20 and 30 average to 20; adding a
fourth at vruntime 4 drags the average back to 16.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/debug.c | 32 ++++++--------
kernel/sched/fair.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 5 ++
3 files changed, 128 insertions(+), 20 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -580,10 +580,9 @@ static void print_rq(struct seq_file *m,

void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
+ struct sched_entity *last, *first;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
unsigned long flags;

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -597,26 +596,25 @@ void print_cfs_rq(struct seq_file *m, in
SPLIT_NS(cfs_rq->exec_clock));

raw_spin_rq_lock_irqsave(rq, flags);
- if (rb_first_cached(&cfs_rq->tasks_timeline))
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_vruntime = first->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
+ right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -601,9 +601,108 @@ static inline bool entity_before(const s
return (s64)(a->vruntime - b->vruntime) < 0;
}

+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
#define __node_2_se(node) \
rb_entry((node), struct sched_entity, run_node)

+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag: \Sum lag_i = 0
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0
+ *
+ * From which we solve V:
+ *
+ *     \Sum v_i * w_i
+ * V = --------------
+ *        \Sum w_i
+ *
+ * However, since v_i is u64 and the multiplication could easily overflow,
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v) + v
+ *
+ *     \Sum ((v_i - v) + v) * w_i   \Sum (v_i - v) * w_i
+ * V = -------------------------- = -------------------- + v
+ *              \Sum w_i                  \Sum w_i
+ *
+ * min_vruntime = v
+ * avg_vruntime = \Sum (v_i - v) * w_i
+ * cfs_rq->load = \Sum w_i
+ *
+ * Since min_vruntime is a monotonically increasing variable that closely
+ * tracks the per-task service, these deltas, (v_i - v), will be on the order
+ * of the maximal (virtual) lag induced in the system due to quantisation.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ if (load)
+ avg = div_s64(avg, load);
+
+ return cfs_rq->min_vruntime + avg;
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ u64 min_vruntime = cfs_rq->min_vruntime;
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ min_vruntime = vruntime;
+ }
+ return min_vruntime;
+}
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
@@ -629,7 +728,7 @@ static void update_min_vruntime(struct c

/* ensure we never gain time by being placed backwards. */
u64_u32_store(cfs_rq->min_vruntime,
- max_vruntime(cfs_rq->min_vruntime, vruntime));
+ __update_min_vruntime(cfs_rq, vruntime));
}

static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -642,12 +741,14 @@ static inline bool __entity_less(struct
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ avg_vruntime_add(cfs_rq, se);
rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ avg_vruntime_sub(cfs_rq, se);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -3330,6 +3431,8 @@ static void reweight_entity(struct cfs_r
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
+ else
+ avg_vruntime_sub(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
@@ -3345,9 +3448,11 @@ static void reweight_entity(struct cfs_r
#endif

enqueue_load_avg(cfs_rq, se);
- if (se->on_rq)
+ if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
-
+ if (cfs_rq->curr != se)
+ avg_vruntime_add(cfs_rq, se);
+ }
}

void reweight_task(struct task_struct *p, int prio)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,6 +558,9 @@ struct cfs_rq {
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */

+ s64 avg_vruntime;
+ u64 avg_load;
+
u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -3312,4 +3315,6 @@ static inline void switch_mm_cid(struct
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
#endif

+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+
#endif /* _KERNEL_SCHED_SCHED_H */
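
As an aside, here is a minimal userspace sketch of the relative-form
bookkeeping the patch introduces; all names (toy_rq, toy_entity, avg_add,
...) are invented for illustration -- the kernel operates on
cfs_rq/sched_entity, uses div_s64() for the signed division and
scale_load_down() on the weights.

#include <stdio.h>
#include <stdint.h>

struct toy_entity {
	uint64_t vruntime;		/* v_i */
	uint64_t weight;		/* w_i */
};

struct toy_rq {
	uint64_t min_vruntime;		/* v0 */
	int64_t  avg_vruntime;		/* \Sum (v_i - v0) * w_i */
	uint64_t avg_load;		/* \Sum w_i */
};

static int64_t entity_key(struct toy_rq *rq, struct toy_entity *se)
{
	return (int64_t)(se->vruntime - rq->min_vruntime);
}

static void avg_add(struct toy_rq *rq, struct toy_entity *se)
{
	rq->avg_vruntime += entity_key(rq, se) * (int64_t)se->weight;
	rq->avg_load += se->weight;
}

static void avg_sub(struct toy_rq *rq, struct toy_entity *se)
{
	rq->avg_vruntime -= entity_key(rq, se) * (int64_t)se->weight;
	rq->avg_load -= se->weight;
}

/* v0' = v0 + d ==> avg_vruntime' = avg_vruntime - d*avg_load */
static void advance_min_vruntime(struct toy_rq *rq, uint64_t d)
{
	rq->avg_vruntime -= (int64_t)rq->avg_load * (int64_t)d;
	rq->min_vruntime += d;
}

static uint64_t avg_vruntime(struct toy_rq *rq)
{
	int64_t avg = rq->avg_vruntime;

	if (rq->avg_load)
		avg /= (int64_t)rq->avg_load;	/* kernel: div_s64() */

	return rq->min_vruntime + avg;
}

int main(void)
{
	struct toy_rq rq = { .min_vruntime = 100 };
	struct toy_entity a = { .vruntime = 110, .weight = 2 };
	struct toy_entity b = { .vruntime = 140, .weight = 1 };

	avg_add(&rq, &a);
	avg_add(&rq, &b);
	/* V = (110*2 + 140*1) / 3 = 120 */
	printf("V = %llu\n", (unsigned long long)avg_vruntime(&rq));

	/* Advancing the v0 reference must leave V unchanged: still 120. */
	advance_min_vruntime(&rq, 5);
	printf("V = %llu\n", (unsigned long long)avg_vruntime(&rq));

	/* On leave, V collapses back to a's vruntime: 110. */
	avg_sub(&rq, &b);
	printf("V = %llu\n", (unsigned long long)avg_vruntime(&rq));
	return 0;
}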



2023-03-29 00:00:10

by Josh Don

Subject: Re: [PATCH 04/17] sched/fair: Add avg_vruntime

On Tue, Mar 28, 2023 at 4:06 AM Peter Zijlstra <[email protected]> wrote:
[...]
> +/*
> + * Compute virtual time from the per-task service numbers:
> + *
> + * Fair schedulers conserve lag: \Sum lag_i = 0
> + *
> + * lag_i = S - s_i = w_i * (V - v_i)
> + *
> + * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0

Small note: I think it would be helpful to label these symbols
somewhere :) Weight and vruntime are fairly obvious, but I don't
think 'S' and 'V' are as clear. Are these non-virtual ideal service
time, and average vruntime, respectively?

2023-03-29 07:52:50

by Peter Zijlstra

Subject: Re: [PATCH 04/17] sched/fair: Add avg_vruntime

On Tue, Mar 28, 2023 at 04:57:49PM -0700, Josh Don wrote:
> On Tue, Mar 28, 2023 at 4:06 AM Peter Zijlstra <[email protected]> wrote:
> [...]
> > +/*
> > + * Compute virtual time from the per-task service numbers:
> > + *
> > + * Fair schedulers conserve lag: \Sum lag_i = 0
> > + *
> > + * lag_i = S - s_i = w_i * (V - v_i)
> > + *
> > + * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0
>
> Small note: I think it would be helpful to label these symbols
> somewhere :) Weight and vruntime are fairly obvious, but I don't
> think 'S' and 'V' are as clear. Are these non-virtual ideal service
> time, and average vruntime, respectively?

Yep, they are. I'll see what I can do with the comments.

2023-04-05 19:19:07

by Peter Zijlstra

Subject: Re: [PATCH 04/17] sched/fair: Add avg_vruntime

On Wed, Mar 29, 2023 at 09:50:51AM +0200, Peter Zijlstra wrote:
> On Tue, Mar 28, 2023 at 04:57:49PM -0700, Josh Don wrote:
> > On Tue, Mar 28, 2023 at 4:06 AM Peter Zijlstra <[email protected]> wrote:
> > [...]
> > > +/*
> > > + * Compute virtual time from the per-task service numbers:
> > > + *
> > > + * Fair schedulers conserve lag: \Sum lag_i = 0
> > > + *
> > > + * lag_i = S - s_i = w_i * (V - v_i)
> > > + *
> > > + * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0
> >
> > Small note: I think it would be helpful to label these symbols
> > somewhere :) Weight and vruntime are fairly obvious, but I don't
> > think 'S' and 'V' are as clear. Are these non-virtual ideal service
> > time, and average vruntime, respectively?
>
> Yep, they are. I'll see what I can do with the comments.


/*
* Compute virtual time from the per-task service numbers:
*
* Fair schedulers conserve lag:
*
* \Sum lag_i = 0
*
* Where lag_i is given by:
*
* lag_i = S - s_i = w_i * (V - v_i)
*
 * Where S is the ideal service time and V is its virtual time counterpart.
* Therefore:
*
* \Sum lag_i = 0
* \Sum w_i * (V - v_i) = 0
 * \Sum w_i * V - \Sum w_i * v_i = 0
*
* From which we can solve an expression for V in v_i (which we have in
* se->vruntime):
*
 *     \Sum v_i * w_i   \Sum v_i * w_i
 * V = -------------- = --------------
 *        \Sum w_i            W
*
* Specifically, this is the weighted average of all entity virtual runtimes.
*
* [[ NOTE: this is only equal to the ideal scheduler under the condition
* that join/leave operations happen at lag_i = 0, otherwise the
 * virtual time has non-contiguous motion equivalent to:
*
* V +-= lag_i / W
*
* Also see the comment in place_entity() that deals with this. ]]
*
 * However, since v_i is u64 and the multiplication could easily overflow,
 * transform it into a relative form that uses smaller quantities:
*
* Substitute: v_i == (v_i - v0) + v0
*
 *     \Sum ((v_i - v0) + v0) * w_i   \Sum (v_i - v0) * w_i
 * V = ---------------------------- = --------------------- + v0
 *                  W                           W
*
* Which we track using:
*
* v0 := cfs_rq->min_vruntime
* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
* \Sum w_i := cfs_rq->avg_load
*
 * Since min_vruntime is a monotonically increasing variable that closely
 * tracks the per-task service, these deltas, (v_i - v0), will be on the order
 * of the maximal (virtual) lag induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
*
* As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
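
To make the NOTE above concrete (numbers invented for illustration):
take three weight-1 entities at v = {110, 120, 130}, so W = 3 and
V = 120. A fourth weight-1 entity joining with v_i = 100, i.e.
lag_i = w_i * (V - v_i) = 20, yields:

	V' = (3*120 + 1*100) / (3 + 1) = 115 = V - lag_i / (W + w_i)

so V jumps backwards by 5 -- exactly the lag_i / W term of the NOTE,
with W taken after the join.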


And the comment in place_entity() (slightly updated since this morning):


/*
* If we want to place a task and preserve lag, we have to
* consider the effect of the new entity on the weighted
* average and compensate for this, otherwise lag can quickly
* evaporate.
*
* Lag is defined as:
*
* lag_i = S - s_i = w_i * (V - v_i)
*
* To avoid the 'w_i' term all over the place, we only track
* the virtual lag:
*
* vl_i = V - v_i <=> v_i = V - vl_i
*
* And we take V to be the weighted average of all v:
*
* V = (\Sum w_j*v_j) / W
*
* Where W is: \Sum w_j
*
* Then, the weighted average after adding an entity with lag
* vl_i is given by:
*
 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
 *      = V - w_i*vl_i / (W + w_i)
*
* And the actual lag after adding an entity with vl_i is:
*
 *   vl'_i = V' - v_i
 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
 *         = vl_i - w_i*vl_i / (W + w_i)
*
* Which is strictly less than vl_i. So in order to preserve lag
* we should inflate the lag before placement such that the
* effective lag after placement comes out right.
*
* As such, invert the above relation for vl'_i to get the vl_i
* we need to use such that the lag after placement is the lag
* we computed before dequeue.
*
 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
 *
 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
 *                   = W*vl_i
 *
 *   vl_i = (W + w_i)*vl'_i / W
*/
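
A toy check of that inversion, as a standalone C snippet (the setup and
names are purely illustrative, and the integer division is exact for the
numbers chosen):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	long W = 3, w_i = 1;	/* pre-existing load, joining weight */
	long vl_target = 12;	/* the lag we want after placement */

	/* inflate: vl_i = (W + w_i)*vl'_i / W */
	long vl_i = (W + w_i) * vl_target / W;		/* 16 */

	/* placement then shrinks it: vl'_i = vl_i - w_i*vl_i / (W + w_i) */
	long vl_after = vl_i - w_i * vl_i / (W + w_i);	/* 16 - 4 = 12 */

	assert(vl_after == vl_target);
	printf("inflated vl_i=%ld, post-placement vl'_i=%ld\n",
	       vl_i, vl_after);
	return 0;
}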