2008-10-24 09:59:53

by Peter Zijlstra

Subject: [PATCH 6/8] sched: avg_vruntime

Renicing requires scaling the lag. Therefore we need a way to compute it.
Lag is defined as the difference between the service time received in the
ideal model and that received from the actual scheduler.

The defining property of a fair scheduler is that the sum of all lags is zero;
this is trivially true for the ideal case, as all lags are zero there.

Therefore, the average of all virtual runtimes will be the point of zero lag.
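
Spelled out for the equal-weight case, with v the zero-lag point in virtual
time and v_i the per-task virtual runtimes (illustrative notation only, not
part of the patch):

  lag_i = S_i(ideal) - S_i(actual),   \Sum_i lag_i = 0

  with equal weights, lag_i is proportional to (v - v_i), hence

  \Sum_i (v - v_i) = 0   =>   v = 1/n \Sum_i v_i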

We cannot prove fairness for CFS because of sleeper fairness (without it we
can). However, since we can observe that it does converge to fairness in
stable operation, we can say that the zero-lag point converges to the average.

We can't just take the average of vruntime, as it uses the full range of its
u64 and will wrap around. Instead we'll use the average of
(vruntime - min_vruntime):

1/n \Sum_{i}^{n} (v_{i} - v) = 1/n (\Sum_{i}^{n} v_{i}) - v

By factoring out the 1/n (and never storing it) we avoid rounding, which
would otherwise bring an accumulating error.
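
As a standalone illustration of that bookkeeping, here is a minimal userspace
sketch (made-up names, not the kernel code itself): keep the un-normalized
sum of keys plus a count, and divide only when the average is actually read,
so every update stays exact integer arithmetic.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative model of the key accumulator: store \Sum (v_i - min_v)
 * and n; the single division happens on read, so no per-update rounding
 * error can accumulate.
 */
struct avg {
	int64_t  sum_key;	/* \Sum (v_i - min_v), un-normalized */
	long     nr;		/* number of queued entities */
	uint64_t min_v;		/* monotonic reference point */
};

static void avg_add(struct avg *a, uint64_t v)
{
	a->sum_key += (int64_t)(v - a->min_v);
	a->nr++;
}

static void avg_sub(struct avg *a, uint64_t v)
{
	a->sum_key -= (int64_t)(v - a->min_v);
	a->nr--;
}

/* when the reference point advances by delta, every key shrinks by delta */
static void avg_shift(struct avg *a, int64_t delta)
{
	a->sum_key -= a->nr * delta;
	a->min_v   += delta;
}

static uint64_t avg_read(const struct avg *a)
{
	int64_t avg = a->sum_key;

	if (a->nr)
		avg /= a->nr;

	return a->min_v + avg;
}

int main(void)
{
	struct avg a = { 0, 0, 1000 };

	avg_add(&a, 1000);
	avg_add(&a, 1300);
	avg_add(&a, 4000);
	avg_sub(&a, 4000);	/* dequeue it again */
	avg_shift(&a, 100);	/* min_vruntime moved forward by 100 */

	/* prints 1150, the true average of {1000, 1300} */
	printf("%llu\n", (unsigned long long)avg_read(&a));
	return 0;
}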

Signed-off-by: Peter Zijlstra <[email protected]>
---
kernel/sched.c | 3 ++
kernel/sched_debug.c | 3 ++
kernel/sched_fair.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 61 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -383,6 +383,9 @@ struct cfs_rq {
struct load_weight load;
unsigned long nr_running;

+ long nr_queued;
+ s64 avg_vruntime;
+
u64 exec_clock;
u64 min_vruntime;

Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -161,6 +161,9 @@ void print_cfs_rq(struct seq_file *m, in
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+
#ifdef CONFIG_SCHEDSTATS
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -271,6 +271,56 @@ static inline s64 entity_key(struct cfs_
return se->vruntime - cfs_rq->min_vruntime;
}

+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 key = entity_key(cfs_rq, se);
+ cfs_rq->avg_vruntime += key;
+ cfs_rq->nr_queued++;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 key = entity_key(cfs_rq, se);
+ cfs_rq->avg_vruntime -= key;
+ cfs_rq->nr_queued--;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ cfs_rq->avg_vruntime -= cfs_rq->nr_queued * delta;
+}
+
+static u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ s64 avg = cfs_rq->avg_vruntime;
+ long nr_queued = cfs_rq->nr_queued;
+
+ if (cfs_rq->curr) {
+ nr_queued++;
+ avg += entity_key(cfs_rq, cfs_rq->curr);
+ }
+
+ if (nr_queued)
+ avg = div_s64(avg, nr_queued);
+
+ return cfs_rq->min_vruntime + avg;
+}
+
+static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - cfs_rq->min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ cfs_rq->min_vruntime = vruntime;
+ }
+}
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = cfs_rq->min_vruntime;
@@ -289,7 +339,7 @@ static void update_min_vruntime(struct c
vruntime = min_vruntime(vruntime, se->vruntime);
}

- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+ __update_min_vruntime(cfs_rq, vruntime);
}

/*
@@ -303,6 +353,8 @@ static void __enqueue_entity(struct cfs_
s64 key = entity_key(cfs_rq, se);
int leftmost = 1;

+ avg_vruntime_add(cfs_rq, se);
+
/*
* Find the right place in the rbtree:
*/
@@ -345,6 +397,8 @@ static void __dequeue_entity(struct cfs_
cfs_rq->next = NULL;

rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+
+ avg_vruntime_sub(cfs_rq, se);
}

static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)

--


2008-10-29 18:03:26

by Peter Zijlstra

Subject: Re: [PATCH 6/8] sched: avg_vruntime

On Fri, 2008-10-24 at 11:06 +0200, Peter Zijlstra wrote:
> plain text document attachment (sched-avg-vruntime.patch)
> Renicing requires scaling the lag. Therefore we need a way to compute it.
> Lag is defined as the difference between the service time received in the
> ideal model and that received from the actual scheduler.
>
> The defining property of a fair scheduler is that the sum of all lags is zero;
> this is trivially true for the ideal case, as all lags are zero there.
>
> Therefore, the average of all virtual runtimes will be the point of zero lag.
>
> We cannot prove fairness for CFS because of sleeper fairness (without it we
> can). However, since we can observe that it does converge to fairness in
> stable operation, we can say that the zero-lag point converges to the average.
>
> We can't just take the average of vruntime, as it uses the full range of its
> u64 and will wrap around. Instead we'll use the average of
> (vruntime - min_vruntime):
>
> 1/n \Sum_{i}^{n} (v_{i} - v) = 1/n (\Sum_{i}^{n} v_{i}) - v
>
> By factoring out the 1/n (and never storing it) we avoid rounding, which
> would otherwise bring an accumulating error.

Hi Fabio,

you were right, this is wrong.

How about this..

The fluid model would, for each task t_i, generate an execution time e_i:

de_i = w_i / w_sum * dt

However, any real scheduler will be imperfect and have an error eps_i

dE_i = de_i + eps_i,

But since only dt of actual time has passed, we can state that

\Sum_i dE_i = dt, therefore \Sum_i eps_i = 0.

This will be reflected in a virtual runtime skew of

dv_i = eps_i / w_i

If we now wish to obtain the zero-lag point, where all tasks would be in the
fluid model, we get

eps_i = dv_i * w_i, which yields: \Sum dv_i * w_i = 0

IOW avg(v_i*w_i) = v_fluid

1/n \Sum_i v_i*w_i, [v_i -> v_i - x] ->
1/n \Sum_i (v_i - x)*w_i =
1/n (\Sum_i v_i*w_i - \Sum_i x*w_i) =
1/n (\Sum_i v_i*w_i - x \Sum_i w_i)

which in turn would yield a patch like below..

I'll also try to quantify the error and the effect of using min_vruntime as
the zero-lag point, as Ingo suggested.
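
As a quick sanity check of the shift step, with made-up numbers: take two
entities with v_1 = 10, w_1 = 1 and v_2 = 20, w_2 = 3, so \Sum v_i*w_i = 70
and \Sum w_i = 4. Shifting the reference by x = 5 gives
\Sum (v_i - x)*w_i = 5*1 + 15*3 = 50 = 70 - 5*4, i.e. exactly the stored sum
minus x \Sum w_i.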

---
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c 2008-10-29 16:43:16.000000000 +0100
+++ linux-2.6/kernel/sched.c 2008-10-29 16:43:27.000000000 +0100
@@ -384,6 +384,10 @@ struct cfs_rq {
struct load_weight load;
unsigned long nr_running;

+ long nr_queued;
+ long avg_load;
+ s64 avg_vruntime;
+
u64 exec_clock;
u64 min_vruntime;

Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c 2008-10-29 16:43:04.000000000 +0100
+++ linux-2.6/kernel/sched_debug.c 2008-10-29 16:43:37.000000000 +0100
@@ -161,6 +161,9 @@ void print_cfs_rq(struct seq_file *m, in
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+
#ifdef CONFIG_SCHEDSTATS
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c 2008-10-29 16:43:17.000000000 +0100
+++ linux-2.6/kernel/sched_fair.c 2008-10-29 16:46:41.000000000 +0100
@@ -271,6 +271,60 @@ static inline s64 entity_key(struct cfs_
return se->vruntime - cfs_rq->min_vruntime;
}

+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 key = entity_key(cfs_rq, se);
+ cfs_rq->avg_load += se->load.weight;
+ cfs_rq->avg_vruntime += key * se->load.weight;
+ cfs_rq->nr_queued++;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 key = entity_key(cfs_rq, se);
+ cfs_rq->avg_load -= se->load.weight;
+ cfs_rq->avg_vruntime -= key * se->load.weight;
+ cfs_rq->nr_queued--;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ cfs_rq->avg_vruntime -= cfs_rq->nr_queued * cfs_rq->avg_load * delta;
+}
+
+static u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ s64 avg = cfs_rq->avg_vruntime;
+ long nr_queued = cfs_rq->nr_queued;
+
+ if (cfs_rq->curr) {
+ nr_queued++;
+ avg += entity_key(cfs_rq, cfs_rq->curr) * cfs_rq->curr->load.weight;
+ }
+
+ avg >>= NICE_0_SHIFT;
+
+ if (nr_queued)
+ avg = div_s64(avg, nr_queued);
+
+ return cfs_rq->min_vruntime + avg;
+}
+
+static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - cfs_rq->min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ cfs_rq->min_vruntime = vruntime;
+ }
+}
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = cfs_rq->min_vruntime;
@@ -289,7 +343,7 @@ static void update_min_vruntime(struct c
vruntime = min_vruntime(vruntime, se->vruntime);
}

- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+ __update_min_vruntime(cfs_rq, vruntime);
}

/*
@@ -303,6 +357,8 @@ static void __enqueue_entity(struct cfs_
s64 key = entity_key(cfs_rq, se);
int leftmost = 1;

+ avg_vruntime_add(cfs_rq, se);
+
/*
* Find the right place in the rbtree:
*/
@@ -345,6 +401,7 @@ static void __dequeue_entity(struct cfs_
cfs_rq->next = NULL;

rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ avg_vruntime_sub(cfs_rq, se);
}

static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)

2008-11-01 19:04:22

by Fabio Checconi

Subject: Re: [PATCH 6/8] sched: avg_vruntime

Hi,

> From: Peter Zijlstra <[email protected]>
> Date: Wed, Oct 29, 2008 04:48:34PM +0100
>
> On Fri, 2008-10-24 at 11:06 +0200, Peter Zijlstra wrote:
...
> How about this..
>
> The fluid model would, for each task t_i, generate an execution time e_i:
>
> de_i = w_i / w_sum * dt
>
> However, any real scheduler will be imperfect and have an error eps_i
>
> dE_i = de_i + eps_i,
>

...according to this equation...


> But since only dt of actual time has passed, we can state that
>
> \Sum_i dE_i = dt, therefore \Sum_i eps_i = 0.
>
> This will be reflected in a virtual runtime skew of
>
> dv_i = eps_i / w_i
>

...and to this one, what you call ``virtual runtime skew'' is:

dv_i = (dE_i - de_i) / w_i.


> If we now wish to obtain the zero-lag point, where all tasks would be in the
> fluid model, we get
>
> eps_i = dv_i * w_i, which yields: \Sum dv_i * w_i = 0
>
> IOW avg(v_i*w_i) = v_fluid
>

Looking at the code, it seems that you use the vruntime values of the
entities when you do the average, which are different from what you
previously called ``virtual runtime skew.'' I don't understand the
connection between the previous dv_i and the v_i there. Calling dVR_i
the vruntime increment for the i-th entity, dVR_i = dE_i / w_i, which
clearly differs from dv_i.

Moreover, v_fluid (considering all the flows backlogged from the beginning
and the set of active flows constant) is defined as:

v_fluid = 1 / w_sum * \sum w_i * VR_i,

so, unless w_sum == N, this differs from your expression for v_fluid.
Am I missing something there?
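
As a tiny made-up numeric illustration of the difference (illustrative
userspace code, not something from the patch): with two tasks of unequal
weight, dividing the weighted sum by the number of tasks lands nowhere near
the weighted average.

#include <stdio.h>

int main(void)
{
	/*
	 * Made-up numbers: two tasks with unequal weights.  1/n of the
	 * weighted vruntime sum only equals the fluid-model average
	 * v_fluid = 1/w_sum * \Sum w_i*v_i when the average weight
	 * equals the weight unit.
	 */
	double v[2] = { 100.0, 200.0 };		/* vruntimes */
	double w[2] = { 1.0, 3.0 };		/* weights */
	double sum_wv = 0.0, w_sum = 0.0;
	int i, n = 2;

	for (i = 0; i < n; i++) {
		sum_wv += w[i] * v[i];
		w_sum += w[i];
	}

	printf("1/n     * Sum w_i*v_i = %.1f\n", sum_wv / n);		/* 350.0 */
	printf("1/w_sum * Sum w_i*v_i = %.1f\n", sum_wv / w_sum);	/* 175.0 */
	return 0;
}

The first value, 350, even falls outside the [100, 200] vruntime range, while
the weighted form gives 175; the two only coincide when w_sum equals n in
weight units, i.e. when the average weight is exactly the unit weight
(NICE_0_LOAD, given the >>= NICE_0_SHIFT in the quoted patch).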


> 1/n \Sum_i v_i*w_i, [v_i -> v_i - x] ->
> 1/n \Sum_i (v_i - x)*w_i =
> 1/n (\Sum_i v_i*w_i - \Sum_i x*w_i) =
> 1/n (\Sum_i v_i*w_i - x \Sum_i w_i)
>
> which in turn would yield a patch like below..
>
> I'll also try to quantify the error and the effect of using min_vruntime as
> the zero-lag point, as Ingo suggested.
>

min_vruntime, given its definition, is very likely to be near the maximum
lag point...


> ---
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c 2008-10-29 16:43:16.000000000 +0100
> +++ linux-2.6/kernel/sched.c 2008-10-29 16:43:27.000000000 +0100
> @@ -384,6 +384,10 @@ struct cfs_rq {
> struct load_weight load;
> unsigned long nr_running;
>
> + long nr_queued;
> + long avg_load;
> + s64 avg_vruntime;
> +
> u64 exec_clock;
> u64 min_vruntime;
>
> Index: linux-2.6/kernel/sched_debug.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_debug.c 2008-10-29 16:43:04.000000000 +0100
> +++ linux-2.6/kernel/sched_debug.c 2008-10-29 16:43:37.000000000 +0100
> @@ -161,6 +161,9 @@ void print_cfs_rq(struct seq_file *m, in
> SPLIT_NS(spread0));
> SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
> SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
> + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
> + SPLIT_NS(avg_vruntime(cfs_rq)));
> +
> #ifdef CONFIG_SCHEDSTATS
> #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
>
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c 2008-10-29 16:43:17.000000000 +0100
> +++ linux-2.6/kernel/sched_fair.c 2008-10-29 16:46:41.000000000 +0100
> @@ -271,6 +271,60 @@ static inline s64 entity_key(struct cfs_
> return se->vruntime - cfs_rq->min_vruntime;
> }
>
> +static void
> +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + s64 key = entity_key(cfs_rq, se);
> + cfs_rq->avg_load += se->load.weight;
> + cfs_rq->avg_vruntime += key * se->load.weight;
> + cfs_rq->nr_queued++;
> +}
> +
> +static void
> +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + s64 key = entity_key(cfs_rq, se);
> + cfs_rq->avg_load -= se->load.weight;
> + cfs_rq->avg_vruntime -= key * se->load.weight;
> + cfs_rq->nr_queued--;
> +}
> +
> +static inline
> +void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
> +{
> + cfs_rq->avg_vruntime -= cfs_rq->nr_queued * cfs_rq->avg_load * delta;
> +}
> +
> +static u64 avg_vruntime(struct cfs_rq *cfs_rq)
> +{
> + s64 avg = cfs_rq->avg_vruntime;
> + long nr_queued = cfs_rq->nr_queued;
> +
> + if (cfs_rq->curr) {
> + nr_queued++;
> + avg += entity_key(cfs_rq, cfs_rq->curr) * cfs_rq->curr->load.weight;
> + }
> +
> + avg >>= NICE_0_SHIFT;
> +
> + if (nr_queued)
> + avg = div_s64(avg, nr_queued);
> +
> + return cfs_rq->min_vruntime + avg;
> +}
> +
> +static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
> +{
> + /*
> + * open coded max_vruntime() to allow updating avg_vruntime
> + */
> + s64 delta = (s64)(vruntime - cfs_rq->min_vruntime);
> + if (delta > 0) {
> + avg_vruntime_update(cfs_rq, delta);
> + cfs_rq->min_vruntime = vruntime;
> + }
> +}
> +
> static void update_min_vruntime(struct cfs_rq *cfs_rq)
> {
> u64 vruntime = cfs_rq->min_vruntime;
> @@ -289,7 +343,7 @@ static void update_min_vruntime(struct c
> vruntime = min_vruntime(vruntime, se->vruntime);
> }
>
> - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
> + __update_min_vruntime(cfs_rq, vruntime);
> }
>
> /*
> @@ -303,6 +357,8 @@ static void __enqueue_entity(struct cfs_
> s64 key = entity_key(cfs_rq, se);
> int leftmost = 1;
>
> + avg_vruntime_add(cfs_rq, se);
> +
> /*
> * Find the right place in the rbtree:
> */
> @@ -345,6 +401,7 @@ static void __dequeue_entity(struct cfs_
> cfs_rq->next = NULL;
>
> rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
> + avg_vruntime_sub(cfs_rq, se);
> }
>
> static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
>