2007-09-02 12:02:27

by Ingo Molnar

Subject: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler


* Ingo Molnar <[email protected]> wrote:

> Your math is fairly simple (and that is _good_, just like CFS's
> existing math is simple), it can be summed up in essence as (without
> complicating it with nice-level weighting, for easy
> understandability):
>
> " use the already existing p->sum_exec_runtime 'task runtime' metric
> that CFS maintains, and use that as the key into the rb-tree that
> selects tasks that should be run next. To handle sleeping tasks: keep
> a per-rq sum of all runnable tasks' ->sum_exec_runtime values and
> start newly woken tasks at the average rq->sum/nr_running value. "
>
> Now your patch does not actually do it that way in a clearly
> discernible manner because lots of changes are intermixed into one big
> patch.
>
> ( please correct me if i got your math wrong. Your patch does not add
> any comments at all to the new code, which slowed down my review and
> analysis quite considerably. Lack of comments makes it harder to see
> the purpose of, and to notice the benefits/tradeoffs involved in, each
> change. )

Roman, as an addendum to my review, please find below a prototype patch
i've just written that implements RSRFS (Really Simple Really Fair
Scheduler) ontop of CFS. It is intended to demonstrate the essence of
the math you have presented via your patch. (it has no nice levels
support yet, to make the math really apparent to everyone interested)

It's a truly simple patch:

3 files changed, 30 insertions(+), 23 deletions(-)

and gives good fairness:

$ ./massive_intr 10 10
002510 00000114
002515 00000114
002518 00000114
002514 00000114
002513 00000115
002509 00000115
002517 00000115
002511 00000115
002512 00000115
002516 00000115

Could you please confirm whether the math algorithm you are suggesting
is implemented roughly correctly by this patch? (ignoring nice levels,
i.e. only considering nice-0 tasks, and ignoring rounding issues,
Bresenham optimizations, CPU time slicing and other changes.)
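
For readers skimming the thread, the nice-0 math above can be sketched in
userspace as follows (my illustration, not part of the patch; all names
are made up):

```python
# Toy model of the RSRFS math: every runnable task is keyed by its
# accumulated exec_runtime, the leftmost (smallest-key) task runs next,
# and a newly woken task would start at the per-rq average runtime.
import heapq

TICK = 1  # one scheduling granule, in arbitrary time units

def run(runtimes, ticks):
    """runtimes: dict name -> exec_runtime; run `ticks` granules."""
    heap = [(rt, name) for name, rt in runtimes.items()]
    heapq.heapify(heap)
    for _ in range(ticks):
        rt, name = heapq.heappop(heap)       # "leftmost" task in the tree
        heapq.heappush(heap, (rt + TICK, name))
    return {name: rt for rt, name in heap}

def wakeup_key(runtimes):
    """Key for a newly woken task: rq->sum / nr_running."""
    return sum(runtimes.values()) // len(runtimes)
```

Ten tasks started together end up within one granule of each other, which
is the kind of fairness the massive_intr output above shows.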

Thanks,

Ingo

---
include/linux/sched.h | 1 +
kernel/sched.c | 2 ++
kernel/sched_fair.c | 50 +++++++++++++++++++++++++++-----------------------
3 files changed, 30 insertions(+), 23 deletions(-)

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -904,6 +904,7 @@ struct sched_entity {

u64 exec_start;
u64 sum_exec_runtime;
+ u64 exec_runtime;
u64 prev_sum_exec_runtime;
u64 wait_start_fair;
u64 sleep_start_fair;
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -183,6 +183,7 @@ struct cfs_rq {

s64 fair_clock;
u64 exec_clock;
+ u64 exec_runtime;
s64 wait_runtime;
u64 sleeper_bonus;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
@@ -1586,6 +1587,7 @@ static void __sched_fork(struct task_str
{
p->se.wait_start_fair = 0;
p->se.exec_start = 0;
+ p->se.exec_runtime = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.delta_exec = 0;
Index: linux/kernel/sched_fair.c
===================================================================
--- linux.orig/kernel/sched_fair.c
+++ linux/kernel/sched_fair.c
@@ -82,7 +82,7 @@ enum {
};

unsigned int sysctl_sched_features __read_mostly =
- SCHED_FEAT_FAIR_SLEEPERS *1 |
+ SCHED_FEAT_FAIR_SLEEPERS *0 |
SCHED_FEAT_SLEEPER_AVG *0 |
SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
SCHED_FEAT_PRECISE_CPU_LOAD *1 |
@@ -194,6 +194,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq,
update_load_add(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
+
+ cfs_rq->exec_runtime += se->exec_runtime;
}

static inline void
@@ -205,6 +207,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq,
update_load_sub(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
+
+ cfs_rq->exec_runtime -= se->exec_runtime;
}

static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -347,6 +351,8 @@ __update_curr(struct cfs_rq *cfs_rq, str

curr->sum_exec_runtime += delta_exec;
cfs_rq->exec_clock += delta_exec;
+ curr->exec_runtime += delta_exec;
+ cfs_rq->exec_runtime += delta_exec;

if (unlikely(!load))
return;
@@ -431,7 +437,7 @@ calc_weighted(unsigned long delta, unsig
*/
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- s64 key;
+ u64 key;

/*
* Are we enqueueing a waiting task? (for current tasks
@@ -442,26 +448,7 @@ static void update_stats_enqueue(struct
/*
* Update the key:
*/
- key = cfs_rq->fair_clock;
-
- /*
- * Optimize the common nice 0 case:
- */
- if (likely(se->load.weight == NICE_0_LOAD)) {
- key -= se->wait_runtime;
- } else {
- u64 tmp;
-
- if (se->wait_runtime < 0) {
- tmp = -se->wait_runtime;
- key += (tmp * se->load.inv_weight) >>
- (WMULT_SHIFT - NICE_0_SHIFT);
- } else {
- tmp = se->wait_runtime;
- key -= (tmp * se->load.inv_weight) >>
- (WMULT_SHIFT - NICE_0_SHIFT);
- }
- }
+ key = se->exec_runtime;

se->fair_key = key;
}
@@ -583,6 +570,20 @@ static void __enqueue_sleeper(struct cfs
cfs_rq->sleeper_bonus += delta_fair;
}

+/*
+ * Newly woken tasks are put into the "middle" of all runnable
+ * tasks' current runtime:
+ */
+static u64 avg_exec_runtime(struct cfs_rq *cfs_rq)
+{
+ u64 avg_exec_runtime = cfs_rq->exec_runtime;
+
+ if (cfs_rq->nr_running)
+ do_div(avg_exec_runtime, cfs_rq->nr_running);
+
+ return avg_exec_runtime;
+}
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *tsk = task_of(se);
@@ -640,8 +641,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
*/
update_curr(cfs_rq);

- if (wakeup)
+ if (wakeup) {
+ se->exec_runtime = avg_exec_runtime(cfs_rq);
enqueue_sleeper(cfs_rq, se);
+ }

update_stats_enqueue(cfs_rq, se);
__enqueue_entity(cfs_rq, se);
@@ -1126,6 +1129,7 @@ static void task_new_fair(struct rq *rq,
schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
}

+ se->exec_runtime = avg_exec_runtime(cfs_rq);
__enqueue_entity(cfs_rq, se);
}


2007-09-02 19:12:41

by Tong Li

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler

I like this patch since it's really simple. CFS does provide a nice
infrastructure to enable new algorithmic changes/extensions. My only
concern was the O(log N) complexity under heavy load, but I'm willing to
agree that it's OK in the common case. Some comments on the code:

> * Ingo Molnar <[email protected]> wrote:
>
> static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - s64 key;
> + u64 key;
>
> /*
> * Are we enqueueing a waiting task? (for current tasks
> @@ -442,26 +448,7 @@ static void update_stats_enqueue(struct
> /*
> * Update the key:
> */
> - key = cfs_rq->fair_clock;
> -
> - /*
> - * Optimize the common nice 0 case:
> - */
> - if (likely(se->load.weight == NICE_0_LOAD)) {
> - key -= se->wait_runtime;
> - } else {
> - u64 tmp;
> -
> - if (se->wait_runtime < 0) {
> - tmp = -se->wait_runtime;
> - key += (tmp * se->load.inv_weight) >>
> - (WMULT_SHIFT - NICE_0_SHIFT);
> - } else {
> - tmp = se->wait_runtime;
> - key -= (tmp * se->load.inv_weight) >>
> - (WMULT_SHIFT - NICE_0_SHIFT);
> - }
> - }
> + key = se->exec_runtime;
>
> se->fair_key = key;
> }

Should we use the weighted fair clock exec_runtime as the key? This way
tasks with larger weights will have their keys incremented more slowly and
thus be given more CPU time. This is what other virtual-clock based fair
scheduling algorithms commonly do.

> @@ -583,6 +570,20 @@ static void __enqueue_sleeper(struct cfs
> cfs_rq->sleeper_bonus += delta_fair;
> }
>
> +/*
> + * Newly woken tasks are put into the "middle" of all runnable
> + * tasks' current runtime:
> + */
> +static u64 avg_exec_runtime(struct cfs_rq *cfs_rq)
> +{
> + u64 avg_exec_runtime = cfs_rq->exec_runtime;
> +
> + if (cfs_rq->nr_running)
> + do_div(avg_exec_runtime, cfs_rq->nr_running);
> +
> + return avg_exec_runtime;
> +}
> +
> static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> struct task_struct *tsk = task_of(se);
> @@ -640,8 +641,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
> */
> update_curr(cfs_rq);
>
> - if (wakeup)
> + if (wakeup) {
> + se->exec_runtime = avg_exec_runtime(cfs_rq);
> enqueue_sleeper(cfs_rq, se);
> + }
>
> update_stats_enqueue(cfs_rq, se);
> __enqueue_entity(cfs_rq, se);
> @@ -1126,6 +1129,7 @@ static void task_new_fair(struct rq *rq,
> schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
> }
>
> + se->exec_runtime = avg_exec_runtime(cfs_rq);
> __enqueue_entity(cfs_rq, se);
> }

What's the intuition behind avg_exec_runtime? I thought the original CFS
approach, i.e., setting a newly arriving task's key to be the current fair
clock, adjusted by wait_runtime, was good. It matches other fair queuing
algorithms and thus has provably good properties.

tong

2007-09-02 19:45:07

by Ingo Molnar

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler


* Tong Li <[email protected]> wrote:

> I like this patch since it's really simple. CFS does provide a nice
> infrastructure to enable new algorithmic changes/extensions. My only
> concern was the O(log N) complexity under heavy load, but I'm willing
> to agree that it's OK in the common case. [...]

yeah. Note that just in case i wasn't clear enough: my patch attempts to
be an adaptation of the core fairness math algorithm Roman suggested - so
it is not my idea and i don't want to take credit for it. (if it were to
go upstream it would of course carry a prominent "fairness math
rewritten by Roman Zippel" credit.)

about O(log N) complexity: the "timeline" nature of the CFS rbtree
(which Roman's patch preserves) guarantees a certain good level of cache
locality. We generally insert tasks at the "right side" of the tree and
remove them from the "left side" (not always, of course, but for most
workloads), so in practice, on a reasonable CPU, there's no difference
from the cache-miss patterns of pure O(1) algorithms. And in terms of
cycle overhead, let's consider something really extreme: _one million
runnable tasks on a single CPU_ (which is clearly silly and
unrealistic), which has a worst-case rbtree depth of ~31. A modern CPU
can walk a 31-deep binary tree in the neighborhood of 100 cycles
(cached). That makes it comparable to the O(1) scheduler's reliance on
the BSF instruction on x86 (which costs a few dozen cycles, if i
remember correctly). In practice, O(log(N)) algorithms are really
equivalent to O(1) algorithms. The big thing about the "O(1) scheduler"
was that the scheduler it replaced was O(N). Now an O(N) algorithm
_does_ hurt.
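
As a quick sanity check on those depth figures (my arithmetic, not from
the thread): a red-black tree with n nodes has a height between
log2(n+1) and 2*log2(n+1), so one million tasks gives a depth somewhere
in the twenties-to-forties, consistent with the ~31 ballpark:

```python
# Height bounds for a red-black tree holding n keys: the best case is a
# perfectly balanced binary tree (log2(n+1)); the worst case is the
# classic red-black bound of 2*log2(n+1).
import math

def rbtree_height_bounds(n):
    best = math.log2(n + 1)
    worst = 2 * math.log2(n + 1)
    return best, worst
```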

No doubt, people _will_ play with CFS and will try to implement its
timeline data structure using O(1) algorithms (or improved tree
algorithms). It's doable and it will certainly be interesting to see the
results of such experiments. The rbtree was simply the most natural
choice of an already existing, lightweight in-kernel tree data
structure. [ It's also used by the MM so continued sanity and
performance of that code is guaranteed by the MM hackers ;-) ]

> [...] Some comments on the code:

> >+ key = se->exec_runtime;
> >
> > se->fair_key = key;
> >}
>
> Should we use the weighted fair clock exec_runtime as the key? This
> way tasks with larger weights will have their keys incremented more
> slowly and thus be given more CPU time. This is what other
> virtual-clock based fair scheduling algorithms commonly do.

yep. The code i posted treats all tasks as nice-0. I suspect that by
adding a calc_weighted() transformation to the key calculation above
we'd get most of the nice level support. (but i haven't tried that yet -
i was mainly interested in a simple expression of Roman's ideas)

> >+ se->exec_runtime = avg_exec_runtime(cfs_rq);
> > __enqueue_entity(cfs_rq, se);
> >}
>
> What's the intuition behind avg_exec_runtime? I thought the original
> CFS approach, i.e., setting a newly arriving task's key to be the
> current fair clock, adjusted by wait_runtime, was good. It matches
> other fair queuing algorithms and thus has provably good properties.

it's, i think, what Roman's algorithm does, and i wanted to implement
that and only that, to be able to review the effects of a much simpler,
yet behaviorally equivalent patch. Perhaps Roman can shed some light on
the thinking behind that average.

Ingo

2007-09-03 18:37:57

by Roman Zippel

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler

Hi,

On Sun, 2 Sep 2007, Ingo Molnar wrote:

> Roman, as an addendum to my review, please find below a prototype patch
> i've just written that implements RSRFS (Really Simple Really Fair
> Scheduler) ontop of CFS. It is intended to demonstrate the essence of
> the math you have presented via your patch. (it has no nice levels
> support yet, to make the math really apparent to everyone interested)

It simplifies the math too much; the nice-level weighting is an
essential part of it, and without it one can't really understand the
problem I'm trying to solve. At http://lkml.org/lkml/2007/9/3/168 I
provide an example which more accurately demonstrates the essential
math.

bye, Roman

2007-09-03 18:54:24

by Ingo Molnar

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler


* Roman Zippel <[email protected]> wrote:

> Hi,
>
> On Sun, 2 Sep 2007, Ingo Molnar wrote:
>
> > Roman, as an addendum to my review, please find below a prototype patch
> > i've just written that implements RSRFS (Really Simple Really Fair
> > Scheduler) ontop of CFS. It is intended to demonstrate the essence of
> > the math you have presented via your patch. (it has no nice levels
> > support yet, to make the math really apparent to everyone interested)
>
> It simplifies the math too much; the nice-level weighting is an
> essential part of it, and without it one can't really understand the
> problem I'm trying to solve. At http://lkml.org/lkml/2007/9/3/168
> I provide an example which more accurately demonstrates the essential
> math.

as i mentioned above, in this first-level review i'm mainly interested
in your patch's behavior wrt. simple nice-0 tasks: how they run, and how
they sleep and wake up. Nothing else. Why? That's what 99% of Linux
tasks do, after all. So could you please confirm whether that aspect of
what i wrote (both in words and in code) is correct?

If this basic model is correct, we can look further. If this basic model
is flawed, no amount of weighting, nice level logic, rounding behavior
can fix it. I'm sure you agree with that too.

That's why i wrote this prototype; please confirm or deny whether i
correctly understood how you intend to handle nice-0 tasks. (From your
mails i read a partial acknowledgement of that, but i'd like to see a
clearer acknowledgement - or please mention any issues if you know about
them.) Thanks!

Ingo

2007-09-03 19:13:21

by Roman Zippel

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler

Hi,

On Mon, 3 Sep 2007, Ingo Molnar wrote:

> If this basic model is correct, we can look further.

The basic model is correct insofar as I use an absolute time instead of
a relative time, but it's not the essence of my math, so I don't quite
understand the point of this exercise.

bye, Roman

2007-09-03 19:21:26

by Ingo Molnar

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler


* Roman Zippel <[email protected]> wrote:

> On Mon, 3 Sep 2007, Ingo Molnar wrote:
>
> > If this basic model is correct, we can look further.
>
> The basic model is correct insofar as I use an absolute time instead
> of a relative time, but it's not the essence of my math, so I don't
> quite understand the point of this exercise.

thanks. (and i did not claim nor do i want to claim this to be the
essence of your efforts - it is very clear from your mails where your
focus is.)

My next question then is about this code of yours in the wakeup path:

+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ kclock_t min_time;
+
+ verify_queue(cfs_rq, cfs_rq->curr != se, se);
+ min_time = get_time_avg(cfs_rq) - se->req_weight_inv;
+ if ((kclock_t)(se->time_norm - min_time) < 0)
+ se->time_norm = min_time;

why do you only use the "min_time" if the pre-sleep time_norm is smaller
than the min_time? Here 'min_time' is close to the current average.
Shouldn't the woken-up task be set to the average time here, as i did
in the crude prototype:

+ se->exec_runtime = avg_exec_runtime(cfs_rq);

(and lets again only consider the special case of only having nice-0
tasks.)

Or is it set in a similar way as my prototype does, and i missed some
detail why that branch is there?

Ingo

2007-09-03 19:55:16

by Roman Zippel

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler

Hi,

On Mon, 3 Sep 2007, Ingo Molnar wrote:

> My next question then is about this code of yours in the wakeup path:
>
> +static void
> +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + kclock_t min_time;
> +
> + verify_queue(cfs_rq, cfs_rq->curr != se, se);
> + min_time = get_time_avg(cfs_rq) - se->req_weight_inv;
> + if ((kclock_t)(se->time_norm - min_time) < 0)
> + se->time_norm = min_time;
>
> why do you only use the "min_time" if the pre-sleep time_norm is smaller
> than the min_time? Here 'min_time' is close to the current average.

It's a variation of the sleeper bonus. Let's assume two running tasks
which have been running for 95ms and 105ms, and a time slice of 10ms;
the average is thus 100ms. If the new task has been sleeping for a
while, it starts at 90ms; if it had been running lately, it doesn't get
this bonus again.
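
In other words (my sketch of the quoted min_time logic, using Roman's
numbers; all names here are illustrative, not from the patch):

```python
# Two runnable tasks at 95ms and 105ms with a 10ms slice: the average
# is 100ms, so min_time = 90ms. A long sleeper is pulled up to 90ms; a
# task that ran recently keeps its own larger time and gets no second
# bonus - mirroring "if (time_norm - min_time < 0) time_norm = min_time".

def wakeup_time_norm(runtimes, time_slice, pre_sleep_time):
    avg = sum(runtimes) // len(runtimes)
    min_time = avg - time_slice            # the sleeper-bonus floor
    return max(pre_sleep_time, min_time)   # only raise, never lower
```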

> Shouldn't the woken-up task be set to the average time here, as i did
> in the crude prototype:
>
> + se->exec_runtime = avg_exec_runtime(cfs_rq);

That would be equivalent to simply clearing wait_runtime in CFS.

bye, Roman

2007-09-03 20:04:20

by Ingo Molnar

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler


* Roman Zippel <[email protected]> wrote:

> Hi,
>
> On Mon, 3 Sep 2007, Ingo Molnar wrote:
>
> > My next question then is about this code of yours in the wakeup path:
> >
> > +static void
> > +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > +{
> > + kclock_t min_time;
> > +
> > + verify_queue(cfs_rq, cfs_rq->curr != se, se);
> > + min_time = get_time_avg(cfs_rq) - se->req_weight_inv;
> > + if ((kclock_t)(se->time_norm - min_time) < 0)
> > + se->time_norm = min_time;
> >
> > why do you only use the "min_time" if the pre-sleep time_norm is smaller
> > than the min_time? Here 'min_time' is close to the current average.
>
> It's a variation of the sleeper bonus. [...]

hm, where are its effects described in your explanation? Seems like a
key item.

> [...] Let's assume two running tasks which have been running for 95ms
> and 105ms and a time slice of 10ms, the average is thus 100ms. If the
> new task has been sleeping for a while it starts at 90ms, if the task
> had been running lately it doesn't get this bonus again.

what happens if there are lots of such tasks? What limits the total
bonus?

> > Shouldn't the woken-up task be set to the average time here, as i
> > did in the crude prototype:
> >
> > + se->exec_runtime = avg_exec_runtime(cfs_rq);
>
> That would be equivalent to simply clearing wait_runtime in CFS.

so my prototype patch is not an exact map of the nice-0 special case of
your code? Would this then perhaps be the correct thing:

+ se->exec_runtime =
+ max(avg_exec_runtime(cfs_rq), se->exec_runtime);

Or if not, could you suggest a code-line at that place? Thanks,

Ingo

2007-09-04 02:50:40

by Roman Zippel

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler

Hi,

On Mon, 3 Sep 2007, Ingo Molnar wrote:

> > It's a variation of the sleeper bonus. [...]
>
> hm, where are its effects described in your explanation? Seems like a
> key item.

It has no direct effect on the correctness of the mathematical model;
the time is initialized before it is added to the model. What you're
after is the effect on the behaviour, which wasn't my focus, so sorry, I
didn't mention it.

> > [...] Let's assume two running tasks which have been running for 95ms
> > and 105ms and a time slice of 10ms, the average is thus 100ms. If the
> > new task has been sleeping for a while it starts at 90ms, if the task
> > had been running lately it doesn't get this bonus again.
>
> what happens if there are lots of such tasks? What limits the total
> bonus?

Every task gets a little more bonus, but it grows quite slowly: each
task gets time_slice/task_cnt more than the previous one (assuming all
tasks are started at the same time), so the 10th gets 2.82*time_slice,
the 100th 5.18*time_slice, the 1000th 7.48*time_slice...
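
That growth is a harmonic sum: the nth task's total bonus is
time_slice * (1/1 + 1/2 + ... + 1/(n-1)), which reproduces the figures
above (my check, in floating point rather than kernel integer math):

```python
# Cumulative sleeper bonus of the nth task started, in units of
# time_slice: each new task gets time_slice/task_cnt more than the one
# before it, i.e. a harmonic sum over the tasks already running.

def total_bonus(n, time_slice=1.0):
    return time_slice * sum(1.0 / k for k in range(1, n))
```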

I've thought about it already and didn't consider it a serious problem;
however, it's solvable if needed by keeping the bonus separate:

avg = avg_exec_runtime(cfs_rq);
se->time_norm = kclock_max(avg - time_slice, se->time_norm);
se->bonus = avg - se->time_norm;
cfs_rq->time_sum += avg;

But the bonus would also have to be undone when dequeueing the task:

cfs_rq->time_sum -= se->time_norm + se->bonus;

> > That would be equivalent to simply clearing wait_runtime in CFS.
>
> so my prototype patch is not an exact map of the nice-0 special-case of
> your code? Would this be the correct thing then perhaps:
>
> + se->exec_runtime =
> + max(avg_exec_runtime(cfs_rq), se->exec_runtime);

You should subtract the desired bonus from the average here, as above,
and possibly also store the given bonus separately.

bye, Roman

2007-09-04 06:30:26

by Ingo Molnar

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler


* Roman Zippel <[email protected]> wrote:

> > > It's a variation of the sleeper bonus. [...]
> >
> > hm, where are its effects described in your explanation? Seems like a
> > key item.
>
> It has no direct effect on the correctness of the mathematical model,
> the time is initialized before the time is added to the model. What
> you're after is the effect on the behaviour, which wasn't my focus, so
> sorry, I didn't mention it.

and what about the mirror image problem? Your math assumes that tasks
use up their full timeslices, somewhere starting at (12):

| (12) time_norm_app = sum_{t}^{T}(time_norm_{t} * weight_{t}) /
| sum_{t}^{T}(weight_{t})
|
| This produces only a approximate normalized time, but if all
| time_norm_{t} are equal (i.e. all tasks got their share), the result
| is the same, thus the error is only temporary.

From here on, the reasoning and the math are flawed, i believe: it
assumes that tasks use up their timeslices - which is rarely the case.

In practical terms: your code causes unfairness to tasks that sleep
when they have used up only a fraction of their slice while they are in
a fairness deficit. For example, consider a task that waited a long time
to get on the CPU and, having gathered a fairness deficit of say 9
milliseconds, finally got there - only to sleep a hundred microseconds
later.

The problem is that when it wakes up and gets back on the runqueue,
your code "forgets" that the task is in "need of fairness" by 9
milliseconds: the task continues as if its previous starvation didn't
happen at all. This effect can cause _far more noise_ and even
systematic starvation than any numeric rounding effects could. (This
could perhaps explain the unfairness Mike reported, and the noisy
hackbench results i'm seeing with your patch - although i'm not certain
about this, i haven't looked into those usecases in detail.)

CFS's current math addresses this problem via the fair-clock metric and
via ->wait_runtime: together they give us a good idea of what happened
to the system while the task slept. With your model, that information is
not maintained at all, and is not regainable. At the moment i don't see
how we could achieve good sleeper behavior with your model: you've
over-simplified the scheduler metrics and we've lost essential
information.
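
The contrast can be put into one toy sketch (mine, simplifying both
models down to nice-0; the function names are made up): CFS carries the
deficit across the sleep in ->wait_runtime, while placing the woken task
at the runnable average discards it:

```python
# A task 9ms behind its fair share sleeps briefly. In the averaging
# model the wakeup key is just the runnable average, so the 9ms deficit
# is gone; CFS keeps ->wait_runtime across the sleep.

def averaging_wakeup_key(rq_runtimes, pre_sleep_runtime):
    # pre_sleep_runtime is intentionally ignored - that is the point:
    # the pre-sleep history does not survive the wakeup.
    return sum(rq_runtimes) // len(rq_runtimes)

def cfs_wakeup_deficit(wait_runtime_before_sleep):
    # CFS remembers how far behind the task was when it went to sleep.
    return wait_runtime_before_sleep
```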

That's why i'm concentrating on the basic scheduling properties first,
without considering nice levels, rounding or optimizations - if that
basic model does not fit then a scheduler cannot work. That's also why
i've asked for a split-up patch series - it makes it far easier to
review and test the code and it makes it far easier to quickly apply the
obviously correct bits.

And i'd like to concur with Tong Li's observation that currently CFS is
a very generic fair scheduler upon which a multitude of scheduling
variants can be built and prototyped (we use a specific one right now,
but it's not cast into stone). The 30-lines-changed patch i sent shows
this property of CFS pretty well - and it already demonstrates the
issues we are talking about here.

Ingo

2007-09-04 11:21:20

by Roman Zippel

Subject: Re: [ANNOUNCE/RFC] Really Simple Really Fair Scheduler

Hi,

On Tue, 4 Sep 2007, Ingo Molnar wrote:

> and what about the mirror image problem?

Sorry, I'm not familiar with that in a scheduler context.

> Your math assumes that tasks
> use up their full timeslices, somewhere starting at (12):
>
> | (12) time_norm_app = sum_{t}^{T}(time_norm_{t} * weight_{t}) /
> | sum_{t}^{T}(weight_{t})
> |
> | This produces only a approximate normalized time, but if all
> | time_norm_{t} are equal (i.e. all tasks got their share), the result
> | is the same, thus the error is only temporary.
>
> From here on, the reasoning and the math are flawed, i believe: it
> assumes that tasks use up their timeslices - which is rarely the case.

I'm not sure what you're referring to. The remaining math basically
only deals with the average calculation and, if we're still within your
simplified model, it's largely irrelevant, because the approximation
error only comes into effect with different weights. This means that for
the common case you can assume my code produces a perfect average.

> In practical terms: your code causes unfairness to tasks that sleep when
> they have used up only a fraction of their slice, when they are in a
> fairness-deficit. For example consider a task that just waited alot to
> get on the CPU, and when it finally got there (gathering a fairness
> deficit of say 9 milliseconds), a hundred microseconds later it sleeps.
>
> The problem is that when it wakes up and gets back on the runqueue your
> code "forgets" that the task is in "need of fairness" by 9 milliseconds:
> the task continues as if its previous starvation didnt happen at all.

Please put this into more context, e.g. demonstrate how this problem
doesn't exist in CFS. My code should largely mirror CFS behaviour,
except that the bonus maybe isn't as big, and except for this:

if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
delta_fair = div64_likely32((u64)delta_fair * load,
load + se->load.weight);


> This effect can cause _far more noise_ and even systematic starvation
> than any numeric rounding effects could cause. (This could perhaps
> explain the unfairness Mike reported, and this could perhaps explain the
> noisy hackbench results i'm seeing with your patch - although i'm not
> certain about this, i havent looked into those usecases in detail.)

The problem Mike reported looks more like a bug; I'm puzzled why the
verify code didn't trigger, as the whole thing was pretty much out of
sync, so in the next version I'll add a few more checks. I'm not
finished with the hackbench results yet.

> CFS's current math addresses this problem via the use of the fair-clock
> metric and via ->wait_runtime: that gives us a good idea what happened
> to the system while the task slept. With your model, that information is
> not maintained at all, and is not regainable. At the moment i dont see
> how we could achieve good sleeper behavior with your model, you've
> over-simplified the scheduler metrics and we've lost essential
> information.

Please describe this sleeper behaviour: what kind of sleeper do you
have in mind, exactly? If we take interactive tasks, for example, these
should be constantly behind other running tasks and thus have an
advantage at getting the CPU; i.e. if a task doesn't use up its time
slice, its runtime value won't advance as much as that of other tasks
which do use their slice.

bye, Roman