Currently the ideal slice length does not take group scheduling into account.
Change it so that it properly takes all the runnable tasks on this cpu into
account and calculate the weight according to the grouping hierarchy.
Also fixes a bug in vslice which missed a factor NICE_0_LOAD.
Signed-off-by: Peter Zijlstra <[email protected]>
CC: Srivatsa Vaddagiri <[email protected]>
---
kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++-----------
1 file changed, 31 insertions(+), 11 deletions(-)
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running);
+ unsigned long nr_running = rq_of(cfs_rq)->nr_running;
+ u64 slice = __sched_period(nr_running);
- slice *= se->load.weight;
- do_div(slice, cfs_rq->load.weight);
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ slice *= se->load.weight;
+ do_div(slice, cfs_rq->load.weight);
+ }
return slice;
}
@@ -344,24 +349,39 @@ static u64 sched_slice(struct cfs_rq *cf
*
* vs = s/w = p/rw
*/
-static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
+static u64 __sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *new)
{
- u64 vslice = __sched_period(nr_running);
+ struct sched_entity *se = cfs_rq->curr;
+ unsigned long nr_running = rq_of(cfs_rq)->nr_running;
+ unsigned long weight = 0;
+ u64 vslice;
+
+ if (new) {
+ nr_running++;
+ weight = new->load.weight;
+ }
- do_div(vslice, rq_weight);
+ vslice = __sched_period(nr_running);
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ vslice *= NICE_0_LOAD;
+ do_div(vslice, cfs_rq->load.weight + weight);
+ weight = 0;
+ }
return vslice;
}
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
+static inline u64 sched_vslice(struct cfs_rq *cfs_rq)
{
- return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
+ return __sched_vslice(cfs_rq, NULL);
}
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *new)
{
- return __sched_vslice(cfs_rq->load.weight + se->load.weight,
- cfs_rq->nr_running + 1);
+ return __sched_vslice(cfs_rq, new);
}
/*
--
On Wed, Oct 31, 2007 at 10:10:32PM +0100, Peter Zijlstra wrote:
> Currently the ideal slice length does not take group scheduling into account.
> Change it so that it properly takes all the runnable tasks on this cpu into
> account and caluclate the weight according to the grouping hierarchy.
>
> Also fixes a bug in vslice which missed a factor NICE_0_LOAD.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> CC: Srivatsa Vaddagiri <[email protected]>
> ---
> kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++-----------
> 1 file changed, 31 insertions(+), 11 deletions(-)
>
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long
> */
> static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - u64 slice = __sched_period(cfs_rq->nr_running);
> + unsigned long nr_running = rq_of(cfs_rq)->nr_running;
> + u64 slice = __sched_period(nr_running);
>
> - slice *= se->load.weight;
> - do_div(slice, cfs_rq->load.weight);
> + for_each_sched_entity(se) {
> + cfs_rq = cfs_rq_of(se);
> +
> + slice *= se->load.weight;
> + do_div(slice, cfs_rq->load.weight);
> + }
>
> return slice;
Lets say we have two groups A and B on CPU0, of equal weight (1024).
Further,
A has 1 task (A0)
B has 1000 tasks (B0 .. B999)
Agreed, it's an extreme case, but it illustrates the problem I have in mind
for this patch.
All tasks of same weight=1024.
Before this patch
=================
sched_slice(grp A) = 20ms * 1/2 = 10ms
sched_slice(A0) = 20ms
sched_slice(grp B) = 20ms * 1/2 = 10ms
sched_slice(B0) = (20ms * 1000/20) * 1 / 1000 = 1ms
sched_slice(B1) = ... = sched_slice(B999) = 1 ms
Fairness between groups and tasks would be obtained as below:
A0 B0-B9 A0 B10-B19 A0 B20-B29
|--------|--------|--------|--------|--------|--------|-----//--|
0 10ms 20ms 30ms 40ms 50ms 60ms
After this patch
================
sched_slice(grp A) = (20ms * 1001/20) * 1/2 ~= 500ms
sched_slice(A0) = 500ms
sched_slice(grp B) = 500ms
sched_slice(B0) = 0.5ms
Fairness between groups and tasks would be obtained as below:
A0 B0 - B99 A0
|-----------------------|-----------------------|-----------------------|
0 500ms 1000ms 1500ms
Did I get it right? If so, I don't like the fact that group A is allowed to run
for a long time (500ms) before giving chance to group B.
Can I know what real problem is being addressed by this change to
sched_slice()?
--
Regards,
vatsa
On Thu, 2007-11-01 at 17:01 +0530, Srivatsa Vaddagiri wrote:
> On Wed, Oct 31, 2007 at 10:10:32PM +0100, Peter Zijlstra wrote:
> > Currently the ideal slice length does not take group scheduling into account.
> > Change it so that it properly takes all the runnable tasks on this cpu into
> > account and caluclate the weight according to the grouping hierarchy.
> >
> > Also fixes a bug in vslice which missed a factor NICE_0_LOAD.
> >
> > Signed-off-by: Peter Zijlstra <[email protected]>
> > CC: Srivatsa Vaddagiri <[email protected]>
> > ---
> > kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++-----------
> > 1 file changed, 31 insertions(+), 11 deletions(-)
> >
> > Index: linux-2.6/kernel/sched_fair.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/sched_fair.c
> > +++ linux-2.6/kernel/sched_fair.c
> > @@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long
> > */
> > static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > - u64 slice = __sched_period(cfs_rq->nr_running);
> > + unsigned long nr_running = rq_of(cfs_rq)->nr_running;
> > + u64 slice = __sched_period(nr_running);
> >
> > - slice *= se->load.weight;
> > - do_div(slice, cfs_rq->load.weight);
> > + for_each_sched_entity(se) {
> > + cfs_rq = cfs_rq_of(se);
> > +
> > + slice *= se->load.weight;
> > + do_div(slice, cfs_rq->load.weight);
> > + }
> >
> > return slice;
>
>
> Lets say we have two groups A and B on CPU0, of equal weight (1024).
>
> Further,
>
> A has 1 task (A0)
> B has 1000 tasks (B0 .. B999)
>
> Agreed its a extreme case, but illustrates the problem I have in mind
> for this patch.
>
> All tasks of same weight=1024.
>
> Before this patch
> =================
>
> sched_slice(grp A) = 20ms * 1/2 = 10ms
> sched_slice(A0) = 20ms
>
> sched_slice(grp B) = 20ms * 1/2 = 10ms
> sched_slice(B0) = (20ms * 1000/20) * 1 / 1000 = 1ms
> sched_slice(B1) = ... = sched_slice(B99) = 1 ms
>
> Fairness between groups and tasks would be obtained as below:
>
> A0 B0-B9 A0 B10-B19 A0 B20-B29
> |--------|--------|--------|--------|--------|--------|-----//--|
> 0 10ms 20ms 30ms 40ms 50ms 60ms
>
> After this patch
> ================
>
> sched_slice(grp A) = (20ms * 1001/20) * 1/2 ~= 500ms
> sched_slice(A0) = 500ms
Hmm, right that is indeed not intended
> sched_slice(grp B) = 500ms
> sched_slice(B0) = 0.5ms
This 0.5 is indeed correct, whereas the previous 1ms was not
> Fairness between groups and tasks would be obtained as below:
>
> A0 B0 - B99 A0
> |-----------------------|-----------------------|-----------------------|
> 0 500ms 1000ms 1500ms
>
> Did I get it right? If so, I don't like the fact that group A is allowed to run
> for a long time (500ms) before giving chance to group B.
Hmm, quite bad indeed.
> Can I know what real problem is being addressed by this change to
> sched_slice()?
sched_slice() is about latency; its intended purpose is to ensure each
task is run exactly once during sched_period() - which is
sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
otherwise linearly scales latency.
On Thu, 2007-11-01 at 12:51 +0100, Peter Zijlstra wrote:
> On Thu, 2007-11-01 at 17:01 +0530, Srivatsa Vaddagiri wrote:
> > On Wed, Oct 31, 2007 at 10:10:32PM +0100, Peter Zijlstra wrote:
> > > Currently the ideal slice length does not take group scheduling into account.
> > > Change it so that it properly takes all the runnable tasks on this cpu into
> > > account and caluclate the weight according to the grouping hierarchy.
> > >
> > > Also fixes a bug in vslice which missed a factor NICE_0_LOAD.
> > >
> > > Signed-off-by: Peter Zijlstra <[email protected]>
> > > CC: Srivatsa Vaddagiri <[email protected]>
> > > ---
> > > kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++-----------
> > > 1 file changed, 31 insertions(+), 11 deletions(-)
> > >
> > > Index: linux-2.6/kernel/sched_fair.c
> > > ===================================================================
> > > --- linux-2.6.orig/kernel/sched_fair.c
> > > +++ linux-2.6/kernel/sched_fair.c
> > > @@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long
> > > */
> > > static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > > {
> > > - u64 slice = __sched_period(cfs_rq->nr_running);
> > > + unsigned long nr_running = rq_of(cfs_rq)->nr_running;
> > > + u64 slice = __sched_period(nr_running);
> > >
> > > - slice *= se->load.weight;
> > > - do_div(slice, cfs_rq->load.weight);
> > > + for_each_sched_entity(se) {
> > > + cfs_rq = cfs_rq_of(se);
> > > +
> > > + slice *= se->load.weight;
> > > + do_div(slice, cfs_rq->load.weight);
> > > + }
> > >
> > > return slice;
> >
> >
> > Lets say we have two groups A and B on CPU0, of equal weight (1024).
> >
> > Further,
> >
> > A has 1 task (A0)
> > B has 1000 tasks (B0 .. B999)
> >
> > Agreed its a extreme case, but illustrates the problem I have in mind
> > for this patch.
> >
> > All tasks of same weight=1024.
> >
> > Before this patch
> > =================
> >
> > sched_slice(grp A) = 20ms * 1/2 = 10ms
> > sched_slice(A0) = 20ms
> >
> > sched_slice(grp B) = 20ms * 1/2 = 10ms
> > sched_slice(B0) = (20ms * 1000/20) * 1 / 1000 = 1ms
> > sched_slice(B1) = ... = sched_slice(B99) = 1 ms
> >
> > Fairness between groups and tasks would be obtained as below:
> >
> > A0 B0-B9 A0 B10-B19 A0 B20-B29
> > |--------|--------|--------|--------|--------|--------|-----//--|
> > 0 10ms 20ms 30ms 40ms 50ms 60ms
> >
> > After this patch
> > ================
> >
> > sched_slice(grp A) = (20ms * 1001/20) * 1/2 ~= 500ms
> > sched_slice(A0) = 500ms
>
> Hmm, right that is indeed not intended
>
> > sched_slice(grp B) = 500ms
> > sched_slice(B0) = 0.5ms
>
> This 0.5 is indeed correct, whereas the previous 1ms was not
>
> > Fairness between groups and tasks would be obtained as below:
> >
> > A0 B0 - B99 A0
> > |-----------------------|-----------------------|-----------------------|
> > 0 500ms 1000ms 1500ms
> >
> > Did I get it right? If so, I don't like the fact that group A is allowed to run
> > for a long time (500ms) before giving chance to group B.
>
> Hmm, quite bad indeed.
hmm, then again, with 1001 tasks running, that is exactly what should
happen.
> > Can I know what real problem is being addressed by this change to
> > sched_slice()?
>
> sched_slice() is about lantecy, its intended purpose is to ensure each
> task is ran exactly once during sched_period() - which is
> sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> otherwise linearly scales latency.
>
>
On Thu, 2007-11-01 at 12:58 +0100, Peter Zijlstra wrote:
> > sched_slice() is about lantecy, its intended purpose is to ensure each
> > task is ran exactly once during sched_period() - which is
> > sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> > otherwise linearly scales latency.
The thing that got my brain in a twist is what to do about the non-leaf
nodes, for those it seems I'm not doing the right thing - I think.
On Thu, 2007-11-01 at 13:03 +0100, Peter Zijlstra wrote:
> On Thu, 2007-11-01 at 12:58 +0100, Peter Zijlstra wrote:
>
> > > sched_slice() is about lantecy, its intended purpose is to ensure each
> > > task is ran exactly once during sched_period() - which is
> > > sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> > > otherwise linearly scales latency.
>
> The thing that got my brain in a twist is what to do about the non-leaf
> nodes, for those it seems I'm not doing the right thing - I think.
Ok, suppose a tree like so:
level 2 cfs_rq
A B
level 1 cfs_rqA cfs_rqB
A0 B0 - B99
So for the sake of determinism, we want to impose a period in which all
level 1 tasks will have run (at least) once.
Now what sched_slice() does is calculate the weighted proportion of the
given period for each task to run, so that each task runs exactly once.
Now level 2, can introduce these large weight differences, which in this
case result in 'lumps' of time.
In the given example above the weight difference is 1:100, which is
already at the edges of what regular nice levels could do.
How about limiting the max output of sched_slice() to
sysctl_sched_latency in order to break up these large stretches of time?
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -341,7 +341,7 @@ static u64 sched_slice(struct cfs_rq *cf
do_div(slice, cfs_rq->load.weight);
}
- return slice;
+ return min_t(u64, sysctl_sched_latency, slice);
}
/*
On Thu, Nov 01, 2007 at 01:20:08PM +0100, Peter Zijlstra wrote:
> On Thu, 2007-11-01 at 13:03 +0100, Peter Zijlstra wrote:
> > On Thu, 2007-11-01 at 12:58 +0100, Peter Zijlstra wrote:
> >
> > > > sched_slice() is about lantecy, its intended purpose is to ensure each
> > > > task is ran exactly once during sched_period() - which is
> > > > sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> > > > otherwise linearly scales latency.
> >
> > The thing that got my brain in a twist is what to do about the non-leaf
> > nodes, for those it seems I'm not doing the right thing - I think.
>
> Ok, suppose a tree like so:
>
>
> level 2 cfs_rq
> A B
>
> level 1 cfs_rqA cfs_rqB
> A0 B0 - B99
>
>
> So for sake of determinism, we want to impose a period in which all
> level 1 tasks will have ran (at least) once.
Peter,
I fail to see why this requirement to "determine a period in
which all level 1 tasks will have ran (at least) once" is essential.
I am visualizing each of the groups to be similar to Xen-like partitions
which are given fair timeslices by the hypervisor (Linux kernel in this
case). How each partition (group in this case) manages the allocated
timeslice(s) to provide fairness to tasks within that partition/group should not
(IMHO) depend on other groups and esp. how many tasks other groups has.
For ex: before this patch, fair time would be allocated to group and
their tasks as below:
A0 B0-B9 A0 B10-B19 A0 B20-B29
|--------|--------|--------|--------|--------|--------|-----//--|
0 10ms 20ms 30ms 40ms 50ms 60ms
i.e during the first 10ms allocated to group B, B0-B9 run,
during the next 10ms allocated to group B, B10-B19 run etc
What's wrong with this scheme?
By letting __sched_period() be determined for each group independently,
we are building stronger isolation between them, which is good IMO
(imagine a rogue container that does a fork bomb).
> Now what sched_slice() does is calculate the weighted proportion of the
> given period for each task to run, so that each task runs exactly once.
>
> Now level 2, can introduce these large weight differences, which in this
> case result in 'lumps' of time.
>
> In the given example above the weight difference is 1:100, which is
> already at the edges of what regular nice levels could do.
>
> How about limiting the max output of sched_slice() to
> sysctl_sched_latency in order to break up these large stretches of time?
>
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -341,7 +341,7 @@ static u64 sched_slice(struct cfs_rq *cf
> do_div(slice, cfs_rq->load.weight);
> }
>
> - return slice;
> + return min_t(u64, sysctl_sched_latency, slice);
Hmm, going back to the previous example I cited, this will lead to:
sched_slice(grp A) = min(20ms, 500ms) = 20ms
sched_slice(A0) = min(20ms, 500ms) = 20ms
sched_slice(grp B) = min(20ms, 500ms) = 20ms
sched_slice(B0) = min(20ms, 0.5ms) = 0.5ms
Fairness between groups and tasks would be obtained as below:
A0 B0-B39 A0 B40-B79 A0
|--------|--------|--------|--------|--------|
0 20ms 40ms 60ms 80ms
which seems to be more or less giving what we already have w/o the
patch?
--
Regards,
vatsa
On Thu, 2007-11-01 at 22:01 +0530, Srivatsa Vaddagiri wrote:
> On Thu, Nov 01, 2007 at 01:20:08PM +0100, Peter Zijlstra wrote:
> > On Thu, 2007-11-01 at 13:03 +0100, Peter Zijlstra wrote:
> > > On Thu, 2007-11-01 at 12:58 +0100, Peter Zijlstra wrote:
> > >
> > > > > sched_slice() is about lantecy, its intended purpose is to ensure each
> > > > > task is ran exactly once during sched_period() - which is
> > > > > sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> > > > > otherwise linearly scales latency.
> > >
> > > The thing that got my brain in a twist is what to do about the non-leaf
> > > nodes, for those it seems I'm not doing the right thing - I think.
> >
> > Ok, suppose a tree like so:
> >
> >
> > level 2 cfs_rq
> > A B
> >
> > level 1 cfs_rqA cfs_rqB
> > A0 B0 - B99
> >
> >
> > So for sake of determinism, we want to impose a period in which all
> > level 1 tasks will have ran (at least) once.
>
> Peter,
> I fail to see why this requirement to "determine a period in
> which all level 1 tasks will have ran (at least) once" is essential.
Because it gives a steady feel to things. For humans it's most essential
that things run in a predictable fashion. So not only does it matter how
much time a task gets, it also very much matters when it gets that.
It contributes to the feeling of gradual slow down.
> I am visualizing each of the groups to be similar to Xen-like partitions
> which are given fair timeslices by the hypervisor (Linux kernel in this
> case). How each partition (group in this case) manages the allocated
> timeslice(s) to provide fairness to tasks within that partition/group should not
> (IMHO) depend on other groups and esp. how many tasks other groups has.
Agreed, I've realised this since my last mail, one group should not
influence another in such a fashion, in this respect you don't want to
flatten it like I did.
> For ex: before this patch, fair time would be allocated to group and
> their tasks as below:
>
> A0 B0-B9 A0 B10-B19 A0 B20-B29
> |--------|--------|--------|--------|--------|--------|-----//--|
> 0 10ms 20ms 30ms 40ms 50ms 60ms
>
> i.e during the first 10ms allocated to group B, B0-B9 run,
> during the next 10ms allocated to group B, B10-B19 run etc
>
> What's wrong with this scheme?
What made me start tinkering here is that the nested level is again
distributing wall-time without being aware of the fraction it gets from
the upper levels.
So if we have two groups, A and B, and B is selected for 1/2 of period,
then Bn will again divide period, even though in reality it will only
have p/2.
So I guess, I need a top down traversal, not a bottom up traversal to
get this fixed up... I'll ponder this.
> By letting __sched_period() be determined for each group independently,
> we are building stronger isolation between them, which is good IMO
> (imagine a rogue container that does a fork bomb).
Agreed.
> > Index: linux-2.6/kernel/sched_fair.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/sched_fair.c
> > +++ linux-2.6/kernel/sched_fair.c
> > @@ -341,7 +341,7 @@ static u64 sched_slice(struct cfs_rq *cf
> > do_div(slice, cfs_rq->load.weight);
> > }
> >
> > - return slice;
> > + return min_t(u64, sysctl_sched_latency, slice);
> which seems to be more or less giving what we already have w/o the
> patch?
Well, it's basically giving up on overload, admittedly not very nice.