Index: GIT-warnings/fs/nfs/inode.c
===================================================================
--- GIT-warnings.orig/fs/nfs/inode.c 2005-12-21 16:22:09.000000000 +1100
+++ GIT-warnings/fs/nfs/inode.c 2005-12-21 16:22:11.000000000 +1100
@@ -937,7 +937,8 @@ static int nfs_wait_on_inode(struct inod
rpc_clnt_sigmask(clnt, &oldmask);
error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
- nfs_wait_schedule, TASK_INTERRUPTIBLE);
+ nfs_wait_schedule,
+ TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
rpc_clnt_sigunmask(clnt, &oldmask);
return error;
Index: GIT-warnings/fs/nfs/nfs4proc.c
===================================================================
--- GIT-warnings.orig/fs/nfs/nfs4proc.c 2005-12-21 16:22:09.000000000 +1100
+++ GIT-warnings/fs/nfs/nfs4proc.c 2005-12-21 16:22:11.000000000 +1100
@@ -2547,7 +2547,7 @@ static int nfs4_wait_clnt_recover(struct
rpc_clnt_sigmask(clnt, &oldset);
interruptible = TASK_UNINTERRUPTIBLE;
if (clnt->cl_intr)
- interruptible = TASK_INTERRUPTIBLE;
+ interruptible = TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE;
prepare_to_wait(&clp->cl_waitq, &wait, interruptible);
nfs4_schedule_state_recovery(clp);
if (clnt->cl_intr && signalled())
Index: GIT-warnings/fs/nfs/pagelist.c
===================================================================
--- GIT-warnings.orig/fs/nfs/pagelist.c 2005-12-21 16:22:09.000000000 +1100
+++ GIT-warnings/fs/nfs/pagelist.c 2005-12-21 16:22:11.000000000 +1100
@@ -210,7 +210,8 @@ nfs_wait_on_request(struct nfs_page *req
*/
rpc_clnt_sigmask(clnt, &oldmask);
ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
- nfs_wait_bit_interruptible, TASK_INTERRUPTIBLE);
+ nfs_wait_bit_interruptible,
+ TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
rpc_clnt_sigunmask(clnt, &oldmask);
out:
return ret;
Index: GIT-warnings/fs/nfs/write.c
===================================================================
--- GIT-warnings.orig/fs/nfs/write.c 2005-12-21 16:22:09.000000000 +1100
+++ GIT-warnings/fs/nfs/write.c 2005-12-21 16:22:11.000000000 +1100
@@ -595,7 +595,8 @@ static int nfs_wait_on_write_congestion(
sigset_t oldset;
rpc_clnt_sigmask(clnt, &oldset);
- prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&nfs_write_congestion, &wait,
+ TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
if (bdi_write_congested(bdi)) {
if (signalled())
ret = -ERESTARTSYS;
Index: GIT-warnings/net/sunrpc/sched.c
===================================================================
--- GIT-warnings.orig/net/sunrpc/sched.c 2005-12-21 16:22:09.000000000 +1100
+++ GIT-warnings/net/sunrpc/sched.c 2005-12-21 16:22:11.000000000 +1100
@@ -659,7 +659,7 @@ static int __rpc_execute(struct rpc_task
/* Note: Caller should be using rpc_clnt_sigmask() */
status = out_of_line_wait_on_bit(&task->tk_runstate,
RPC_TASK_QUEUED, rpc_wait_bit_interruptible,
- TASK_INTERRUPTIBLE);
+ TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
if (status == -ERESTARTSYS) {
/*
* When a sync task receives a signal, it exits with
Index: GIT-warnings/net/sunrpc/svcsock.c
===================================================================
--- GIT-warnings.orig/net/sunrpc/svcsock.c 2005-12-21 16:22:09.000000000 +1100
+++ GIT-warnings/net/sunrpc/svcsock.c 2005-12-21 16:22:11.000000000 +1100
@@ -1213,7 +1213,7 @@ svc_recv(struct svc_serv *serv, struct s
* We have to be able to interrupt this wait
* to bring down the daemons ...
*/
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
add_wait_queue(&rqstp->rq_wait, &wait);
spin_unlock_bh(&serv->sv_lock);
On Wed, 2005-12-21 at 17:00 +1100, Peter Williams wrote:
> This patch addresses the adverse effect that the NFS client can have on
> interactive response when CPU bound tasks (such as a kernel build)
> operate on files mounted via NFS. (NB It is emphasized that this has
> nothing to do with the effects of interactive tasks accessing NFS
> mounted files themselves.)
>
> The problem occurs because tasks accessing NFS mounted files for data
> can undergo quite a lot of TASK_INTERRUPTIBLE sleep depending on the
> load on the server and the quality of the network connection. This can
> result in these tasks getting quite high values for sleep_avg and
> consequently a large priority bonus. On the system where I noticed this
> problem they were getting the full 10 bonus points and being given the
> same dynamic priority as genuine interactive tasks such as the X server
> and Rhythmbox.
>
> The solution to this problem is to use TASK_NONINTERACTIVE to tell the
> scheduler that the TASK_INTERRUPTIBLE sleeps in the NFS client and
> SUNRPC are NOT interactive sleeps.
Sorry. That theory is just plain wrong. ALL of those cases _ARE_
interactive sleeps.
Cheers,
Trond
Trond Myklebust wrote:
> On Wed, 2005-12-21 at 17:00 +1100, Peter Williams wrote:
>
>>This patch addresses the adverse effect that the NFS client can have on
>>interactive response when CPU bound tasks (such as a kernel build)
>>operate on files mounted via NFS. (NB It is emphasized that this has
>>nothing to do with the effects of interactive tasks accessing NFS
>>mounted files themselves.)
>>
>>The problem occurs because tasks accessing NFS mounted files for data
>>can undergo quite a lot of TASK_INTERRUPTIBLE sleep depending on the
>>load on the server and the quality of the network connection. This can
>>result in these tasks getting quite high values for sleep_avg and
>>consequently a large priority bonus. On the system where I noticed this
>>problem they were getting the full 10 bonus points and being given the
>>same dynamic priority as genuine interactive tasks such as the X server
>>and Rhythmbox.
>>
>>The solution to this problem is to use TASK_NONINTERACTIVE to tell the
>>scheduler that the TASK_INTERRUPTIBLE sleeps in the NFS client and
>>SUNRPC are NOT interactive sleeps.
>
>
> Sorry. That theory is just plain wrong. ALL of those cases _ARE_
> interactive sleeps.
It's not a theory. It's a result of observing a -j 16 build with the
sources on an NFS mounted file system with top with and without the
patches and comparing that with the same builds with the sources on a
local file system. Without the patches the tasks in the kernel build
all get the same dynamic priority as the X server and other interactive
programs when the sources are on an NFS mounted file system. With the
patches they generally have dynamic priorities between 6 to 10 higher
than the X server and other interactive programs.
In both cases, when the build is run on a source on a local file system
the kernel build tasks all have dynamic priorities 6 to 10 higher than
the X server and other interactive programs.
In all cases, the dynamic priorities of the X server and other
interactive programs are the same.
In the testing that I have done so far the patch has not resulted in any
genuine interactive tasks not being identified as interactive.
Peter
PS There's a difference between interruptible and interactive in that,
while all interactive sleeps will be interruptible, not all interruptible
sleeps are interactive. Ingo introduced TASK_NONINTERACTIVE to enable
this distinction to be made.
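For reference, the bonus in question comes from sleep_avg via
effective_prio() in kernel/sched.c. A simplified, from-memory sketch (the
constants and helpers are the scheduler's own; treat the details as
illustrative rather than authoritative):

/* How sleep_avg becomes the dynamic-priority bonus (simplified sketch). */
#define MAX_BONUS	10
#define CURRENT_BONUS(p) \
	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / MAX_SLEEP_AVG)

static int effective_prio(task_t *p)
{
	int bonus, prio;

	if (rt_task(p))
		return p->prio;

	/* ranges from -5 to +5 around static_prio */
	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;

	prio = p->static_prio - bonus;
	if (prio < MAX_RT_PRIO)
		prio = MAX_RT_PRIO;
	if (prio > MAX_PRIO - 1)
		prio = MAX_PRIO - 1;
	return prio;
}

A task whose interruptible sleeps keep sleep_avg pegged at the maximum ends
up a full MAX_BONUS slots better placed than a pure CPU hog, which is the
gap reported above.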
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Wed, 2005-12-21 at 17:32 +1100, Peter Williams wrote:
> > Sorry. That theory is just plain wrong. ALL of those cases _ARE_
> > interactive sleeps.
>
> It's not a theory. It's a result of observing a -j 16 build with the
> sources on an NFS mounted file system with top with and without the
> patches and comparing that with the same builds with the sources on a
> local file system. Without the patches the tasks in the kernel build
> all get the same dynamic priority as the X server and other interactive
> programs when the sources are on an NFS mounted file system. With the
> patches they generally have dynamic priorities between 6 to 10 higher
> than the X server and other interactive programs.
...and if you stick in a faster server?...
There is _NO_ fundamental difference between NFS and a local filesystem
that warrants marking one as "interactive" and the other as
"noninteractive". What you are basically saying is that all I/O should
be marked as TASK_NONINTERACTIVE.
Cheers,
Trond
On Dec 21, 2005, at 08:21, Trond Myklebust wrote:
> ...and if you stick in a faster server?...
>
> There is _NO_ fundamental difference between NFS and a local
> filesystem that warrants marking one as "interactive" and the other
> as "noninteractive". What you are basically saying is that all I/O
> should be marked as TASK_NONINTERACTIVE.
Uhh, what part of disk/NFS/filesystem access is "interactive"? Which
of those sleeps directly involve responding to user-interface
events? _That_ is the whole point of the interactivity bonus, and
precisely why Ingo introduced TASK_NONINTERACTIVE sleeps; so that
processes that are not being useful for interactivity could be moved
away from TASK_NONINTERRUPTABLE, with the end result that the X-
server could be run at priority 0 without harming interactivity, even
during heavy *disk*, *NFS*, and *network* activity. Admittedly, that
may not be what some people want, but they're welcome to turn off the
interactivity bonuses via some file in /proc (sorry, don't remember
which at the moment).
Cheers,
Kyle Moffett
--
I have yet to see any problem, however complicated, which, when you
looked at it in the right way, did not become still more complicated.
-- Poul Anderson
On Wed, 2005-12-21 at 08:36 -0500, Kyle Moffett wrote:
> On Dec 21, 2005, at 08:21, Trond Myklebust wrote:
> > ...and if you stick in a faster server?...
> >
> > There is _NO_ fundamental difference between NFS and a local
> > filesystem that warrants marking one as "interactive" and the other
> > as "noninteractive". What you are basically saying is that all I/O
> > should be marked as TASK_NONINTERACTIVE.
>
> Uhh, what part of disk/NFS/filesystem access is "interactive"? Which
> of those sleeps directly involve responding to user-interface
> events? _That_ is the whole point of the interactivity bonus, and
> precisely why Ingo introduced TASK_NONINTERACTIVE sleeps; so that
> processes that are not being useful for interactivity could be moved
> away from TASK_UNINTERRUPTIBLE, with the end result that the X-
> server could be run at priority 0 without harming interactivity, even
> during heavy *disk*, *NFS*, and *network* activity. Admittedly, that
> may not be what some people want, but they're welcome to turn off the
> interactivity bonuses via some file in /proc (sorry, don't remember
> which at the moment).
Then have io_schedule() automatically set that flag, and convert NFS to
use io_schedule(), or something along those lines. I don't want a bunch
of RT-specific flags littering the NFS/RPC code.
Cheers,
Trond
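A hypothetical sketch of that suggestion, for concreteness. Mainline
io_schedule() only does the nr_iowait accounting around schedule(); the two
lines touching current->state are the speculative part:

/*
 * Hypothetical: let the generic I/O sleep helper mark the sleep as
 * non-interactive itself, so filesystems never see the flag.
 */
void __sched io_schedule(void)
{
	struct runqueue *rq = this_rq();

	if (current->state & TASK_INTERRUPTIBLE)
		current->state |= TASK_NONINTERACTIVE;

	atomic_inc(&rq->nr_iowait);
	schedule();
	atomic_dec(&rq->nr_iowait);
}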
Kyle Moffett <[email protected]> wrote:
> On Dec 21, 2005, at 08:21, Trond Myklebust wrote:
> > ...and if you stick in a faster server?...
> > There is _NO_ fundamental difference between NFS and a local
> > filesystem that warrants marking one as "interactive" and the other
> > as "noninteractive". What you are basically saying is that all I/O
> > should be marked as TASK_NONINTERACTIVE.
> Uhh, what part of disk/NFS/filesystem access is "interactive"? Which
> of those sleeps directly involve responding to user-interface events?
And if it is a user waiting for the data to display? Can't distinguish that
so easily from the compiler waiting for something to do...
--
Dr. Horst H. von Brand User #22616 counter.li.org
Departamento de Informatica Fono: +56 32 654431
Universidad Tecnica Federico Santa Maria +56 32 654239
Casilla 110-V, Valparaiso, Chile Fax: +56 32 797513
* Peter Williams <[email protected]> wrote:
> It's not a theory. It's a result of observing a -j 16 build with the
> sources on an NFS mounted file system with top with and without the
> patches and comparing that with the same builds with the sources on a
> local file system. [...]
could you try the build with the scheduler queue from -mm, and set the
shell to SCHED_BATCH first? Do you still see interactivity problems
after that?
i'm not sure we want to override the scheduling patterns observed by the
kernel, via TASK_NONINTERACTIVE - apart from a few obvious cases.
Ingo
On Dec 21, 2005, at 11:10, Horst von Brand wrote:
> Kyle Moffett <[email protected]> wrote:
>> On Dec 21, 2005, at 08:21, Trond Myklebust wrote:
>>> ...and if you stick in a faster server?...
>>> There is _NO_ fundamental difference between NFS and a local
>>> filesystem that warrants marking one as "interactive" and the
>>> other as "noninteractive". What you are basically saying is that
>>> all I/O should be marked as TASK_NONINTERACTIVE.
>>
>> Uhh, what part of disk/NFS/filesystem access is "interactive"?
>> Which of those sleeps directly involve responding to user-
>> interface events?
>
> And if it is a user waiting for the data to display? Can't
> distinguish that so easily from the compiler waiting for something
> to do...
No, but in that case the program probably _already_ has some
interactivity bonus just from user interaction. On the other hand,
UI programming guidelines say that any task which might take more
than a half-second or so should not be run in the event loop, but in
a separate thread (either a drawing thread or similar). In that
case, your event loop thread is the one with the interactivity bonus,
and the others are just data processing threads (like the compile you
have running in the background or the webserver responding to HTTP
requests), that the user would need to manually arbitrate between
with nice levels.
The whole point of the interactivity bonus was that processes that
follow the cycle <waiting-for-input> => <respond-to-input-for-less-
than-time-quantum> => <waiting-for-input> would get a boost; things
like dragging a window or handling mouse or keyboard events should
happen within a small number of milliseconds, whereas background
tasks really _don't_ care if they are delayed running their time
quantum by 400ms, as long as they get their full quantum during each
cycle.
Cheers,
Kyle Moffett
--
Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are, by
definition, not smart enough to debug it.
-- Brian Kernighan
Ingo Molnar wrote:
> * Peter Williams <[email protected]> wrote:
>
>
>>It's not a theory. It's a result of observing a -j 16 build with the
>>sources on an NFS mounted file system with top with and without the
>>patches and comparing that with the same builds with the sources on a
>>local file system. [...]
>
>
> could you try the build with the scheduler queue from -mm, and set the
> shell to SCHED_BATCH first? Do you still see interactivity problems
> after that?
There's no real point in doing such a test as running the build as
SCHED_BATCH would obviously prevent its tasks from getting any
interactive bonus. So I'll concede that is a solution.
However, the problem I see with this solution is that it pushes the
onus onto the user, forcing them to decide/remember to run
non-interactive tasks as SCHED_BATCH (and I see the whole point of the
scheduler's interactive-responsiveness embellishments as being to free
the user from the need to worry about these things). It's a marginally
better solution than its complement, i.e. marking interactive tasks as
such by putting them in a (hypothetical) SCHED_IA class, because that
would clearly have to be a privileged operation, unlike setting
SCHED_BATCH.
This is a case where the PAGG patches would have been useful. With them,
a mechanism for monitoring exec()s and shifting programs to SCHED_BATCH
based on what program they had just exec()ed would be possible, making
SCHED_BATCH a better solution to this problem. If PAGG were
complemented with a kernel-to-user-space event notification mechanism,
the bulk of this could be accomplished in user space. Perhaps the new
code SGI is proposing as an alternative to PAGG meets these requirements?
>
> i'm not sure we want to override the scheduling patterns observed by the
> kernel, via TASK_NONINTERACTIVE - apart from a few obvious cases.
I thought that this was one of the obvious cases, i.e. interruptible
sleeps that clearly aren't interactive.
I interpreted your statement "Right now only pipe_wait() will make use
of it, because it's a common source of not-so-interactive waits (kernel
compilation jobs, etc.)." in the original announcement of
TASK_NONINTERACTIVE to mean that it was a "work in progress" and would be
used more extensively as other places for its application were identified.
BTW I don't think that it should be blindly applied to all file system
code, as I tried that and it resulted in the X server not getting any
interactive bonus, with obvious consequences :-(. I think that use of
TASK_NONINTERACTIVE should be done carefully and tested to make sure
that it has no unexpected scheduling implications (and I think that this
is such a case). Provided the TASK_XXX flags are always treated as such,
there should be no change to the semantics or efficiency of any code
other than the scheduler's as a result of its use; after all, it's just
an extra bit in an integer constant set at compile time.
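To put a number on the "extra bit": the task state flags in
include/linux/sched.h around this time are, from memory and so only
roughly,

#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1
#define TASK_UNINTERRUPTIBLE	2
#define TASK_STOPPED		4
#define TASK_TRACED		8
#define EXIT_ZOMBIE		16
#define EXIT_DEAD		32
#define TASK_NONINTERACTIVE	64

so a sleeper that passes TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE is, to
everything except the scheduler, simply in state TASK_INTERRUPTIBLE with
one extra bit set.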
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
Kyle Moffett wrote:
> On Dec 21, 2005, at 11:10, Horst von Brand wrote:
>
>> Kyle Moffett <[email protected]> wrote:
>>
>>> On Dec 21, 2005, at 08:21, Trond Myklebust wrote:
>>>
>>>> ...and if you stick in a faster server?...
>>>> There is _NO_ fundamental difference between NFS and a local
>>>> filesystem that warrants marking one as "interactive" and the other
>>>> as "noninteractive". What you are basically saying is that all I/O
>>>> should be marked as TASK_NONINTERACTIVE.
>>>
>>>
>>> Uhh, what part of disk/NFS/filesystem access is "interactive"?
>>> Which of those sleeps directly involve responding to user- interface
>>> events?
>>
>>
>> And if it is a user waiting for the data to display? Can't
>> distinguish that so easily from the compiler waiting for something to
>> do...
>
>
> No, but in that case the program probably _already_ has some
> interactivity bonus just from user interaction.
And if it doesn't then it is (by definition) not interactive. :-)
As you imply, this change is targeting those tasks whose ONLY
interruptible sleeps are due to NFS use.
> On the other hand, UI
> programming guidelines say that any task which might take more than a
> half-second or so should not be run in the event loop, but in a
> separate thread (either a drawing thread or similar). In that case,
> your event loop thread is the one with the interactivity bonus, and the
> others are just data processing threads (like the compile you have
> running in the background or the webserver responding to HTTP
> requests), that the user would need to manually arbitrate between with
> nice levels.
>
> The whole point of the interactivity bonus was that processes that
> follow the cycle <waiting-for-input> => <respond-to-input-for-less-
> than-time-quantum> => <waiting-for-input> would get a boost; things
> like dragging a window or handling mouse or keyboard events should
> happen within a small number of milliseconds, whereas background tasks
> really _don't_ care if they are delayed running their time quantum by
> 400ms, as long as they get their full quantum during each cycle.
Exactly. It's all about latency and doesn't really affect the
allocation of CPU resources according to niceness, as that is handled via
differential time-slice allocations.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
Trond Myklebust wrote:
> On Wed, 2005-12-21 at 08:36 -0500, Kyle Moffett wrote:
>
>>On Dec 21, 2005, at 08:21, Trond Myklebust wrote:
>>
>>>...and if you stick in a faster server?...
>>>
>>>There is _NO_ fundamental difference between NFS and a local
>>>filesystem that warrants marking one as "interactive" and the other
>>>as "noninteractive". What you are basically saying is that all I/O
>>>should be marked as TASK_NONINTERACTIVE.
>>
>>Uhh, what part of disk/NFS/filesystem access is "interactive"? Which
>>of those sleeps directly involve responding to user-interface
>>events? _That_ is the whole point of the interactivity bonus, and
>>precisely why Ingo introduced TASK_NONINTERACTIVE sleeps; so that
>>processes that are not being useful for interactivity could be moved
>>away from TASK_UNINTERRUPTIBLE, with the end result that the X-
>>server could be run at priority 0 without harming interactivity, even
>>during heavy *disk*, *NFS*, and *network* activity. Admittedly, that
>>may not be what some people want, but they're welcome to turn off the
>>interactivity bonuses via some file in /proc (sorry, don't remember
>>which at the moment).
>
>
> Then have io_schedule() automatically set that flag, and convert NFS to
> use io_schedule(), or something along those lines. I don't want a bunch
> of RT-specific flags littering the NFS/RPC code.
This flag isn't RT-specific. It's used in the scheduling of SCHED_NORMAL
tasks and has no other semantic effects.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Thu, 2005-12-22 at 13:26 +1100, Peter Williams wrote:
> > Then have io_schedule() automatically set that flag, and convert NFS to
> > use io_schedule(), or something along those lines. I don't want a bunch
> > of RT-specific flags littering the NFS/RPC code.
>
> This flag isn't RT-specific. It's used in the scheduling of SCHED_NORMAL
> tasks and has no other semantic effects.
It still has sod all business being in the NFS code. We don't touch task
scheduling in the filesystem code.
Trond
Trond Myklebust wrote:
> On Thu, 2005-12-22 at 13:26 +1100, Peter Williams wrote:
>
>
>>>Then have io_schedule() automatically set that flag, and convert NFS to
>>>use io_schedule(), or something along those lines. I don't want a bunch
>>>of RT-specific flags littering the NFS/RPC code.
>>
>>This flag isn't RT-specific. It's used in the scheduling of SCHED_NORMAL
>>tasks and has no other semantic effects.
>
>
> It still has sod all business being in the NFS code. We don't touch task
> scheduling in the filesystem code.
How do you explain the use of the TASK_INTERRUPTIBLE flag then?
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Fri, 2005-12-23 at 09:33 +1100, Peter Williams wrote:
> > It still has sod all business being in the NFS code. We don't touch task
> > scheduling in the filesystem code.
>
> How do you explain the use of the TASK_INTERRUPTIBLE flag then?
Oh, please...
TASK_INTERRUPTIBLE is used to set the task to sleep. It has NOTHING to
do with scheduling.
Trond
On Dec 22, 2005, at 17:59, Trond Myklebust wrote:
> On Fri, 2005-12-23 at 09:33 +1100, Peter Williams wrote:
>>> It still has sod all business being in the NFS code. We don't
>>> touch task scheduling in the filesystem code.
>>
>> How do you explain the use of the TASK_INTERRUPTIBLE flag then?
>
> Oh, please...
>
> TASK_INTERRUPTIBLE is used to set the task to sleep. It has NOTHING
> to do with scheduling.
Putting a task to sleep _is_ rescheduling it. TASK_NONINTERACTIVE
means that you are about to reschedule and are willing to tolerate a
higher wakeup latency. TASK_INTERRUPTIBLE means you are about to
sleep and want to be woken up using the "standard" latency. If you
do any kind of sleep at all, both are valid, independent of what part
of the kernel you are in. There's a reason that both are TASK_* flags.
Cheers,
Kyle Moffett
--
If you don't believe that a case based on [nothing] could potentially
drag on in court for _years_, then you have no business playing with
the legal system at all.
-- Rob Landley
On Thu, 2005-12-22 at 19:02 -0500, Kyle Moffett wrote:
> On Dec 22, 2005, at 17:59, Trond Myklebust wrote:
> > On Fri, 2005-12-23 at 09:33 +1100, Peter Williams wrote:
> >>> It still has sod all business being in the NFS code. We don't
> >>> touch task scheduling in the filesystem code.
> >>
> >> How do you explain the use of the TASK_INTERRUPTIBLE flag then?
> >
> > Oh, please...
> >
> > TASK_INTERRUPTIBLE is used to set the task to sleep. It has NOTHING
> > to do with scheduling.
>
> Putting a task to sleep _is_ rescheduling it. TASK_NONINTERACTIVE
> means that you are about to reschedule and are willing to tolerate a
> higher wakeup latency. TASK_INTERRUPTIBLE means you are about to
> sleep and want to be woken up using the "standard" latency. If you
> do any kind of sleep at all, both are valid, independent of what part
> of the kernel you are in. There's a reason that both are TASK_* flags.
Tolerance for higher wakeup latencies is a scheduling _policy_ decision.
Please explain why the hell we should have to deal with that in
filesystem code?
As far as a filesystem is concerned, there should be 2 scheduling
states: running and sleeping. Any scheduling policy beyond that belongs
in kernel/*.
Trond
Trond Myklebust wrote:
> On Thu, 2005-12-22 at 19:02 -0500, Kyle Moffett wrote:
>
>>On Dec 22, 2005, at 17:59, Trond Myklebust wrote:
>>
>>>On Fri, 2005-12-23 at 09:33 +1100, Peter Williams wrote:
>>>
>>>>>It still has sod all business being in the NFS code. We don't
>>>>>touch task scheduling in the filesystem code.
>>>>
>>>>How do you explain the use of the TASK_INTERRUPTIBLE flag then?
>>>
>>>Oh, please...
>>>
>>>TASK_INTERRUPTIBLE is used to set the task to sleep. It has NOTHING
>>>to do with scheduling.
>>
>>Putting a task to sleep _is_ rescheduling it. TASK_NONINTERACTIVE
>>means that you are about to reschedule and are willing to tolerate a
>>higher wakeup latency. TASK_INTERRUPTIBLE means you are about to
>>sleep and want to be woken up using the "standard" latency. If you
>>do any kind of sleep at all, both are valid, independent of what part
>>of the kernel you are in. There's a reason that both are TASK_* flags.
>
>
> Tolerance for higher wakeup latencies is a scheduling _policy_ decision.
> Please explain why the hell we should have to deal with that in
> filesystem code?
In order to make good decisions it needs good data. I don't think that
it's unreasonable to expect subsystems to help in that regard,
especially when there is no cost involved. The patch just turns another
bit on (at compile time) in some integer constants. No extra space or
computing resources are required.
>
> As far as a filesystem is concerned, there should be 2 scheduling
> states: running and sleeping. Any scheduling policy beyond that belongs
> in kernel/*.
Actually there are currently two kinds of sleep: interruptible and
uninterruptible. This just adds a variation to one of these,
interruptible, that says even though I'm interruptible I'm not
interactive (i.e. I'm not waiting for human intervention via a key
press, mouse action, etc. to initiate the interrupt). This helps the
scheduler to decide whether the task involved is an interactive one or
not which in turn improves users' interactive experiences by ensuring
snappy responses to keyboard and mouse actions even when the system is
heavily loaded.
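Concretely, marking a sleep this way is nothing more than OR-ing the extra
bit into the state passed to the existing wait primitives, e.g. (the wait
queue and condition here are placeholders, not real kernel symbols):

	DEFINE_WAIT(wait);

	prepare_to_wait(&my_waitq, &wait,
			TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
	if (!done)
		schedule();
	finish_wait(&my_waitq, &wait);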
There are probably many interruptible sleeps in the kernel that should
be marked as non-interactive, but for most of them it doesn't matter
because the duration of the sleep is so short that being mislabelled
doesn't materially affect the decision re whether a task is interactive
or not. However, for reasons not related to the quality or efficiency
of the code, NFS interruptible sleeps do not fall into that category, as
they can be quite long due to server load or network congestion. (N.B.
the size of delay that can be significant is quite small, i.e. much less
than the size of a normal time slice.)
An alternative to using TASK_NONINTERACTIVE to mark the non-interactive
interruptible sleeps that are significant (probably a small number)
would be to go in the other direction: treat all interruptible sleeps
as non-interactive and then label all the ones that are
interactive as such. Although this would result in no changes being
made to the NFS code, I'm pretty sure that this option would involve a
great many more code changes elsewhere, as all the places where genuinely
interactive sleeping occurs were identified and labelled.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Fri, 2005-12-23 at 14:06 +1100, Peter Williams wrote:
> >
> > As far as a filesystem is concerned, there should be 2 scheduling
> > states: running and sleeping. Any scheduling policy beyond that belongs
> > in kernel/*.
>
> Actually there are currently two kinds of sleep: interruptible and
> uninterruptible. This just adds a variation to one of these,
> interruptible, that says even though I'm interruptible I'm not
> interactive (i.e. I'm not waiting for human intervention via a key
> press, mouse action, etc. to initiate the interrupt). This helps the
> scheduler to decide whether the task involved is an interactive one or
> not which in turn improves users' interactive experiences by ensuring
> snappy responses to keyboard and mouse actions even when the system is
> heavily loaded.
No! This is not the same thing at all.
You are asking the coder to provide a policy judgement as to whether or
not the users might care.
As far as I'm concerned, other users' MP3 player, X processes, and
keyboard response times can rot in hell whenever I'm busy writing out
data at full blast. I don't give a rats arse about user interactivity,
because my priority is to see the batch jobs complete.
However on another machine, the local administrator may have a different
opinion. That sort of difference in opinion is precisely why we do not
put this sort of policy in the filesystem code but leave it all in the
scheduler code where all the bits and pieces can (hopefully) be treated
consistently as a single policy, and where the user can be given tools
in order to tweak the policy.
TASK_NONINTERACTIVE is basically a piss-poor interface because it moves
the policy into the lower level code where the user has less control.
> There are probably many interruptible sleeps in the kernel that should
> be marked as non interactive but for most of them it doesn't matter
> because the duration of the sleep is so short that being mislabelled
> doesn't materially affect the decision re whether a task is interactive
> or not. However, for reasons not related to the quality or efficiency
> of the code, NFS interruptible sleeps do not fall into that category as
> they can be quite long due to server load or network congestion. (N.B.
> the size of delays that can be significant is quite small i.e. much less
> than the size of a normal time slice.)
>
> An alternative to using TASK_NONINTERACTIVE to mark non interactive
> interruptible sleeps that are significant (probably a small number)
> would be to go in the other direction and treat all interruptible sleeps
> as being non interactive and then labelling all the ones that are
> interactive as such. Although this would result in no changes being
> made to the NFS code, I'm pretty sure that this option would involve a
> great deal more code changes elsewhere as all the places where genuine
> interactive sleeping were identified and labelled.
That is exactly the same rotten idea, just implemented differently. You
are still asking coders to guess as to what the scheduling policy should
be instead of letting the user decide.
Trond
Trond Myklebust wrote:
> On Fri, 2005-12-23 at 14:06 +1100, Peter Williams wrote:
>
>>>As far as a filesystem is concerned, there should be 2 scheduling
>>>states: running and sleeping. Any scheduling policy beyond that belongs
>>>in kernel/*.
>>
>>Actually there are currently two kinds of sleep: interruptible and
>>uninterruptible. This just adds a variation to one of these,
>>interruptible, that says even though I'm interruptible I'm not
>>interactive (i.e. I'm not waiting for human intervention via a key
>>press, mouse action, etc. to initiate the interrupt). This helps the
>>scheduler to decide whether the task involved is an interactive one or
>>not which in turn improves users' interactive experiences by ensuring
>>snappy responses to keyboard and mouse actions even when the system is
>>heavily loaded.
>
>
> No! This is not the same thing at all.
>
> You are asking the coder to provide a policy judgement as to whether or
> not the users might care.
No. It is asking whether the NORMAL interruption of this interruptible
sleep will be caused by a human user action such as a keystroke or mouse
action. For the NFS client the answer to that question is unequivocally
no. It's not a matter of policy; it's a matter of fact.
>
> As far as I'm concerned, other users' MP3 player, X processes, and
> keyboard response times can rot in hell whenever I'm busy writing out
> data at full blast. I don't give a rats arse about user interactivity,
> because my priority is to see the batch jobs complete.
>
> However on another machine, the local administrator may have a different
> opinion. That sort of difference in opinion is precisely why we do not
> put this sort of policy
It's not policy. It's a statement of fact about the nature of the sleep
that is being undertaken.
> in the filesystem code but leave it all in the
> scheduler code where all the bits and pieces can (hopefully) be treated
> consistently as a single policy, and where the user can be given tools
> in order to tweak the policy.
>
> TASK_NONINTERACTIVE is basically a piss-poor interface because it moves
> the policy into the lower level code where the user has less control.
TASK_NONINTERACTIVE is not about policy.
>
>
>>There are probably many interruptible sleeps in the kernel that should
>>be marked as non interactive but for most of them it doesn't matter
>>because the duration of the sleep is so short that being mislabelled
>>doesn't materially affect the decision re whether a task is interactive
>>or not. However, for reasons not related to the quality or efficiency
>>of the code, NFS interruptible sleeps do not fall into that category as
>>they can be quite long due to server load or network congestion. (N.B.
>>the size of delays that can be significant is quite small i.e. much less
>>than the size of a normal time slice.)
>>
>>An alternative to using TASK_NONINTERACTIVE to mark non interactive
>>interruptible sleeps that are significant (probably a small number)
>>would be to go in the other direction and treat all interruptible sleeps
>>as being non interactive and then labelling all the ones that are
>>interactive as such. Although this would result in no changes being
>>made to the NFS code, I'm pretty sure that this option would involve a
>>great deal more code changes elsewhere as all the places where genuine
>>interactive sleeping were identified and labelled.
>
>
> That is exactly the same rotten idea, just implemented differently.
I thought that I said (or at least implied) that. The difference is
that we wouldn't be having this conversation.
> You
> are still asking coders to guess as to what the scheduling policy should
> be instead of letting the user decide.
I wish that I could make you understand that that isn't the case.
You're not being asked to make a policy decision; you're being asked to
make a statement of fact about whether the interruptible sleep is
interactive or not. In the cases involved in this patch the answer
is always "no, it's not an interactive sleep", and it can be given at
compile time with absolutely no run-time overhead incurred.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Fri, 2005-12-23 at 21:49 +1100, Peter Williams wrote:
> No. It is asking whether the NORMAL interruption of this interruptible
> sleep will be caused by a human user action such as a keystroke or mouse
> action. For the NFS client the answer to that question is unequivocally
> no. It's not a matter of policy; it's a matter of fact.
/*
* Tasks that have marked their sleep as noninteractive get
* woken up without updating their sleep average. (i.e. their
* sleep is handled in a priority-neutral manner, no priority
* boost and no penalty.)
*/
This appears to be the only documentation for the TASK_NONINTERACTIVE
flag, and I see no mention of human user actions in that comment. The
comment rather appears to state that this particular flag is designed
to switch between two different scheduling policies.
If the flag really is only about identifying sleeps that will involve
human user actions, then surely it would be easy to set up a short set
of guidelines in Documentation, say, that spell out exactly what the
purpose is, and when it should be used.
That should be done _before_ one starts charging round converting every
instance of TASK_INTERRUPTIBLE.
Trond
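For reference, the comment quoted above sits in the wakeup path,
try_to_wake_up() in kernel/sched.c. A simplified, from-memory excerpt of
the code it documents (a sketch, not a verbatim quote):

	if (old_state == TASK_UNINTERRUPTIBLE) {
		rq->nr_uninterruptible--;
		/* involuntary sleep earns only limited sleep_avg credit */
		p->activated = -1;
	}

	if (old_state & TASK_NONINTERACTIVE)
		__activate_task(p, rq);			/* priority-neutral */
	else
		activate_task(p, rq, cpu == this_cpu);	/* may earn a bonus */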
Trond Myklebust wrote:
> On Fri, 2005-12-23 at 21:49 +1100, Peter Williams wrote:
>
>>No. It is asking whether the NORMAL interruption of this interruptible
>>sleep will be caused by a human user action such as a keystroke or mouse
>>action. For the NFS client the answer to that question is unequivocally
>>no. It's not a matter of policy; it's a matter of fact.
>
>
> /*
> * Tasks that have marked their sleep as noninteractive get
> * woken up without updating their sleep average. (i.e. their
> * sleep is handled in a priority-neutral manner, no priority
> * boost and no penalty.)
> */
>
> This appears to be the only documentation for the TASK_NONINTERACTIVE
> flag,
I guess it makes too many assumptions about the reader's prior knowledge
of the scheduler internals. I'll try to make it clearer.
> and I see no mention of human user actions in that comment. The
> comment rather appears to state that this particular flag is designed
> to switch between two different scheduling policies.
Changes of scheduling policy only occur via calls to sched_setscheduler().
>
> If the flag really is only about identifying sleeps that will involve
> human user actions, then surely it would be easy to set up a short set
> of guidelines in Documentation, say, that spell out exactly what the
> purpose is, and when it should be used.
Sounds reasonable. I'll propose some changes to the scheduler
documentation.
> That should be done _before_ one starts charging round converting every
> instance of TASK_INTERRUPTIBLE.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Fri, 2005-12-23 at 10:39 +0100, Trond Myklebust wrote:
> No! This is not the same thing at all.
>
> You are asking the coder to provide a policy judgement as to whether
> or
> not the users might care.
>
> As far as I'm concerned, other users' MP3 player, X processes, and
> keyboard response times can rot in hell whenever I'm busy writing out
> data at full blast. I don't give a rats arse about user interactivity,
> because my priority is to see the batch jobs complete.
>
By your logic it's also broken to use cond_resched() in filesystem code.
Lee
On Fri, 2005-12-23 at 14:07 -0500, Lee Revell wrote:
> By your logic it's also broken to use cond_resched() in filesystem code.
...and your point is?
Trond
On Fri, 2005-12-23 at 22:08 +0100, Trond Myklebust wrote:
> On Fri, 2005-12-23 at 14:07 -0500, Lee Revell wrote:
>
> > By your logic it's also broken to use cond_resched() in filesystem code.
>
> ...and your point is?
Reductio ad absurdum. Subsystems not using cond_resched would render
Linux unusable for even trivial soft realtime applications like AV
playback and recording.
Lee
On Fri, 2005-12-23 at 16:17 -0500, Lee Revell wrote:
> On Fri, 2005-12-23 at 22:08 +0100, Trond Myklebust wrote:
> > On Fri, 2005-12-23 at 14:07 -0500, Lee Revell wrote:
> >
> > > By your logic it's also broken to use cond_resched() in filesystem code.
> >
> > ...and your point is?
>
> Reductio ad absurdum. Subsystems not using cond_resched would render
> Linux unusable for even trivial soft realtime applications like AV
> playback and recording.
It may surprise you to learn that some people don't use their computers
for AV playback and recording. However absurd it may seem to you, those
people are quite happy to use 2.4.x kernels without a cond_resched
lurking in every nook and cranny.
Trond
On Fri, 2005-12-23 at 22:23 +0100, Trond Myklebust wrote:
> On Fri, 2005-12-23 at 16:17 -0500, Lee Revell wrote:
> > On Fri, 2005-12-23 at 22:08 +0100, Trond Myklebust wrote:
> > > On Fri, 2005-12-23 at 14:07 -0500, Lee Revell wrote:
> > >
> > > > By your logic it's also broken to use cond_resched() in filesystem code.
> > >
> > > ...and your point is?
> >
> > Reductio ad absurdum. Subsystems not using cond_resched would render
> > Linux unusable for even trivial soft realtime applications like AV
> > playback and recording.
>
> It may surprise you to learn that some people don't use their computers
> for AV playback and recording. However absurd it may seem to you, those
> people are quite happy to use 2.4.x kernels without a cond_resched
> lurking in every nook and cranny.
Of course, but I think a reasonable goal for 2.6 is to maintain the
server side performance of 2.4 but also enable desktop type applications
to work well.
cond_resched is really a temporary hack to make the desktop usable until
the kernel becomes fully preemptible.
Lee
On Fri, 2005-12-23 at 17:04 -0500, Lee Revell wrote:
> cond_resched is really a temporary hack to make the desktop usable until
> the kernel becomes fully preemptible.
...and my argument is that we should avoid adding yet another load of
scheduling hacks deep in unrelated code in order to satisfy yet another
minority of users. The Linux way has always been to emphasise
maintainability, and hence clean coding, over functionality.
Cheers,
Trond
On Wed, Dec 21, 2005 at 05:32:52PM +1100, Peter Williams wrote:
> Trond Myklebust wrote:
[...]
> >
> >Sorry. That theory is just plain wrong. ALL of those cases _ARE_
> >interactive sleeps.
>
> It's not a theory. It's a result of observing a -j 16 build with the
> sources on an NFS mounted file system with top with and without the
> patches and comparing that with the same builds with the sources on a
> local file system. Without the patches the tasks in the kernel build
> all get the same dynamic priority as the X server and other interactive
> programs when the sources are on an NFS mounted file system. With the
> patches they generally have dynamic priorities between 6 to 10 higher
> than the X server and other interactive programs.
>
A process waiting for NFS data loses CPU time, which is spent on running
something else. Therefore, it gains some priority so it won't be
forever behind when it wakes up. Same as for any other io waiting.
Perhaps expecting a 16-way parallel make to have "no impact" is
a bit optimistic. How about nicing the make, explicitly telling
linux that it isn't important? Or how about giving important
tasks extra priority?
Helge Hafting
Hi,
Trond Myklebust wrote:
> > /*
> > * Tasks that have marked their sleep as noninteractive get
> > * woken up without updating their sleep average. (i.e. their
> > * sleep is handled in a priority-neutral manner, no priority
> > * boost and no penalty.)
> > */
> >
> > This appears to be the only documentation for the TASK_NONINTERACTIVE
> > flag,
On 12/23/05, Peter Williams <[email protected]> wrote:
> I guess it makes too many assumptions about the reader's prior knowledge
> of the scheduler internals. I'll try to make it clearer.
FWIW, Ingo invented TASK_NONINTERACTIVE to fix a problem I had with
Wine. See the following threads for further discussion:
http://marc.theaimsgroup.com/?t=111729237700002&r=1&w=2
http://marc.theaimsgroup.com/?t=111761183900001&r=1&w=2
Pekka
Helge Hafting wrote:
> On Wed, Dec 21, 2005 at 05:32:52PM +1100, Peter Williams wrote:
>
>>Trond Myklebust wrote:
>
> [...]
>
>>>Sorry. That theory is just plain wrong. ALL of those cases _ARE_
>>>interactive sleeps.
>>
>>It's not a theory. It's a result of observing a -j 16 build with the
>>sources on an NFS mounted file system with top with and without the
>>patches and comparing that with the same builds with the sources on a
>>local file system. Without the patches the tasks in the kernel build
>>all get the same dynamic priority as the X server and other interactive
>>programs when the sources are on an NFS mounted file system. With the
>>patches they generally have dynamic priorities between 6 to 10 higher
>>than the X server and other interactive programs.
>>
>
> A process waiting for NFS data loses CPU time, which is spent on running
> something else. Therefore, it gains some priority so it won't be
> forever behind when it wakes up. Same as for any other io waiting.
That's more or less independent of this issue as the distribution of CPU
to tasks is largely determined by the time slice mechanism and the
dynamic priority is primarily about latency. (This distinction is a
little distorted by the fact that, under some circumstances,
"interactive" tasks don't get moved to the expired list at the end of
their time slice but this usually won't matter as genuine interactive
tasks aren't generally CPU hogs.) In other words, the issue that you
raised is largely solved by the time tasks spend on the active queue
before moving to the expired queue rather than the order in which they
run when on the active queue.
This problem is all about those tasks getting an inappropriate boost to
improve their latency because they are mistakenly believed to be
interactive. Having had a closer think about the way the scheduler
works I'm now of the opinion that completely ignoring sleeps labelled as
TASK_NONINTERACTIVE may be a mistake and that it might be more
appropriate to treat them the same as TASK_UNINTERRUPTIBLE, but I'll bow
to Ingo on this as he would have a better understanding of the issues
involved.
>
> Perhaps expecting a 16-way parallel make to have "no impact" is
> a bit optimistic. How about nicing the make, explicitly telling
> linux that it isn't important?
Yes, but that shouldn't be necessary. If I do the same build on a local
file system everything works OK and the tasks in the build have dynamic
priorities 8 to 10 slots higher than the X server and other interactive
programs.
> Or how about giving important
> tasks extra priority?
Only root can do that. But some operating systems do just that e.g.
Solaris has an IA scheduling class (which all X based programs are run
in) that takes precedence over programs in the TS class (which is the
equivalent of Linux's SCHED_NORMAL). I'm not sure how they handle the
privilege issues related to stopping inappropriate programs misusing
the IA class. IA is really just TS with a boost which is effectively
just the reverse implementation of what the new SCHED_BATCH achieves.
Arguably, SCHED_BATCH is the superior way of doing this as it doesn't
cause any privilege issues as shifting to SCHED_BATCH can be done by the
owner of the task.
The main drawback to the SCHED_BATCH approach is that it (currently)
requires the user to explicitly set it on the relevant tasks. Its
long-term success would be greatly enhanced if programmers could be
convinced to have their programs switch themselves to SCHED_BATCH unless
they are genuinely interactive processes.
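As a rough illustration (assuming the -mm SCHED_BATCH policy value of 3 and
a glibc that doesn't yet define it), the opt-in is only a few lines of
user-space code:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_BATCH
#define SCHED_BATCH 3	/* policy number in the -mm patches; illustrative */
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 0 };

	/* pid 0 means the calling process */
	if (sched_setscheduler(0, SCHED_BATCH, &param) == -1)
		perror("sched_setscheduler(SCHED_BATCH)");

	/* ... the batch work itself goes here ... */
	return 0;
}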
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
Peter Williams wrote:
> Helge Hafting wrote:
>
>> On Wed, Dec 21, 2005 at 05:32:52PM +1100, Peter Williams wrote:
>>
>>> Trond Myklebust wrote:
>>
>>
>> [...]
>>
>>>> Sorry. That theory is just plain wrong. ALL of those cases _ARE_
>>>> interactive sleeps.
>>>
>>>
>>> It's not a theory. It's a result of observing a -j 16 build with the
>>> sources on an NFS mounted file system with top with and without the
>>> patches and comparing that with the same builds with the sources on a
>>> local file system. Without the patches the tasks in the kernel build
>>> all get the same dynamic priority as the X server and other
>>> interactive programs when the sources are on an NFS mounted file
>>> system. With the patches they generally have dynamic priorities
>>> between 6 to 10 higher than the X server and other interactive programs.
>>>
>>
>> A process waiting for NFS data loses CPU time, which is spent on
>> running something else. Therefore, it gains some priority so it won't be
>> forever behind when it wakes up. Same as for any other io waiting.
>
>
> That's more or less independent of this issue as the distribution of CPU
> to tasks is largely determined by the time slice mechanism and the
> dynamic priority is primarily about latency. (This distinction is a
> little distorted by the fact that, under some circumstances,
> "interactive" tasks don't get moved to the expired list at the end of
> their time slice but this usually won't matter as genuine interactive
> tasks aren't generally CPU hogs.) In other words, the issue that you
> raised is largely solved by the time tasks spend on the active queue
> before moving to the expired queue rather than the order in which they
> run when on the active queue.
>
> This problem is all about those tasks getting an inappropriate boost to
> improve their latency because they are mistakenly believed to be
> interactive.
One of the unfortunate side effects of this is that it can affect
scheduler fairness because if these tasks get sufficient bonus points
the TASK_INTERACTIVE() macro will return true for them and they will be
rescheduled on the active queue instead of the expired queue at the end
of the time slice (provided EXPIRED_STARVING() doesn't prevent this).
This will have an adverse effect on scheduling fairness.
The ideal design of the scheduler would be for the fairness mechanism
and the interactive responsiveness mechanism to be independent but this
is not the case due to the fact that requeueing interactive tasks on the
expired array could add unacceptably to their latency. As I said above
this slight divergence from the ideal of perfect independence shouldn't
matter as genuine interactive processes aren't very CPU intensive.
In summary, inappropriate identification of CPU intensive tasks as
interactive has two bad effects: 1) responsiveness problems for genuine
interactive tasks due to the extra competition at their dynamic priority
and 2) a degradation of scheduling fairness; not just one.
For an example of the effect of inappropriate identification of CPU hogs
as interactive tasks see the thread "[SCHED] Totally WRONG priority
calculation with specific test-case (since 2.6.10-bk12)" in this list.
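For reference, the end-of-timeslice decision being described looks roughly
like this in scheduler_tick() (kernel/sched.c), simplified from memory with
the array bookkeeping elided:

	if (!--p->time_slice) {
		dequeue_task(p, rq->active);
		set_tsk_need_resched(p);
		p->prio = effective_prio(p);
		p->time_slice = task_timeslice(p);

		if (!rq->expired_timestamp)
			rq->expired_timestamp = jiffies;
		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
			enqueue_task(p, rq->expired);
			if (p->static_prio < rq->best_expired_prio)
				rq->best_expired_prio = p->static_prio;
		} else
			enqueue_task(p, rq->active);
	}

A task that has accumulated enough bonus to satisfy TASK_INTERACTIVE()
keeps going straight back onto the active array, which is the fairness
interaction described above.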
> Having had a closer think about the way the scheduler
> works I'm now of the opinion that completely ignoring sleeps labelled as
> TASK_NONINTERACTIVE may be a mistake and that it might be more
> appropriate to treat them the same as TASK_UNINTERRUPTIBLE, but I'll bow
> to Ingo on this as he would have a better understanding of the issues
> involved.
>
>>
>> Perhaps expecting a 16-way parallel make to have "no impact" is
>> a bit optimistic. How about nicing the make, explicitly telling
>> linux that it isn't important?
>
>
> Yes, but that shouldn't be necessary. If I do the same build on a local
> file system everything works OK and the tasks in the build have dynamic
> priorities 8 to 10 slots higher than the X server and other interactive
> programs.
Further analysis indicates that this is not a complete solution as the
tasks would still be identified as interactive and given a bonus.
Although the change of nice value would be sufficient to stop these
tasks competing with the genuine interactive tasks, they would probably
still satisfy the TASK_INTERACTIVE() test (as it's effectively based on
the bonus acquired, i.e. the difference between prio and static_prio)
and hence get preferential treatment at the end of their time slice,
with a consequent degradation of scheduling fairness.
>
>> Or how about giving important
>> tasks extra priority?
>
>
> Only root can do that. But some operating systems do just that e.g.
> Solaris has an IA scheduling class (which all X based programs are run
> in) that takes precedence over programs in the TS class (which is the
> equivalent of Linux's SCHED_NORMAL). I'm not sure how they handle the
> privilege issues related to stopping inappropriate programs misusing
> the IA class. IA is really just TS with a boost which is effectively
> just the reverse implementation of what the new SCHED_BATCH achieves.
> Arguably, SCHED_BATCH is the superior way of doing this as it doesn't
> cause any privilege issues as shifting to SCHED_BATCH can be done by the
> owner of the task.
>
> The main drawback to the SCHED_BATCH approach is that it (currently)
> requires the user to explicitly set it on the relevant tasks. Its
> long-term success would be greatly enhanced if programmers could be convinced
> to have their programs switch themselves to SCHED_BATCH unless they are
> genuine interactive processes.
>
> Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
Hi Peter,
On Wed, Jan 04, 2006 at 12:25:40PM +1100, Peter Williams wrote:
> Peter Williams wrote:
> >Helge Hafting wrote:
> >
> >>On Wed, Dec 21, 2005 at 05:32:52PM +1100, Peter Williams wrote:
> >>
> >>>Trond Myklebust wrote:
> >>
> >>
> >>[...]
> >>
> >>>>Sorry. That theory is just plain wrong. ALL of those cases _ARE_
> >>>>interactive sleeps.
> >>>
> >>>
> >>>It's not a theory. It's a result of observing a -j 16 build with the
> >>>sources on an NFS mounted file system with top with and without the
> >>>patches and comparing that with the same builds with the sources on a
> >>>local file system. Without the patches the tasks in the kernel build
> >>>all get the same dynamic priority as the X server and other
> >>>interactive programs when the sources are on an NFS mounted file
> >>>system. With the patches they generally have dynamic priorities
> >>>between 6 to 10 higher than the X server and other interactive programs.
> >>>
> >>
> >>A process waiting for NFS data loses CPU time, which is spent on
> >>running something else. Therefore, it gains some priority so it won't be
> >>forever behind when it wakes up. Same as for any other io waiting.
> >
> >
> >That's more or less independent of this issue as the distribution of CPU
> >to tasks is largely determined by the time slice mechanism and the
> >dynamic priority is primarily about latency. (This distinction is a
> >little distorted by the fact that, under some circumstances,
> >"interactive" tasks don't get moved to the expired list at the end of
> >their time slice but this usually won't matter as genuine interactive
> >tasks aren't generally CPU hogs.) In other words, the issue that you
> >raised is largely solved by the time tasks spend on the active queue
> >before moving to the expired queue rather than the order in which they
> >run when on the active queue.
> >
> >This problem is all about those tasks getting an inappropriate boost to
> >improve their latency because they are mistakenly believed to be
> >interactive.
>
> One of the unfortunate side effects of this is that it can affect
> scheduler fairness because if these tasks get sufficient bonus points
> the TASK_INTERACTIVE() macro will return true for them and they will be
> rescheduled on the active queue instead of the expired queue at the end
> of the time slice (provided EXPIRED_STARVING() doesn't prevent this).
> This will have an adverse effect on scheduling fairness.
>
> The ideal design of the scheduler would be for the fairness mechanism
> and the interactive responsiveness mechanism to be independent but this
> is not the case due to the fact that requeueing interactive tasks on the
> expired array could add unacceptably to their latency. As I said above
> this slight divergence from the ideal of perfect independence shouldn't
> matter as genuine interactive processes aren't very CPU intensive.
>
> In summary, inappropriate identification of CPU intensive tasks as
> interactive has two bad effects: 1) responsiveness problems for genuine
> interactive tasks due to the extra competition at their dynamic priority
> and 2) a degradation of scheduling fairness; not just one.
>
> For an example of the effect of inappropriate identification of CPU hogs
> as interactive tasks see the thread "[SCHED] Totally WRONG priority
> calculation with specific test-case (since 2.6.10-bk12)" in this list.
And another real-life example of the issue you describe above.
From [email protected] Fri Dec 2 18:51:59 2005
Date: Fri, 2 Dec 2005 18:51:59 -0200
From: Marcelo Tosatti <[email protected]>
To: Ingo Molnar <[email protected]>, Nick Piggin <[email protected]>
Cc: Regina Kodato <[email protected]>,
Wanda Rosalino <[email protected]>,
Edson Seabra <[email protected]>
Subject: scheduler starvation with v2.6.11 on embedded PPC appliance
We are experiencing what seems to be a scheduler starvation issue on our
application, running v2.6.11. The same load works as expected on v2.4.
We would like to know if v2.6.14 could possibly fix this problem.
Hardware is a PowerPC 8xx at 48MHz (embedded SoC) with 128MB RAM,
handling remote access to its own 48 serial ports running at 9600bps
each (8N1, HW flow control).
Access to the ports is performed via SSH (one sshd instance for each
port), and there are two different configurations:
1) slim socket mode: Each SSH process is responsible for handling IO to
its own serial port.
2) buffering mode: Where a single process handles IO on the 48 tty's,
copying data to a shared memory region and signalling the respective ssh
daemon with SIGIO once a certain amount of data is ready.
The test transfers a 78k file via each serial port (total = 48*78k =
3.7MB) from an x86 Linux box, usually taking:
78110 bytes after 81 seconds, 964 cps (+-9640 bps).
Time varies from 77 sec up to 85 sec.
Problem description:
Using slim socket mode, where each SSH process handles IO to its own
port, the scheduler starves a certain number of processes, causing their
connections to timeout.
Further investigation with schedstats allowed us to notice that
"wait_ticks" is much higher using this mode.
Below is the output of "latency" and "vmstat 2" with buffering mode (low
wait_ticks, high number of context switches):
913 (cy_buffering) 25(25) 1077(1077) 843(843) 0.03 1.28
1166 (sshd) 220(220) 143(143) 1276(1276) 0.17 0.11
913 (cy_buffering) 36(11) 1078(1) 952(109) 0.10 0.01
1166 (sshd) 231(11) 191(48) 1883(607) 0.02 0.08
913 (cy_buffering) 242(206) 1131(53) 3200(2248) 0.09 0.02
1166 (sshd) 294(63) 383(192) 2523(640) 0.10 0.30
913 (cy_buffering) 440(198) 1172(41) 5637(2437) 0.08 0.02
1166 (sshd) 353(59) 574(191) 3160(637) 0.09 0.30
913 (cy_buffering) 644(204) 1199(27) 7918(2281) 0.09 0.01
1166 (sshd) 372(19) 678(104) 3771(611) 0.03 0.17
913 (cy_buffering) 644(0) 1201(2) 7978(60) 0.00 0.03
1166 (sshd) 372(0) 681(3) 4372(601) 0.00 0.00
procs memory swap io system cpu
r b swpd free buff cache si so bi bo in cs us sy wa id
0 0 0 159752 51200 9960 0 0 0 0 23 1171 1 11 0 88
0 0 0 159752 51200 9960 0 0 0 0 10 1111 0 5 0 94
1 0 0 159752 51200 9964 0 0 2 0 311 1226 35 55 0 10
1 0 0 159752 51200 9964 0 0 0 0 934 1718 50 50 0 0
1 0 0 159752 51200 9964 0 0 0 0 874 1519 52 48 0 0
11 0 0 159752 51200 9964 0 0 0 0 800 1358 47 53 0 0
7 0 0 159752 51200 9964 0 0 0 0 527 1235 44 56 0 0
1 0 0 159752 51200 9964 0 0 0 0 301 1144 47 53 0 0
1 0 0 159752 51200 9964 0 0 0 0 363 1241 43 57 0 0
2 0 0 159752 51200 9964 0 0 0 1 428 1194 45 55 0 0
1 0 0 159752 51200 9964 0 0 0 0 428 1141 42 58 0 0
1 0 0 159752 51200 9964 0 0 0 0 433 1255 44 56 0 0
2 0 0 159752 51200 9964 0 0 0 0 444 1067 46 54 0 0
1 0 0 159752 51200 9964 0 0 0 0 465 1071 55 45 0 0
1 0 0 159752 51200 9964 0 0 0 0 510 1101 42 58 0 0
1 0 0 159752 51200 9964 0 0 0 0 409 1082 47 53 0 0
1 0 0 159752 51200 9964 0 0 0 0 401 1075 40 60 0 0
1 0 0 159752 51200 9964 0 0 0 0 409 1081 44 56 0 0
And with slim socket mode (very high wait_ticks, low number of context
switches):
1200 (sshd) 382(0) 3891(0) 1879(30) 0.00 0.00
1216 (sshd) 479(0) 7216(0) 2387(30) 0.00 0.00
1241 (sshd) 802(0) 6869(2) 4069(31) 0.00 0.06
1276 (sshd) 499(2) 8807(42) 3204(34) 0.06 1.24
1301 (sshd) 601(2) 8319(38) 2752(32) 0.06 1.19
1200 (sshd) 388(6) 4184(293) 1909(30) 0.20 9.77
1216 (sshd) 487(8) 7516(300) 2413(26) 0.31 11.54
1241 (sshd) 866(64) 7575(706) 4427(358) 0.18 1.97
1276 (sshd) 656(157) 9824(1017) 3756(552) 0.28 1.84
1301 (sshd) 610(9) 8422(103) 2761(9) 1.00 11.44
1200 (sshd) 415(27) 7132(2948) 1982(73) 0.37 40.38
1216 (sshd) 511(24) 10537(3021) 2496(83) 0.29 36.40
1241 (sshd) 943(77) 8537(962) 4875(448) 0.17 2.15
1276 (sshd) 776(120) 10892(1068) 4336(580) 0.21 1.84
1301 (sshd) 620(10) 11034(2612) 2771(10) 1.00 261.20
procs memory swap io system cpu
r b swpd free buff cache si so bi bo in cs us sy wa id
5 0 0 159816 51200 9916 0 0 0 0 18 113 0 1 0 99
0 0 0 159816 51200 9916 0 0 0 0 19 112 0 2 0 98
0 0 0 159816 51200 9916 0 0 0 0 166 176 1 6 0 93
37 0 0 159880 51200 9916 0 0 0 0 2857 1219 46 50 0 4
38 0 0 159880 51200 9916 0 0 0 0 2662 1059 58 42 0 0
33 0 0 159880 51200 9916 0 0 0 0 1058 496 72 28 0 0
33 0 0 159880 51200 9916 0 0 0 0 1593 743 70 30 0 0
33 0 0 159880 51200 9916 0 0 0 0 1519 706 71 29 0 0
34 0 0 159880 51200 9916 0 0 0 0 1073 520 74 26 0 0
35 0 0 159880 51200 9916 0 0 0 0 1047 493 67 33 0 0
49 0 0 159880 51200 9916 0 0 0 0 1130 543 70 30 0 0
34 0 0 159880 51200 9916 0 0 0 0 1239 612 70 30 0 0
46 0 0 159880 51200 9916 0 0 0 0 1427 737 69 31 0 0
34 0 0 159880 51200 9916 0 0 0 0 835 423 73 27 0 0
36 0 0 159880 51200 9916 0 0 0 1 1036 414 69 31 0 0
37 0 0 159880 51200 9916 0 0 0 0 917 379 73 27 0 0
44 0 0 159880 51200 9916 0 0 0 0 3401 1311 65 35 0 0
Another noticeable difference in the schedstat output is that slim mode
causes the scheduler to switch the active/expired queues 4 times during
the total run, while buffering mode switches the queues 38 times.
Attached you can find schedstats-buffering.txt and schedstats-slim.txt.
On v2.4.17 both modes work fine, with a high context-switch number.
We suspected that the TASK_INTERACTIVE() logic in kernel/sched.c would
be moving some processes directly to the active list, thus starving some
others. So we set the nice value of all 48 processes to "nice +19" to
disable TASK_INTERACTIVE() and the starvation is gone. However with +19
it becomes impossible to use the box interactively while the test runs,
which is the case with the default "0" nice value.
Are there significant changes between v2.6.11 -> v2.6.14 aimed at fixing
this problem?
On Wednesday 04 January 2006 20:40, Marcelo Tosatti wrote:
> We suspected that the TASK_INTERACTIVE() logic in kernel/sched.c would
> be moving some processes directly to the active list, thus starving some
> others. So we set the nice value of all 48 processes to "nice +19" to
> disable TASK_INTERACTIVE() and the starvation is gone. However with +19
> it becomes impossible to use the box interactively while the test runs,
> which is the case with the default "0" nice value.
>
> Are there significant changes between v2.6.11 -> v2.6.14 aimed at fixing
> this problem?
The SCHED_BATCH policy Ingo has implemented should help just such a problem.
Con
On Wed, Jan 04, 2006 at 11:18:01PM +1100, Con Kolivas wrote:
> On Wednesday 04 January 2006 20:40, Marcelo Tosatti wrote:
> > We suspected that the TASK_INTERACTIVE() logic in kernel/sched.c would
> > be moving some processes directly to the active list, thus starving some
> > others. So we set the nice value of all 48 processes to "nice +19" to
> > disable TASK_INTERACTIVE() and the starvation is gone. However with +19
> > it becomes impossible to use the box interactively while the test runs,
> > which is the case with the default "0" nice value.
> >
> > Are there significant changes between v2.6.11 -> v2.6.14 aimed at fixing
> > this problem?
>
> The SCHED_BATCH policy Ingo has implemented should help just such a problem.
Yeap, he sent me the patch (which I promised to test), but I still haven't.
Will do ASAP.
Peter Williams wrote:
> Peter Williams wrote:
>
>> Helge Hafting wrote:
>>
>>> On Wed, Dec 21, 2005 at 05:32:52PM +1100, Peter Williams wrote:
>>>
>>>> Trond Myklebust wrote:
>>>
>>>
>>>
>>> [...]
>>>
>>>>> Sorry. That theory is just plain wrong. ALL of those case _ARE_
>>>>> interactive sleeps.
>>>>
>>>>
>>>>
>>>> It's not a theory. It's a result of observing a -j 16 build with
>>>> the sources on an NFS mounted file system with top with and without
>>>> the patches and comparing that with the same builds with the sources
>>>> on a local file system. Without the patches the tasks in the kernel
>>>> build all get the same dynamic priority as the X server and other
>>>> interactive programs when the sources are on an NFS mounted file
>>>> system. With the patches they generally have dynamic priorities
>>>> between 6 to 10 higher than the X server and other interactive
>>>> programs.
>>>>
>>>
>>> A process waiting for NFS data looses cpu time, which is spent on
>>> running something else. Therefore, it gains some priority so it
>>> won't be
>>> forever behind when it wakes up. Same as for any other io waiting.
>>
>>
>>
>> That's more or less independent of this issue as the distribution of
>> CPU to tasks is largely determined by the time slice mechanism and the
>> dynamic priority is primarily about latency. (This distinction is a
>> little distorted by the fact that, under some circumstances,
>> "interactive" tasks don't get moved to the expired list at the end of
>> their time slice but this usually won't matter as genuine interactive
>> tasks aren't generally CPU hogs.) In other words, the issue that you
>> raised is largely solved by the time tasks spend on the active queue
>> before moving to the expired queue rather than the order in which they
>> run when on the active queue.
>>
>> This problem is all about those tasks getting an inappropriate boost
>> to improve their latency because they are mistakenly believed to be
>> interactive.
>
>
> One of the unfortunate side effects of this is that it can effect
> scheduler fairness because if these tasks get sufficient bonus points
> the TASK_INTERACTIVE() macro will return true for them and they will be
> rescheduled on the active queue instead of the expired queue at the end
> of the time slice (provided EXPIRED_STARVING()) doesn't prevent this).
> This will have an adverse effect on scheduling fairness.
I should have added here that if EXPIRED_STARVING() stops these tasks
from being requeued on the active queue at the end of their time slice
then it will also stop genuine interactive tasks from being requeued on
the active queue with bad effects for interactive responsiveness.
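For readers without the source to hand: the requeue decision being discussed
lives in the end-of-timeslice path of scheduler_tick() in kernel/sched.c.
The fragment below is a simplified paraphrase of the 2.6.1x logic from
memory, not the verbatim source; details such as the best_expired_prio
bookkeeping are omitted and names may not be exact.

/* TASK_INTERACTIVE() is roughly "has this task earned enough bonus that
 * its dynamic priority sits sufficiently above its static priority": */
#define TASK_INTERACTIVE(p)	((p)->prio <= (p)->static_prio - DELTA(p))

	/* in scheduler_tick(), when the running task's timeslice runs out: */
	if (!--p->time_slice) {
		dequeue_task(p, rq->active);
		set_tsk_need_resched(p);
		p->prio = effective_prio(p);
		p->time_slice = task_timeslice(p);

		if (!rq->expired_timestamp)
			rq->expired_timestamp = jiffies;
		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq))
			/* the normal, fair case: go to the expired array */
			enqueue_task(p, rq->expired);
		else
			/* "interactive" tasks stay on the active array */
			enqueue_task(p, rq->active);
	}

The fairness leak described above is the final else branch: once a cpu hog
has acquired enough bonus it keeps taking that branch at the end of every
slice, unless EXPIRED_STARVING() intervenes.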
>
> The ideal design of the scheduler would be for the fairness mechanism
> and the interactive responsiveness mechanism to be independent but this
> is not the case due to the fact that requeueing interactive tasks on the
> expired array could add unacceptably to their latency. As I said above
> this slight divergence from the ideal of perfect independence shouldn't
> matter as genuine interactive processes aren't very CPU intensive.
>
> In summary, inappropriate identification of CPU intensive tasks as
> interactive has two bad effects: 1) responsiveness problems for genuine
> interactive tasks due to the extra competition at their dynamic priority
> and 2) a degradation of scheduling fairness; not just one.
>
> For an example of the effect of inappropriate identification of CPU hogs
> as interactive tasks see the thread "[SCHED] Totally WRONG priority
> calculation with specific test-case (since 2.6.10-bk12)" in this list.
>
>> Having had a closer think about the way the scheduler works I'm now
>> of the opinion that completely ignoring sleeps labelled as
>> TASK_NONINTERACTIVE may be a mistake and that it might be more
>> appropriate to treat them the same as TASK_UNINTERRUPTIBLE but I'll bow
>> to Ingo on this as he would have a better understanding of the issues
>> involved.
I've changed my mind again on this and now think that, rather than
treating TASK_NONINTERACTIVE sleeps the way TASK_UNINTERRUPTIBLE sleeps
are currently treated, TASK_UNINTERRUPTIBLE sleeps should be ignored
just like TASK_NONINTERACTIVE sleeps currently are.
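For reference, the distinction being drawn here is visible in
try_to_wake_up() (2.6.14/15 era, paraphrased from memory rather than quoted
verbatim): an uninterruptible sleeper is tagged with activated = -1 so that
recalc_task_prio() only grants it a capped, heavily discounted sleep credit,
whereas a TASK_NONINTERACTIVE sleeper bypasses the sleep_avg bookkeeping
entirely.

out_activate:
	if (old_state == TASK_UNINTERRUPTIBLE) {
		rq->nr_uninterruptible--;
		/*
		 * Tasks on involuntary sleep don't earn
		 * sleep_avg beyond just interactive state.
		 */
		p->activated = -1;
	}

	/*
	 * Tasks that have marked their sleep as noninteractive get
	 * woken up without updating their sleep average, i.e. the
	 * sleep is priority-neutral: no boost and no penalty.
	 */
	if (old_state & TASK_NONINTERACTIVE)
		__activate_task(p, rq);
	else
		activate_task(p, rq, cpu == this_cpu);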
>>
>>>
>>> Perhaps expecting a 16-way parallel make to have "no impact" is
>>> a bit optimistic. How about nicing the make, explicitly telling
>>> linux that it isn't important?
>>
>>
>>
>> Yes, but that shouldn't be necessary. If I do the same build on a
>> local file system everything works OK and the tasks in the build have
>> dynamic priorities 8 to 10 slots higher than the X server and other
>> interactive programs.
>
>
> Further analysis indicates that this is not a complete solution as the
> tasks would still be identified as interactive and given a bonus.
> Although the change of nice value would be sufficient to stop these
> tasks competing with the genuine interactive tasks, they would probably
> still get a positive return value from TASK_INTERACTIVE() (as it's
> effectively based on the bonus acquired i.e. difference between prio and
> static_prio) and hence preferential treatment at the end of their time
> slice with a consequent degradation of scheduling fairness.
>
>>
>>> Or how about giving important
>>> tasks extra priority?
>>
>>
>>
>> Only root can do that. But some operating systems do just that e.g.
>> Solaris has an IA scheduling class (which all X based programs are run
>> in) that takes precedence over programs in the TS class (which is the
>> equivalent of Linux's SCHED_NORMAL). I'm not sure how they handle the
>> privileges issues related to stopping inappropriate programs misusing
>> the IA class. IA is really just TS with a boost which is effectively
>> just the reverse implementation of what the new SCHED_BATCH achieves.
>> Arguably, SCHED_BATCH is the superior way of doing this as it doesn't
>> cause any privilege issues as shifting to SCHED_BATCH can be done by
>> the owner of the task.
>>
>> The main drawback to the SCHED_BATCH approach is that it (currently)
>> requires the user to explicitly set it on the relevant tasks. Its
>> long-term success would be greatly enhanced if programmers could be
>> convinced to have their programs switch themselves to SCHED_BATCH
>> unless they are genuine interactive processes.
I think that some of the harder to understand parts of the scheduler
code are actually attempts to overcome the undesirable effects (such as
those I've described) of inappropriately identifying tasks as
interactive. I think that it would have been better to attempt to fix
the inappropriate identifications rather than their effects and I think
the prudent use of TASK_NONINTERACTIVE is an important tool for
achieving this.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
At 08:51 AM 1/5/2006 +1100, Peter Williams wrote:
>I think that some of the harder to understand parts of the scheduler code
>are actually attempts to overcome the undesirable effects (such as those
>I've described) of inappropriately identifying tasks as interactive. I
>think that it would have been better to attempt to fix the inappropriate
>identifications rather than their effects and I think the prudent use of
>TASK_NONINTERACTIVE is an important tool for achieving this.
IMHO, that's nothing but a cover for the weaknesses induced by using
exclusively sleep time as an information source for the priority
calculation. While this heuristic does work pretty darn well, it's easily
fooled (intentionally or otherwise). The challenge is to find the right
low cost informational component, and to stir it in at O(1).
The fundamental problem with the whole interactivity issue is that the
kernel has no way to know if there's a human involved or not. My 100%cpu
GL screensaver is interactive while I'm mindlessly staring at it.
-Mike
Mike Galbraith wrote:
> At 08:51 AM 1/5/2006 +1100, Peter Williams wrote:
>
>> I think that some of the harder to understand parts of the scheduler
>> code are actually attempts to overcome the undesirable effects (such
>> as those I've described) of inappropriately identifying tasks as
>> interactive. I think that it would have been better to attempt to fix
>> the inappropriate identifications rather than their effects and I
>> think the prudent use of TASK_NONINTERACTIVE is an important tool for
>> achieving this.
>
>
> IMHO, that's nothing but a cover for the weaknesses induced by using
> exclusively sleep time as an information source for the priority
> calculation. While this heuristic does work pretty darn well, it's
> easily fooled (intentionally or otherwise). The challenge is to find
> the right low cost informational component, and to stir it in at O(1).
TASK_NONINTERACTIVE helps in this regard: it costs nothing in the code
where it's used and probably decreases the cost of the scheduler code by
enabling some processing to be skipped. If, by its judicious use, the
heuristic is only fed interactive sleep data, the heuristic's accuracy in
identifying interactive tasks should be improved. It may also allow the
heuristic to be simplified.
Other potential information sources for the priority calculation may also
benefit from TASK_NONINTERACTIVE. E.g. measuring interactive latency
requires knowing that the task is waking from an interactive sleep.
>
> The fundamental problem with the whole interactivity issue is that the
> kernel has no way to know if there's a human involved or not.
Which is why SCHED_BATCH has promise. The key to it becoming really
useful will be getting authors of non-interactive programs to use it.
The hard part will be getting them to admit that their programs are
non-interactive and undeserving of a boost.
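For what it's worth, opting in is a one-liner at program start. A minimal
user-space sketch (assuming a kernel that provides SCHED_BATCH; on headers
that predate it the constant has to be defined by hand, and 3 is the value
the Linux kernel uses):

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_BATCH
#define SCHED_BATCH 3	/* value used by the Linux kernel */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	/* Tell the scheduler this is a batch job, not an interactive one. */
	if (sched_setscheduler(0, SCHED_BATCH, &sp) == -1)
		perror("sched_setscheduler(SCHED_BATCH)");

	/* ... run the non-interactive workload ... */
	return 0;
}

Since dropping to SCHED_BATCH needs no privilege, a small wrapper could make
the same call on behalf of an existing binary before exec'ing it.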
> My
> 100%cpu GL screensaver is interactive while I'm mindlessly staring at it.
I've never actually seen what bonuses the screensaver gets :-) but I
imagine any sleeping it does follows a very regular sleep/run pattern and
this regularity can be measured and used to exclude it from bonuses.
However, it would need some extra parameters to avoid depriving audio
and video programs of bonuses as they too have very regular sleep/run
patterns. The average sleep/run interval is one possibility as
audio/video programs tend to use small intervals.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
At 10:31 PM 1/5/2006 +1100, Peter Williams wrote:
>Mike Galbraith wrote:
>>At 08:51 AM 1/5/2006 +1100, Peter Williams wrote:
>>
>>>I think that some of the harder to understand parts of the scheduler
>>>code are actually attempts to overcome the undesirable effects (such as
>>>those I've described) of inappropriately identifying tasks as
>>>interactive. I think that it would have been better to attempt to fix
>>>the inappropriate identifications rather than their effects and I think
>>>the prudent use of TASK_NONINTERACTIVE is an important tool for achieving this.
>>
>>IMHO, that's nothing but a cover for the weaknesses induced by using
>>exclusively sleep time as an information source for the priority
>>calculation. While this heuristic does work pretty darn well, it's
>>easily fooled (intentionally or otherwise). The challenge is to find the
>>right low cost informational component, and to stir it in at O(1).
>
>TASK_NONINTERACTIVE helps in this regard, is no cost in the code where
>it's used and probably decreases the costs in the scheduler code by
>enabling some processing to be skipped. If by its judicious use the
>heuristic is only fed interactive sleep data the heuristics accuracy in
>identifying interactive tasks should be improved. It may also allow the
>heuristic to be simplified.
I disagree. You can nip and tuck all the bits of sleep time you want, and
it'll just shift the lumpy spots around (btdt).
-Mike
Mike Galbraith wrote:
> At 10:31 PM 1/5/2006 +1100, Peter Williams wrote:
>
>> Mike Galbraith wrote:
>>
>>> At 08:51 AM 1/5/2006 +1100, Peter Williams wrote:
>>>
>>>> I think that some of the harder to understand parts of the scheduler
>>>> code are actually attempts to overcome the undesirable effects (such
>>>> as those I've described) of inappropriately identifying tasks as
>>>> interactive. I think that it would have been better to attempt to
>>>> fix the inappropriate identifications rather than their effects and
>>>> I think the prudent use of TASK_NONINTERACTIVE is an important tool
>>>> for achieving this.
>>>
>>>
>>> IMHO, that's nothing but a cover for the weaknesses induced by using
>>> exclusively sleep time as an information source for the priority
>>> calculation. While this heuristic does work pretty darn well, it's
>>> easily fooled (intentionally or otherwise). The challenge is to find
>>> the right low cost informational component, and to stir it in at O(1).
>>
>>
>> TASK_NONINTERACTIVE helps in this regard, is no cost in the code where
>> it's used and probably decreases the costs in the scheduler code by
>> enabling some processing to be skipped. If by its judicious use the
>> heuristic is only fed interactive sleep data the heuristics accuracy
>> in identifying interactive tasks should be improved. It may also
>> allow the heuristic to be simplified.
>
>
> I disagree. You can nip and tuck all the bits of sleep time you want,
> and it'll just shift the lumpy spots around (btdt).
Yes, but there's a lot of (understandable) reluctance to do any major
rework of this part of the scheduler so we're stuck with nips and tucks
for the time being. This patch is a zero cost nip and tuck.
If the plugsched patches were included in -mm we could get wider testing
of alternative scheduling mechanisms. But I think it will take a lot of
testing of the new schedulers to allay fears that they may introduce new
problems of their own.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Fri, 6 Jan 2006 10:13 am, Peter Williams wrote:
> If the plugsched patches were included in -mm we could get wider testing
> of alternative scheduling mechanisms. But I think it will take a lot of
> testing of the new schedulers to allay fears that they may introduce new
> problems of their own.
When I first generated plugsched and posted it to lkml for inclusion in -mm,
it was blocked by both Ingo and Linus as having no chance of being included,
and I doubt they've changed their position since then. As you're well aware this
is why I gave up working on it and let you maintain it since then. Obviously
I thought it was a useful feature or I wouldn't have worked on it.
Con
Con Kolivas wrote:
> On Fri, 6 Jan 2006 10:13 am, Peter Williams wrote:
>
>>If the plugsched patches were included in -mm we could get wider testing
>>of alternative scheduling mechanisms. But I think it will take a lot of
>>testing of the new schedulers to allay fears that they may introduce new
>>problems of their own.
>
>
> When I first generated plugsched and posted it to lkml for inclusion in -mm it
> was blocked as having no chance of being included by both Ingo and Linus and
> I doubt they've changed their position since then. As you're well aware this
> is why I gave up working on it and let you maintain it since then. Obviously
> I thought it was a useful feature or I wouldn't have worked on it.
I've put a lot of effort into reducing code duplication and reducing the
size of the interface and making it completely orthogonal to load
balancing so I'm hopeful (perhaps mistakenly) that this makes it more
acceptable (at least in -mm).
My testing shows that there's no observable difference in performance
between a stock kernel and plugsched with ingosched selected at the
total system level (although micro benchmarking may show slight
increases in the cost of individual operations).
Anyway, I'll just keep plugging away,
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Fri, 6 Jan 2006 11:02 am, Peter Williams wrote:
> Con Kolivas wrote:
> > On Fri, 6 Jan 2006 10:13 am, Peter Williams wrote:
> >>If the plugsched patches were included in -mm we could get wider testing
> >>of alternative scheduling mechanisms. But I think it will take a lot of
> >>testing of the new schedulers to allay fears that they may introduce new
> >>problems of their own.
> >
> > When I first generated plugsched and posted it to lkml for inclusion in
> > -mm it was blocked as having no chance of being included by both Ingo and
> > Linus and I doubt they've changed their position since then. As you're
> > well aware this is why I gave up working on it and let you maintain it
> > since then. Obviously I thought it was a useful feature or I wouldn't
> > have worked on it.
>
> I've put a lot of effort into reducing code duplication and reducing the
> size of the interface and making it completely orthogonal to load
> balancing so I'm hopeful (perhaps mistakenly) that this makes it more
> acceptable (at least in -mm).
The objection was that it would dilute developer effort away from having one
cpu scheduler to rule them all. Linus' objection was against specialisation -
he preferred one
cpu scheduler that could do everything rather than unique cpu schedulers for
NUMA, SMP, UP, embedded... Each approach has its own arguments and there
isn't much point bringing them up again. We shall use Linux as the
"steamroller to crack a nut" no matter what that nut is.
> My testing shows that there's no observable difference in performance
> between a stock kernel and plugsched with ingosched selected at the
> total system level (although micro benchmarking may show slight
> increases in individual operations).
I could find no difference either, but IA64 which does not cope with
indirection well would probably suffer a demonstrable performance hit I have
been told. I do not have access to such hardware.
> Anyway, I'll just keep plugging away,
Nice pun.
Cheers,
Con
Con Kolivas wrote:
> On Fri, 6 Jan 2006 11:02 am, Peter Williams wrote:
>
>>Con Kolivas wrote:
>>
>>>On Fri, 6 Jan 2006 10:13 am, Peter Williams wrote:
>>>
>>>>If the plugsched patches were included in -mm we could get wider testing
>>>>of alternative scheduling mechanisms. But I think it will take a lot of
>>>>testing of the new schedulers to allay fears that they may introduce new
>>>>problems of their own.
>>>
>>>When I first generated plugsched and posted it to lkml for inclusion in
>>>-mm it was blocked as having no chance of being included by both Ingo and
>>>Linus and I doubt they've changed their position since then. As you're
>>>well aware this is why I gave up working on it and let you maintain it
>>>since then. Obviously I thought it was a useful feature or I wouldn't
>>>have worked on it.
>>
>>I've put a lot of effort into reducing code duplication and reducing the
>>size of the interface and making it completely orthogonal to load
>>balancing so I'm hopeful (perhaps mistakenly) that this makes it more
>>acceptable (at least in -mm).
>
>
> The objection was to dilution of developer effort towards one cpu scheduler to
> rule them all.
I think that I've partially addressed that objection by narrowing the
focus of the alternative schedulers so that the dilution of effort is
reduced. The dichotomy between the dual array schedulers (ingosched and
nicksched) and the single array schedulers (staircase and the SPA
schedulers) is the main stumbling block to narrowing the focus further.
> Linus' objection was against specialisation - he preferred one
> cpu scheduler that could do everything rather than unique cpu schedulers for
> NUMA, SMP, UP, embedded...
kernbench results show that the penalties for an all purpose scheduler
aren't very big so it's probably not a bad philosophy. In spite of this
I think specialization is worth pursuing if it can be achieved with very
small configurable differences to the mechanism. If the configuration
change can be done at boot time or on a running system then it's even
better e.g. your "compute" switch in staircase.
> Each approach has its own arguments and there
> isn't much point bringing them up again. We shall use Linux as the
> "steamroller to crack a nut" no matter what that nut is.
>
Even if plugsched has no hope of getting into the mainline kernel, I see
it as a useful tool for the practical evaluation of the various
approaches. If it could go into -mm for a while this evaluation could
be more widespread.
In its current state it should not interfere with other scheduling
related development such as the load balancing changes, cpusets etc.
>
>>My testing shows that there's no observable difference in performance
>>between a stock kernel and plugsched with ingosched selected at the
>>total system level (although micro benchmarking may show slight
>>increases in individual operations).
>
>
> I could find no difference either, but IA64 which does not cope with
> indirection well would probably suffer a demonstrable performance hit I have
> been told.
I wasn't aware of that.
> I do not have access to such hardware.
Nor do I.
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
At 10:13 AM 1/6/2006 +1100, Peter Williams wrote:
>Mike Galbraith wrote:
>>At 10:31 PM 1/5/2006 +1100, Peter Williams wrote:
>>
>>>Mike Galbraith wrote:
>>>
>>>>At 08:51 AM 1/5/2006 +1100, Peter Williams wrote:
>>>>
>>>>>I think that some of the harder to understand parts of the scheduler
>>>>>code are actually attempts to overcome the undesirable effects (such
>>>>>as those I've described) of inappropriately identifying tasks as
>>>>>interactive. I think that it would have been better to attempt to fix
>>>>>the inappropriate identifications rather than their effects and I
>>>>>think the prudent use of TASK_NONINTERACTIVE is an important tool for
>>>>>achieving this.
>>>>
>>>>
>>>>IMHO, that's nothing but a cover for the weaknesses induced by using
>>>>exclusively sleep time as an information source for the priority
>>>>calculation. While this heuristic does work pretty darn well, it's
>>>>easily fooled (intentionally or otherwise). The challenge is to find
>>>>the right low cost informational component, and to stir it in at O(1).
>>>
>>>
>>>TASK_NONINTERACTIVE helps in this regard, is no cost in the code where
>>>it's used and probably decreases the costs in the scheduler code by
>>>enabling some processing to be skipped. If by its judicious use the
>>>heuristic is only fed interactive sleep data the heuristics accuracy in
>>>identifying interactive tasks should be improved. It may also allow the
>>>heuristic to be simplified.
>>
>>I disagree. You can nip and tuck all the bits of sleep time you want,
>>and it'll just shift the lumpy spots around (btdt).
>
>Yes, but there's a lot of (understandable) reluctance to do any major
>rework of this part of the scheduler so we're stuck with nips and tucks
>for the time being. This patch is a zero cost nip and tuck.
Color me skeptical, but nonetheless, it looks to me like the mechanism
might need the attached.
On the subject of nip and tuck, take a look at the little proggy posted in
thread [SCHED] wrong priority calc - SIMPLE test case. That testcase was
the result of Paolo Ornati looking into a real problem on his system. I
just 'fixed' that nanosleep() problem by judicious application of
TASK_NONINTERACTIVE to the schedule_timeout(). Sure, it works, but it
doesn't look like anything but a bandaid (tourniquet in this case:) to me.
-Mike
Mike Galbraith wrote:
> At 10:13 AM 1/6/2006 +1100, Peter Williams wrote:
>
>> Mike Galbraith wrote:
>>
>>> At 10:31 PM 1/5/2006 +1100, Peter Williams wrote:
>>>
>>>> Mike Galbraith wrote:
>>>>
>>>>> At 08:51 AM 1/5/2006 +1100, Peter Williams wrote:
>>>>>
>>>>>> I think that some of the harder to understand parts of the
>>>>>> scheduler code are actually attempts to overcome the undesirable
>>>>>> effects (such as those I've described) of inappropriately
>>>>>> identifying tasks as interactive. I think that it would have been
>>>>>> better to attempt to fix the inappropriate identifications rather
>>>>>> than their effects and I think the prudent use of
>>>>>> TASK_NONINTERACTIVE is an important tool for achieving this.
>>>>>
>>>>>
>>>>>
>>>>> IMHO, that's nothing but a cover for the weaknesses induced by
>>>>> using exclusively sleep time as an information source for the
>>>>> priority calculation. While this heuristic does work pretty darn
>>>>> well, it's easily fooled (intentionally or otherwise). The
>>>>> challenge is to find the right low cost informational component,
>>>>> and to stir it in at O(1).
>>>>
>>>>
>>>>
>>>> TASK_NONINTERACTIVE helps in this regard, is no cost in the code
>>>> where it's used and probably decreases the costs in the scheduler
>>>> code by enabling some processing to be skipped. If by its judicious
>>>> use the heuristic is only fed interactive sleep data the heuristics
>>>> accuracy in identifying interactive tasks should be improved. It
>>>> may also allow the heuristic to be simplified.
>>>
>>>
>>> I disagree. You can nip and tuck all the bits of sleep time you
>>> want, and it'll just shift the lumpy spots around (btdt).
>>
>>
>> Yes, but there's a lot of (understandable) reluctance to do any major
>> rework of this part of the scheduler so we're stuck with nips and
>> tucks for the time being. This patch is a zero cost nip and tuck.
>
>
> Color me skeptical, but nonetheless, it looks to me like the mechanism
> might need the attached.
Is that patch complete? (This is all I got.)
--- linux-2.6.15/kernel/sched.c.org Fri Jan 6 08:44:09 2006
+++ linux-2.6.15/kernel/sched.c Fri Jan 6 08:51:03 2006
@@ -1353,7 +1353,7 @@
out_activate:
#endif /* CONFIG_SMP */
- if (old_state == TASK_UNINTERRUPTIBLE) {
+ if (old_state & TASK_UNINTERRUPTIBLE) {
rq->nr_uninterruptible--;
/*
* Tasks on involuntary sleep don't earn
@@ -3010,7 +3010,7 @@
unlikely(signal_pending(prev))))
prev->state = TASK_RUNNING;
else {
- if (prev->state == TASK_UNINTERRUPTIBLE)
+ if (prev->state & TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
deactivate_task(prev, rq);
}
In the absence of any use of TASK_NONINTERACTIVE in conjunction with
TASK_UNINTERRUPTIBLE it will have no effect. Personally, I think that
all TASK_UNINTERRUPTIBLE sleeps should be treated as non-interactive
rather than just being heavily discounted (and that TASK_NONINTERACTIVE
shouldn't be needed in conjunction with it) BUT I may be wrong
especially w.r.t. media streamers such as audio and video players and
the mechanisms they use to do sleeps between cpu bursts.
>
> On the subject of nip and tuck, take a look at the little proggy posted
> in thread [SCHED] wrong priority calc - SIMPLE test case. That testcase
> was the result of Paolo Ornati looking into a real problem on his
> system. I just 'fixed' that nanosleep() problem by judicious
> application of TASK_NONINTERACTIVE to the schedule_timeout(). Sure, it
> works, but it doesn't look like anything but a bandaid (tourniquet in
> this case:) to me.
>
> -Mike
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
At 12:11 PM 1/7/2006 +1100, Peter Williams wrote:
>Is that patch complete? (This is all I got.)
Yes.
>--- linux-2.6.15/kernel/sched.c.org Fri Jan 6 08:44:09 2006
>+++ linux-2.6.15/kernel/sched.c Fri Jan 6 08:51:03 2006
>@@ -1353,7 +1353,7 @@
>
> out_activate:
> #endif /* CONFIG_SMP */
>- if (old_state == TASK_UNINTERRUPTIBLE) {
>+ if (old_state & TASK_UNINTERRUPTIBLE) {
> rq->nr_uninterruptible--;
> /*
> * Tasks on involuntary sleep don't earn
>@@ -3010,7 +3010,7 @@
> unlikely(signal_pending(prev))))
> prev->state = TASK_RUNNING;
> else {
>- if (prev->state == TASK_UNINTERRUPTIBLE)
>+ if (prev->state & TASK_UNINTERRUPTIBLE)
> rq->nr_uninterruptible++;
> deactivate_task(prev, rq);
> }
>
>In the absence of any use of TASK_NONINTERACTIVE in conjunction with
>TASK_UNINTERRUPTIBLE it will have no effect.
Exactly. It's only life insurance.
> Personally, I think that all TASK_UNINTERRUPTIBLE sleeps should be
> treated as non interactive rather than just be heavily discounted (and
> that TASK_NONINTERACTIVE shouldn't be needed in conjunction with it) BUT
> I may be wrong especially w.r.t. media streamers such as audio and video
> players and the mechanisms they use to do sleeps between cpu bursts.
Try it, you won't like it. When I first examined sleep_avg woes, my
reaction was to nuke uninterruptible sleep too... boy did that ever _suck_ :)
I'm trying to think of ways to quell the nasty side of sleep_avg without
destroying the good. One method I've tinkered with in the past with
encouraging results is to compute a weighted slice_avg, which is a measure
of how long it takes you to use your slice, and scale it to match
MAX_SLEEPAVG for easy comparison. A possible use thereof: In order to be
classified interactive, you need the sleep_avg, but that's not enough...
you also have to have a record of sharing the cpu. When your slice_avg
degrades enough as you burn cpu, you no longer get to loop in the active
queue. Being relegated to the expired array though will improve your
slice_avg and let you regain your status. Your priority remains, so you
can still preempt, but you become mortal and have to share. When there is
a large disparity between sleep_avg and slice_avg, it can be used as a
general purpose throttle to trigger TASK_NONINTERACTIVE flagging in
schedule() as negative feedback for the ill behaved. Thoughts?
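A rough sketch of what that might look like, purely to make the idea
concrete: the slice_avg field and both helpers below are made up and do not
exist in sched.c (MAX_SLEEP_AVG is the existing sched.c constant referred to
above as MAX_SLEEPAVG).

#define SLICE_AVG_DECAY	4	/* weight of history vs the newest sample */

/* Called at slice renewal: 'slice' is the timeslice just consumed and
 * 'wall' is the wall-clock ticks it took to consume it (wall >= slice). */
static inline void update_slice_avg(task_t *p, unsigned long slice,
				    unsigned long wall)
{
	/* scale so a task that shares heavily tends towards MAX_SLEEP_AVG
	 * and one that burns back-to-back tends towards 0, for direct
	 * comparison with sleep_avg */
	unsigned long sample;

	if (!wall)
		wall = 1;
	sample = MAX_SLEEP_AVG - (MAX_SLEEP_AVG * slice) / wall;
	p->slice_avg = (p->slice_avg * (SLICE_AVG_DECAY - 1) + sample)
						/ SLICE_AVG_DECAY;
}

/* To loop on the active array the task needs the sleep_avg of an
 * interactive task *and* a record of sharing the cpu; a large disparity
 * between the two could likewise be used to flag TASK_NONINTERACTIVE in
 * schedule() as negative feedback. */
static inline int cpu_sharing_ok(task_t *p)
{
	return p->slice_avg >= MAX_SLEEP_AVG / 2;	/* arbitrary threshold */
}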
-Mike
Mike Galbraith wrote:
> At 12:11 PM 1/7/2006 +1100, Peter Williams wrote:
>
>> Is that patch complete? (This is all I got.)
>
>
> Yes.
>
>> --- linux-2.6.15/kernel/sched.c.org Fri Jan 6 08:44:09 2006
>> +++ linux-2.6.15/kernel/sched.c Fri Jan 6 08:51:03 2006
>> @@ -1353,7 +1353,7 @@
>>
>> out_activate:
>> #endif /* CONFIG_SMP */
>> - if (old_state == TASK_UNINTERRUPTIBLE) {
>> + if (old_state & TASK_UNINTERRUPTIBLE) {
>> rq->nr_uninterruptible--;
>> /*
>> * Tasks on involuntary sleep don't earn
>> @@ -3010,7 +3010,7 @@
>> unlikely(signal_pending(prev))))
>> prev->state = TASK_RUNNING;
>> else {
>> - if (prev->state == TASK_UNINTERRUPTIBLE)
>> + if (prev->state & TASK_UNINTERRUPTIBLE)
>> rq->nr_uninterruptible++;
>> deactivate_task(prev, rq);
>> }
>>
>> In the absence of any use of TASK_NONINTERACTIVE in conjunction with
>> TASK_UNINTERRUPTIBLE it will have no effect.
>
>
> Exactly. It's only life insurance.
>
>> Personally, I think that all TASK_UNINTERRUPTIBLE sleeps should be
>> treated as non interactive rather than just be heavily discounted (and
>> that TASK_NONINTERACTIVE shouldn't be needed in conjunction with it)
>> BUT I may be wrong especially w.r.t. media streamers such as audio and
>> video players and the mechanisms they use to do sleeps between cpu
>> bursts.
>
>
> Try it, you won't like it.
It's on my list of things to try.
> When I first examined sleep_avg woes, my
> reaction was to nuke uninterruptible sleep too... boy did that ever
> _suck_ :)
I look forward to seeing it. :-)
>
> I'm trying to think of ways to quell the nasty side of sleep_avg without
> destroying the good. One method I've tinkered with in the past with
> encouraging results is to compute a weighted slice_avg, which is a
> measure of how long it takes you to use your slice, and scale it to
> match MAX_SLEEPAVG for easy comparison. A possible use thereof: In
> order to be classified interactive, you need the sleep_avg, but that's
> not enough... you also have to have a record of sharing the cpu. When
> your slice_avg degrades enough as you burn cpu, you no longer get to
> loop in the active queue. Being relegated to the expired array though
> will improve your slice_avg and let you regain your status. Your
> priority remains, so you can still preempt, but you become mortal and
> have to share. When there is a large disparity between sleep_avg and
> slice_avg, it can be used as a general purpose throttle to trigger
> TASK_NONINTERACTIVE flagging in schedule() as negative feedback for the
> ill behaved. Thoughts?
Sounds like the kind of thing that's required. I think the deferred
shift from active to expired is safe as long as CPU hogs can't exploit
it and your scheme sounds like it might provide that assurance. One
problem this solution will experience is that when the system gets
heavily loaded every task will have small CPU usage rates (even the CPU
hogs) and this makes it harder to detect the CPU hogs. One slight
variation of your scheme would be to measure the average length of the
CPU runs that the task does (i.e. how long it runs without voluntarily
relinquishing the CPU) and not allowing them to defer the shift to the
expired array if this average run length is greater than some specified
value. The length of this average for each task shouldn't change with
system load. (This is more or less saying that it's ok for a task to
stay on the active array provided it's unlikely to delay the switch
between the active and expired arrays for very long.)
My own way around the problem is to nuke the expired/active arrays and
use a single priority array. That gets rid of the problem of deferred
shifting from active to expired altogether. :-)
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
At 05:34 PM 1/7/2006 +1100, Peter Williams wrote:
>Mike Galbraith wrote:
>
>>I'm trying to think of ways to quell the nasty side of sleep_avg without
>>destroying the good. One method I've tinkered with in the past with
>>encouraging results is to compute a weighted slice_avg, which is a
>>measure of how long it takes you to use your slice, and scale it to match
>>MAX_SLEEPAVG for easy comparison. A possible use thereof: In order to
>>be classified interactive, you need the sleep_avg, but that's not
>>enough... you also have to have a record of sharing the cpu. When your
>>slice_avg degrades enough as you burn cpu, you no longer get to loop in
>>the active queue. Being relegated to the expired array though will
>>improve your slice_avg and let you regain your status. Your priority
>>remains, so you can still preempt, but you become mortal and have to
>>share. When there is a large disparity between sleep_avg and slice_avg,
>>it can be used as a general purpose throttle to trigger
>>TASK_NONINTERACTIVE flagging in schedule() as negative feedback for the
>>ill behaved. Thoughts?
>
>Sounds like the kind of thing that's required. I think the deferred shift
>from active to expired is safe as long as CPU hogs can't exploit it and
>your scheme sounds like it might provide that assurance. One problem this
>solution will experience is that when the system gets heavily loaded every
>task will have small CPU usage rates (even the CPU hogs) and this makes it
>harder to detect the CPU hogs.
True. A gaggle of more or less equally well (or not) behaving tasks will
have their 'hogginess' diluted. I'll have to think more about scaling
with nr_running or maybe starting the clock at first tick of a new slice...
that should still catch most of the guys who are burning hard without being
preempted, or sleeping for short intervals only to keep coming right
back to beat up poor cc1. I think the real problem children should stick
out enough for a proof of concept even without additional complexity.
> One slight variation of your scheme would be to measure the average
> length of the CPU runs that the task does (i.e. how long it runs without
> voluntarily relinquishing the CPU) and not allowing them to defer the
> shift to the expired array if this average run length is greater than
> some specified value. The length of this average for each task shouldn't
> change with system load. (This is more or less saying that it's ok for a
> task to stay on the active array provided it's unlikely to delay the
> switch between the active and expired arrays for very long.)
Average burn time would indeed probably be a better metric, but that would
require doing bookkeeping in the fast path. I'd like to stick to tick time
or even better, slice renewal time if possible to keep it down on the 'dead
simple and dirt cheap' shelf. After all, this kind of thing is supposed to
accomplish absolutely nothing meaningful the vast majority of the time :)
Thanks for the feedback,
-Mike
On Saturday 07 January 2006 16:27, Mike Galbraith wrote:
> > Personally, I think that all TASK_UNINTERRUPTIBLE sleeps should be
> > treated as non interactive rather than just be heavily discounted (and
> > that TASK_NONINTERACTIVE shouldn't be needed in conjunction with it) BUT
> > I may be wrong especially w.r.t. media streamers such as audio and video
> > players and the mechanisms they use to do sleeps between cpu bursts.
>
> Try it, you won't like it. When I first examined sleep_avg woes, my
> reaction was to nuke uninterruptible sleep too... boy did that ever _suck_
> :)
Glad you've seen why I put the uninterruptible sleep logic in there. In
essence this is why the NFS client interactive case is not as nice - the NFS
code doesn't do "work on behalf of" a cpu hog with the TASK_UNINTERRUPTIBLE
state. The uninterruptible sleep detection logic made a massive difference to
interactivity when cpu bound tasks do disk I/O.
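For reference, the logic referred to here sits in recalc_task_prio();
roughly, and again paraphrased from memory rather than quoted verbatim, a
task that try_to_wake_up() marked with activated == -1 has its sleep credit
capped at the "just interactive" ceiling instead of accruing the full bonus.

	/*
	 * Tasks waking from uninterruptible sleep are limited in their
	 * sleep_avg rise, as they are most likely waiting on I/O done
	 * on behalf of a cpu burn rather than genuinely interacting.
	 */
	if (p->activated == -1 && p->mm) {
		if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
			sleep_time = 0;
		else if (p->sleep_avg + sleep_time >= INTERACTIVE_SLEEP(p)) {
			p->sleep_avg = INTERACTIVE_SLEEP(p);
			sleep_time = 0;
		}
	}
	/* whatever credit survives is then added to p->sleep_avg as usual */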
Cheers,
Con
At 08:30 PM 1/7/2006 +1100, Con Kolivas wrote:
>On Saturday 07 January 2006 16:27, Mike Galbraith wrote:
> > > Personally, I think that all TASK_UNINTERRUPTIBLE sleeps should be
> > > treated as non interactive rather than just be heavily discounted (and
> > > that TASK_NONINTERACTIVE shouldn't be needed in conjunction with it) BUT
> > > I may be wrong especially w.r.t. media streamers such as audio and video
> > > players and the mechanisms they use to do sleeps between cpu bursts.
> >
> > Try it, you won't like it. When I first examined sleep_avg woes, my
> > reaction was to nuke uninterruptible sleep too... boy did that ever _suck_
> > :)
>
>Glad you've seen why I put the uninterruptible sleep logic in there.
Yeah, if there's one thing worse than too much preemption, it's too little
preemption.
-Mike
Con Kolivas wrote:
> On Saturday 07 January 2006 16:27, Mike Galbraith wrote:
>
>>> Personally, I think that all TASK_UNINTERRUPTIBLE sleeps should be
>>>treated as non interactive rather than just be heavily discounted (and
>>>that TASK_NONINTERACTIVE shouldn't be needed in conjunction with it) BUT
>>>I may be wrong especially w.r.t. media streamers such as audio and video
>>>players and the mechanisms they use to do sleeps between cpu bursts.
>>
>>Try it, you won't like it. When I first examined sleep_avg woes, my
>>reaction was to nuke uninterruptible sleep too... boy did that ever _suck_
>>:)
>
>
> Glad you've seen why I put the uninterruptible sleep logic in there. In
> essence this is why the NFS client interactive case is not as nice - the NFS
> code doesn't do "work on behalf of" a cpu hog with the TASK_UNINTERRUPTIBLE
> state. The uninterruptible sleep detection logic made a massive difference to
> interactivity when cpu bound tasks do disk I/O.
TASK_NONINTERACTIVE doesn't mean that the task is a CPU hog. It just
means that this sleep should be ignored as far as determining whether
this task is interactive or not.
Also, compensation for uninterruptible sleeps should be handled by the
"fairness" mechanism (i.e. time slices and the active/expired arrays)
not the "interactive response" mechanism. In other words, doing a lot
of uninterruptible sleeps is (theoretically) not a sign that the task is
interactive or, for that matter, that it's non-interactive, so
(theoretically) it should just be ignored. That bad things happen when it
isn't ignored needs explaining.
I see two possible reasons:
1. Audio/video streamers aren't really interactive but we want to treat
them as such (to ensure they have low latency). The fact that they
aren't really interactive may mean that the sleeps they do between runs
are uninterruptible and if we don't count uninterruptible sleep we'll
miss them.
2. The X server isn't really a completely interactive program either.
It handles a lot of interactive work on behalf of interactive programs
(which should involve interactive sleeps and help get it classified as
interactive) but also does a lot of non-interactive stuff (which can be
CPU intensive and make it lose points due to CPU hoggishness) which
probably involves uninterruptible sleep. The combination of ignoring
the uninterruptible sleep and the occasional high CPU usage rate could
result in losing too much bonus with consequent poor interactive
responsiveness.
So it would be interesting to know which programs suffered badly when
uninterruptible sleep was ignored. This may enable an alternate
solution to be found.
In any case and in the meantime, perhaps the solution is to use
TASK_NONINTERACTIVE where needed but treat
TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE sleep the same as
TASK_UNINTERRUPTIBLE sleep instead of ignoring it?
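Expressed against the try_to_wake_up() logic sketched earlier, that
suggestion would amount to something like the following (illustrative and
untested, not a submitted patch): keep the nr_uninterruptible accounting as
it is, but mark noninteractive sleepers the same way uninterruptible ones
are marked instead of skipping the sleep_avg update.

	if (old_state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	/*
	 * Treat TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE sleeps like
	 * TASK_UNINTERRUPTIBLE ones: heavily discounted by
	 * recalc_task_prio() rather than ignored outright.
	 */
	if (old_state & (TASK_UNINTERRUPTIBLE | TASK_NONINTERACTIVE))
		p->activated = -1;

	activate_task(p, rq, cpu == this_cpu);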
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
Mike Galbraith wrote:
> At 05:34 PM 1/7/2006 +1100, Peter Williams wrote:
>
>> Mike Galbraith wrote:
>>
>>> I'm trying to think of ways to quell the nasty side of sleep_avg
>>> without destroying the good. One method I've tinkered with in the
>>> past with encouraging results is to compute a weighted slice_avg,
>>> which is a measure of how long it takes you to use your slice, and
>>> scale it to match MAX_SLEEPAVG for easy comparison. A possible use
>>> thereof: In order to be classified interactive, you need the
>>> sleep_avg, but that's not enough... you also have to have a record of
>>> sharing the cpu. When your slice_avg degrades enough as you burn cpu,
>>> you no longer get to loop in the active queue. Being relegated to
>>> the expired array though will improve your slice_avg and let you
>>> regain your status. Your priority remains, so you can still preempt,
>>> but you become mortal and have to share. When there is a large
>>> disparity between sleep_avg and slice_avg, it can be used as a
>>> general purpose throttle to trigger TASK_NONINTERACTIVE flagging in
>>> schedule() as negative feedback for the ill behaved. Thoughts?
>>
>>
>> Sounds like the kind of thing that's required. I think the deferred
>> shift from active to expired is safe as long as CPU hogs can't exploit
>> it and your scheme sounds like it might provide that assurance. One
>> problem this solution will experience is that when the system gets
>> heavily loaded every task will have small CPU usage rates (even the
>> CPU hogs) and this makes it harder to detect the CPU hogs.
>
>
> True. A gaggle of more or less equally well (or not) behaving tasks
> will have their 'hogginess' diluted. I'll have to think more about
> scaling with nr_running or maybe starting the clock at first tick of a
> new slice... that should still catch most of the guys who are burning
> hard without being preempted, or only sleeping for short intervals only
> to keep coming right back to beat up poor cc1. I think the real problem
> children should stick out enough for a proof of concept even without
> additional complexity.
>
>> One slight variation of your scheme would be to measure the average
>> length of the CPU runs that the task does (i.e. how long it runs
>> without voluntarily relinquishing the CPU) and not allowing them to
>> defer the shift to the expired array if this average run length is
>> greater than some specified value. The length of this average for
>> each task shouldn't change with system load. (This is more or less
>> saying that it's ok for a task to stay on the active array provided
>> it's unlikely to delay the switch between the active and expired
>> arrays for very long.)
>
>
> Average burn time would indeed probably be a better metric, but that
> would require doing bookkeeping is the fast path.
Most of the infrastructure is already there and the cost of doing the
extra bits required to get this metric would be extremely small. The
hardest bit would be deciding on the "limit" to be applied when deciding
whether to let a supposed interactive task stay on the active array.
From the statistical point of view, the distribution of random time
intervals with a given average length is such that about 99% of them
will be less than four times the average length. So a value of 1/4 of
the delay in array switches that can be tolerated would be about right.
But that still leaves the problem of what delay can be tolerated :-).
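A sketch of that variation, with made-up names throughout (nothing below
exists in sched.c): keep a decayed average of how long the task runs before
voluntarily giving up the cpu, and only allow the active-array requeue while
that average stays under a quarter of the array-switch delay we are prepared
to tolerate.

#define RUN_AVG_DECAY	8	/* weight of history vs the newest run */

/* Called whenever the task voluntarily gives up the cpu; 'run_ns' is how
 * long it ran since it was last scheduled in. */
static inline void update_run_avg(task_t *p, unsigned long long run_ns)
{
	p->avg_run_ns = (p->avg_run_ns * (RUN_AVG_DECAY - 1) + run_ns)
						/ RUN_AVG_DECAY;
}

/* Roughly 99% of run lengths fall under 4x the average, so cap the
 * average at a quarter of the tolerable array-switch delay. */
static inline int may_stay_on_active(task_t *p,
				     unsigned long long max_delay_ns)
{
	return p->avg_run_ns <= max_delay_ns / 4;
}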
> I'd like to stick to
> tick time or even better, slice renewal time if possible to keep it down
> on the 'dead simple and dirt cheap' shelf. After all, this kind of
> thing is supposed to accomplish absolutely nothing meaningful the vast
> majority of the time :)
>
By the way, it seems you have your own scheduler versions? If so are
you interested in adding them to the collection in PlugSched?
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Sunday 08 January 2006 10:31, Peter Williams wrote:
> In any case and in the meantime, perhaps the solution is to use
> TASK_NONINTERACTIVE where needed but treat
> TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE sleep the same as
> TASK_UNINTERRUPTIBLE sleep instead of ignoring it?
That's how I would tackle it.
Con
At 10:40 AM 1/8/2006 +1100, Peter Williams wrote:
>Mike Galbraith wrote:
>>> One slight variation of your scheme would be to measure the average
>>> length of the CPU runs that the task does (i.e. how long it runs
>>> without voluntarily relinquishing the CPU) and not allowing them to
>>> defer the shift to the expired array if this average run length is
>>> greater than some specified value. The length of this average for each
>>> task shouldn't change with system load. (This is more or less saying
>>> that it's ok for a task to stay on the active array provided it's
>>> unlikely to delay the switch between the active and expired arrays for
>>> very long.)
>>
>>Average burn time would indeed probably be a better metric, but that
>>would require doing bookkeeping is the fast path.
>
>Most of the infrastructure is already there and the cost of doing the
>extra bits required to get this metric would be extremely small. The
>hardest bit would be deciding on the "limit" to be applied when deciding
>whether to let a supposed interactive task stay on the active array.
Yeah, I noticed run_time when I started implementing my first cut. (which
is of course buggy)
>By the way, it seems you have your own scheduler versions? If so are you
>interested in adding them to the collection in PlugSched?
No, I used to do a bunch of experimentation in fairness vs interactivity,
but they all ended up just trading one weakness for another.
-Mike