Changelog
Fixes comments by akpm
- avoid creating new per-process flag PF_SWAPIN
Other changes
- do not mix spaces and tabs
delayacct-blkio-swapin.patch
Collect per-task block I/O delay statistics.
Unlike earlier iterations of the delay accounting
patches, now delays are only collected for the actual
I/O waits rather than try and cover the delays seen in
I/O submission paths.
Account separately for block I/O delays
incurred as a result of swapin page faults whose
frequency can be affected by the task/process' rss limit.
Hence swapin delays can act as feedback for rss limit changes
independent of I/O priority changes.
Signed-off-by: Shailabh Nagar <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
---
include/linux/delayacct.h | 25 +++++++++++++++++++++++++
include/linux/sched.h | 6 ++++++
kernel/delayacct.c | 19 +++++++++++++++++++
kernel/sched.c | 5 +++++
mm/memory.c | 4 ++++
5 files changed, 59 insertions(+)
diff -puN include/linux/delayacct.h~delayacct-blkio-swapin include/linux/delayacct.h
--- linux-2.6.17-rc3/include/linux/delayacct.h~delayacct-blkio-swapin 2006-04-28 23:48:43.000000000 +0530
+++ linux-2.6.17-rc3-balbir/include/linux/delayacct.h 2006-05-02 07:05:21.000000000 +0530
@@ -19,6 +19,13 @@
#include <linux/sched.h>
+/*
+ * Per-task flags relevant to delay accounting
+ * maintained privately to avoid exhausting similar flags in sched.h:PF_*
+ * Used to set current->delays->flags
+ */
+#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */
+
#ifdef CONFIG_TASK_DELAY_ACCT
extern int delayacct_on; /* Delay accounting turned on/off */
@@ -26,6 +33,8 @@ extern kmem_cache_t *delayacct_cache;
extern void delayacct_init(void);
extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
+extern void __delayacct_blkio_start(void);
+extern void __delayacct_blkio_end(void);
static inline void delayacct_set_flag(int flag)
{
@@ -53,6 +62,18 @@ static inline void delayacct_tsk_exit(st
__delayacct_tsk_exit(tsk);
}
+static inline void delayacct_blkio_start(void)
+{
+ if (current->delays)
+ __delayacct_blkio_start();
+}
+
+static inline void delayacct_blkio_end(void)
+{
+ if (current->delays)
+ __delayacct_blkio_end();
+}
+
#else
static inline void delayacct_set_flag(int flag)
{}
@@ -64,6 +85,10 @@ static inline void delayacct_tsk_init(st
{}
static inline void delayacct_tsk_exit(struct task_struct *tsk)
{}
+static inline void delayacct_blkio_start(void)
+{}
+static inline void delayacct_blkio_end(void)
+{}
#endif /* CONFIG_TASK_DELAY_ACCT */
#endif
diff -puN include/linux/sched.h~delayacct-blkio-swapin include/linux/sched.h
--- linux-2.6.17-rc3/include/linux/sched.h~delayacct-blkio-swapin 2006-04-28 23:48:43.000000000 +0530
+++ linux-2.6.17-rc3-balbir/include/linux/sched.h 2006-05-02 07:05:25.000000000 +0530
@@ -550,6 +550,12 @@ struct task_delay_info {
* Atomicity of updates to XXX_delay, XXX_count protected by
* single lock above (split into XXX_lock if contention is an issue).
*/
+
+ struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
+ u64 blkio_delay; /* wait for sync block io completion */
+ u64 swapin_delay; /* wait for swapin block io completion */
+ u32 blkio_count;
+ u32 swapin_count;
};
#endif
diff -puN kernel/delayacct.c~delayacct-blkio-swapin kernel/delayacct.c
--- linux-2.6.17-rc3/kernel/delayacct.c~delayacct-blkio-swapin 2006-04-28 23:48:43.000000000 +0530
+++ linux-2.6.17-rc3-balbir/kernel/delayacct.c 2006-05-02 07:27:03.000000000 +0530
@@ -85,3 +85,22 @@ static inline void delayacct_end(struct
spin_unlock(¤t->delays->lock);
}
+void __delayacct_blkio_start(void)
+{
+ delayacct_start(¤t->delays->blkio_start);
+}
+
+void __delayacct_blkio_end(void)
+{
+ if (current->delays->flags & DELAYACCT_PF_SWAPIN)
+ /* Swapin block I/O */
+ delayacct_end(¤t->delays->blkio_start,
+ ¤t->delays->blkio_end,
+ ¤t->delays->swapin_delay,
+ ¤t->delays->swapin_count);
+ else /* Other block I/O */
+ delayacct_end(¤t->delays->blkio_start,
+ ¤t->delays->blkio_end,
+ ¤t->delays->blkio_delay,
+ ¤t->delays->blkio_count);
+}
diff -puN kernel/sched.c~delayacct-blkio-swapin kernel/sched.c
--- linux-2.6.17-rc3/kernel/sched.c~delayacct-blkio-swapin 2006-04-28 23:48:43.000000000 +0530
+++ linux-2.6.17-rc3-balbir/kernel/sched.c 2006-05-02 07:05:25.000000000 +0530
@@ -50,6 +50,7 @@
#include <linux/times.h>
#include <linux/acct.h>
#include <linux/kprobes.h>
+#include <linux/delayacct.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
@@ -4170,9 +4171,11 @@ void __sched io_schedule(void)
{
struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+ delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
schedule();
atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
@@ -4182,9 +4185,11 @@ long __sched io_schedule_timeout(long ti
struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
long ret;
+ delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
ret = schedule_timeout(timeout);
atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
return ret;
}
diff -puN mm/memory.c~delayacct-blkio-swapin mm/memory.c
--- linux-2.6.17-rc3/mm/memory.c~delayacct-blkio-swapin 2006-04-28 23:48:43.000000000 +0530
+++ linux-2.6.17-rc3-balbir/mm/memory.c 2006-04-28 23:48:43.000000000 +0530
@@ -48,6 +48,7 @@
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/delayacct.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -1880,6 +1881,7 @@ static int do_swap_page(struct mm_struct
entry = pte_to_swp_entry(orig_pte);
again:
+ delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
@@ -1892,6 +1894,7 @@ again:
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
}
@@ -1903,6 +1906,7 @@ again:
mark_page_accessed(page);
lock_page(page);
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!PageSwapCache(page)) {
/* Page migration has occured */
unlock_page(page);
_
Balbir Singh <[email protected]> wrote:
>
> @@ -550,6 +550,12 @@ struct task_delay_info {
> * Atomicity of updates to XXX_delay, XXX_count protected by
> * single lock above (split into XXX_lock if contention is an issue).
> */
> +
> + struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
> + u64 blkio_delay; /* wait for sync block io completion */
> + u64 swapin_delay; /* wait for swapin block io completion */
> + u32 blkio_count;
> + u32 swapin_count;
These fields are a bit mystifying.
In what units are blkio_delay and swapin_delay?
What is the meaning behind blkio_count and swapin_count?
Better comments needed, please.
On Mon, May 08, 2006 at 02:19:52PM -0700, Andrew Morton wrote:
> Balbir Singh <[email protected]> wrote:
> >
> > @@ -550,6 +550,12 @@ struct task_delay_info {
> > * Atomicity of updates to XXX_delay, XXX_count protected by
> > * single lock above (split into XXX_lock if contention is an issue).
> > */
> > +
> > + struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
> > + u64 blkio_delay; /* wait for sync block io completion */
> > + u64 swapin_delay; /* wait for swapin block io completion */
> > + u32 blkio_count;
> > + u32 swapin_count;
>
> These fields are a bit mystifying.
>
> In what units are blkio_delay and swapin_delay?
>
> What is the meaning behind blkio_count and swapin_count?
>
> Better comments needed, please.
Will add more detailed comments and send them as updates.
Thanks,
Balbir Singh,
Linux Technology Center,
IBM Software Labs
Balbir Singh wrote:
>On Mon, May 08, 2006 at 02:19:52PM -0700, Andrew Morton wrote:
>
>>Balbir Singh <[email protected]> wrote:
>>
>>>@@ -550,6 +550,12 @@ struct task_delay_info {
>>> * Atomicity of updates to XXX_delay, XXX_count protected by
>>> * single lock above (split into XXX_lock if contention is an issue).
>>> */
>>>+
>>>+ struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
>>>+ u64 blkio_delay; /* wait for sync block io completion */
>>>+ u64 swapin_delay; /* wait for swapin block io completion */
>>>+ u32 blkio_count;
>>>+ u32 swapin_count;
>>>
>>These fields are a bit mystifying.
>>
>>In what units are blkio_delay and swapin_delay?
>>
>>What is the meaning behind blkio_count and swapin_count?
>>
>>Better comments needed, please.
>>
>
>Will add more detailed comments and send them as updates.
>
What kinds of usages will this stuff see? Will the CONFIG be usually
turned on,
with some tasks occasionally using the statistics?
In which case, might it be better to make each delay collector in its
own data
structure { .list; .start; .end; .delay; .count; .private; .name }, and
allocate
them and hang them off the task structure when they're in use?
Or even put them in their own data structure (a small hash or something).
OTOH if they're often going to be in use by many tasks, then what you
have might
be the best option.
Nick
Send instant messages to your online friends http://au.messenger.yahoo.com
On Tue, May 09, 2006 at 02:23:15PM +1000, Nick Piggin wrote:
> Balbir Singh wrote:
>
> >On Mon, May 08, 2006 at 02:19:52PM -0700, Andrew Morton wrote:
> >
> >>Balbir Singh <[email protected]> wrote:
> >>
> >>>@@ -550,6 +550,12 @@ struct task_delay_info {
> >>> * Atomicity of updates to XXX_delay, XXX_count protected by
> >>> * single lock above (split into XXX_lock if contention is an issue).
> >>> */
> >>>+
> >>>+ struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
> >>>+ u64 blkio_delay; /* wait for sync block io completion */
> >>>+ u64 swapin_delay; /* wait for swapin block io completion */
> >>>+ u32 blkio_count;
> >>>+ u32 swapin_count;
> >>>
> >>These fields are a bit mystifying.
> >>
> >>In what units are blkio_delay and swapin_delay?
> >>
> >>What is the meaning behind blkio_count and swapin_count?
> >>
> >>Better comments needed, please.
> >>
> >
> >Will add more detailed comments and send them as updates.
> >
>
> What kinds of usages will this stuff see? Will the CONFIG be usually
> turned on,
> with some tasks occasionally using the statistics?
>
> In which case, might it be better to make each delay collector in its
> own data
> structure { .list; .start; .end; .delay; .count; .private; .name }, and
> allocate
> them and hang them off the task structure when they're in use?
>
> Or even put them in their own data structure (a small hash or something).
>
> OTOH if they're often going to be in use by many tasks, then what you
> have might
> be the best option.
>
> Nick
>
> Send instant messages to your online friends http://au.messenger.yahoo.com
I expect/hope that the CONFIG will be turned on. There is a boot
option (called delayacct) to enable/disable the statistics collection.
Once turned on and enabled, all tasks will be filling in/using the statistics.
Thanks,
Balbir Singh,
Linux Technology Center,
IBM Software Labs
Balbir Singh wrote:
>
>I expect/hope that the CONFIG will be turned on. There is a boot
>option (called delayacct) to enable/disable the statistics collection.
>Once turned on and enabled, all tasks will be filling in/using the statistics.
>
Well they'll be _collecting_ the stats, yes. Will they really be using
them for anything?
If you make the whole thing much lighter weight for tasks which aren't
using the accounting, you have a better chance of people turning the
CONFIG option on.
--
Send instant messages to your online friends http://au.messenger.yahoo.com
On Tue, May 09, 2006 at 03:57:06PM +1000, Nick Piggin wrote:
> Balbir Singh wrote:
>
> >
> >I expect/hope that the CONFIG will be turned on. There is a boot
> >option (called delayacct) to enable/disable the statistics collection.
> >Once turned on and enabled, all tasks will be filling in/using the
> >statistics.
> >
>
> Well they'll be _collecting_ the stats, yes. Will they really be using
> them for anything?
Hmm.. No, the statistics are sent down using the netlink interface
to listeners on the netlink group (on every task exit) or to the task that
actually requested for the delay accounting data.
The stats are currently gathered in kernel and used by user space.
>
> If you make the whole thing much lighter weight for tasks which aren't
> using the accounting, you have a better chance of people turning the
> CONFIG option on.
>
I am not sure I understand the point completely. Are you suggesting that
struct task_delay_info be moved to common data structure as an aggregate
containing all the delay stats data?
Thanks,
Balbir Singh,
Linux Technology Center,
IBM Software Labs
Balbir Singh wrote:
>On Tue, May 09, 2006 at 03:57:06PM +1000, Nick Piggin wrote:
>
>>Well they'll be _collecting_ the stats, yes. Will they really be using
>>them for anything?
>>
>
>Hmm.. No, the statistics are sent down using the netlink interface
>to listeners on the netlink group (on every task exit) or to the task that
>actually requested for the delay accounting data.
>
>The stats are currently gathered in kernel and used by user space.
>
So... what are the consumers of this data going to be? That is my question.
>>If you make the whole thing much lighter weight for tasks which aren't
>>using the accounting, you have a better chance of people turning the
>>CONFIG option on.
>>
>>
>
>I am not sure I understand the point completely. Are you suggesting that
>struct task_delay_info be moved to common data structure as an aggregate
>containing all the delay stats data?
>
My suggestion is basically this: if the accounting is going to be used
infrequently, it might be a good idea to allocate the accounting structures
on demand, and only perform the accounting when these structures are
allocated.
It all adds up. Extra cache misses, more icache, more logic, etc... I
suspect
that relatively few people will care about these stats.
--
Send instant messages to your online friends http://au.messenger.yahoo.com
On Tue, May 09, 2006 at 06:20:12PM +1000, Nick Piggin wrote:
> Balbir Singh wrote:
>
> >On Tue, May 09, 2006 at 03:57:06PM +1000, Nick Piggin wrote:
> >
> >>Well they'll be _collecting_ the stats, yes. Will they really be using
> >>them for anything?
> >>
> >
> >Hmm.. No, the statistics are sent down using the netlink interface
> >to listeners on the netlink group (on every task exit) or to the task that
> >actually requested for the delay accounting data.
> >
> >The stats are currently gathered in kernel and used by user space.
> >
>
> So... what are the consumers of this data going to be? That is my question.
More details on the consumers of this data is available at
http://lkml.org/lkml/2006/3/13/367
>
> >>If you make the whole thing much lighter weight for tasks which aren't
> >>using the accounting, you have a better chance of people turning the
> >>CONFIG option on.
> >>
> >>
> >
> >I am not sure I understand the point completely. Are you suggesting that
> >struct task_delay_info be moved to common data structure as an aggregate
> >containing all the delay stats data?
> >
>
> My suggestion is basically this: if the accounting is going to be used
> infrequently, it might be a good idea to allocate the accounting structures
> on demand, and only perform the accounting when these structures are
> allocated.
>
> It all adds up. Extra cache misses, more icache, more logic, etc... I
> suspect
> that relatively few people will care about these stats.
>
Thanks for clarifying. I now understand your suggestion better.
The accounting is going to be frequent, with data from all tasks in the
system being collected and processed frequently. Since the accounting is
frequent, I think the current scheme works better than on-demand allocation.
Regarding the usefulness of these stats, please see
http://www.uwsg.iu.edu/hypermail/linux/kernel/0604.2/1731.html
Balbir Singh,
Linux Technology Center,
IBM Software Labs
Balbir Singh wrote:
> On Tue, May 09, 2006 at 06:20:12PM +1000, Nick Piggin wrote:
>>So... what are the consumers of this data going to be? That is my question.
>
>
> More details on the consumers of this data is available at
> http://lkml.org/lkml/2006/3/13/367
Profiling, monitoring, and control (workload management).
Sounds like something that at the moment, most users and most
applications will not use, most of the time.
I won't question the usefulness of the statistics to some
applications, but at the moment is it is reasonable to add this
overhead for everyone, or even for all tasks? Adding a bit more
work for those that want the stats won't be too bad.
Thanks,
Nick
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Mon, May 08, 2006 at 02:19:52PM -0700, Andrew Morton wrote:
> Balbir Singh <[email protected]> wrote:
> >
> > @@ -550,6 +550,12 @@ struct task_delay_info {
> > * Atomicity of updates to XXX_delay, XXX_count protected by
> > * single lock above (split into XXX_lock if contention is an issue).
> > */
> > +
> > + struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
> > + u64 blkio_delay; /* wait for sync block io completion */
> > + u64 swapin_delay; /* wait for swapin block io completion */
> > + u32 blkio_count;
> > + u32 swapin_count;
>
> These fields are a bit mystifying.
>
> In what units are blkio_delay and swapin_delay?
>
> What is the meaning behind blkio_count and swapin_count?
>
> Better comments needed, please.
Hi, Andrew,
Here is an update, that adds comments to the fields as suggested in the
review comments
Balbir Singh,
Linux Technology Center,
IBM Software Labs
Changelog
1. Add comments to the task_delay_info structure, documenting the units
of delay and document the meaning of the count fields in the structure.
Signed-off-by: Balbir Singh <[email protected]>
---
include/linux/sched.h | 11 +++++++++--
1 files changed, 9 insertions(+), 2 deletions(-)
diff -puN include/linux/sched.h~task-delay-add-comments-on-units include/linux/sched.h
--- linux-2.6.17-rc3/include/linux/sched.h~task-delay-add-comments-on-units 2006-05-10 14:35:46.000000000 +0530
+++ linux-2.6.17-rc3-balbir/include/linux/sched.h 2006-05-10 14:43:39.000000000 +0530
@@ -553,11 +553,18 @@ struct task_delay_info {
* single lock above (split into XXX_lock if contention is an issue).
*/
+ /*
+ * XXX_count is incremented on every XXX operation, the delay
+ * associated with the operation is added to XXX_delay.
+ * XXX_delay contains the accumulated delay time in nanoseconds.
+ */
struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
u64 blkio_delay; /* wait for sync block io completion */
u64 swapin_delay; /* wait for swapin block io completion */
- u32 blkio_count;
- u32 swapin_count;
+ u32 blkio_count; /* total count of the number of sync block */
+ /* io operations performed */
+ u32 swapin_count; /* total count of the number of swapin block */
+ /* io operations performed */
};
#endif /* CONFIG_TASK_DELAY_ACCT */
_