From cb80fe74bab69449c828d7433fbbea807816eee8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <[email protected]>
Date: Mon, 20 Jul 2009 11:26:58 -0700
Subject: [PATCH] sched: Provide iowait counters
For counting how long an application has been waiting for (disk) IO,
there currently is only the HZ sample driven information available, while
for all other counters in this class, a high resolution version is
available via CONFIG_SCHEDSTATS.
In order to make an improved bootchart tool possible, we also need
a higher resolution version of the iowait time.
This patch below adds this scheduler statistic to the kernel.
Signed-off-by: Arjan van de Ven <[email protected]>
---
include/linux/sched.h | 3 +++
kernel/sched.c | 4 ++++
kernel/sched_debug.c | 4 ++++
kernel/sched_fair.c | 5 +++++
4 files changed, 16 insertions(+), 0 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 16a982e..2b8f1f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1110,6 +1110,8 @@ struct sched_entity {
u64 wait_max;
u64 wait_count;
u64 wait_sum;
+ u64 iowait_count;
+ u64 iowait_sum;
u64 sleep_start;
u64 sleep_max;
@@ -1229,6 +1231,7 @@ struct task_struct {
unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
+ unsigned in_iowait:1;
pid_t pid;
pid_t tgid;
diff --git a/kernel/sched.c b/kernel/sched.c
index 98972d3..caa1974 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6662,7 +6662,9 @@ void __sched io_schedule(void)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ current->in_iowait = 1;
schedule();
+ current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
}
@@ -6675,7 +6677,9 @@ long __sched io_schedule_timeout(long timeout)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ current->in_iowait = 1;
ret = schedule_timeout(timeout);
+ current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
return ret;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b..5ddbd08 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.wait_max);
PN(se.wait_sum);
P(se.wait_count);
+ PN(se.iowait_sum);
+ P(se.iowait_count);
P(sched_info.bkl_count);
P(se.nr_migrations);
P(se.nr_migrations_cold);
@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
p->se.wait_max = 0;
p->se.wait_sum = 0;
p->se.wait_count = 0;
+ p->se.iowait_sum = 0;
+ p->se.iowait_count = 0;
p->se.sleep_max = 0;
p->se.sum_sleep_runtime = 0;
p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7c248dc..3db22e6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -633,6 +633,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->block_start = 0;
se->sum_sleep_runtime += delta;
+ if (tsk->in_iowait) {
+ se->iowait_sum += delta;
+ se->iowait_count++;
+ }
+
/*
* Blocking time is in units of nanosecs, so shift by 20 to
* get a milliseconds-range estimation of the amount of
--
1.6.0.6
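For illustration of how a tool would consume these counters (this is not part of the patch): once it is applied, and CONFIG_SCHEDSTATS plus CONFIG_SCHED_DEBUG are enabled, the new fields appear in /proc/<pid>/sched via proc_sched_show_task(). A minimal userspace sketch that polls them could look like the following; only the "se.iowait_sum"/"se.iowait_count" field names come from the patch, the rest is hypothetical glue.

/* poll_iowait.c: print the schedstat iowait fields for one pid */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
        char path[64], line[256];
        FILE *f;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path), "/proc/%s/sched", argv[1]);
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                /* field names as printed by the PN()/P() macros above */
                if (strstr(line, "se.iowait_sum") ||
                    strstr(line, "se.iowait_count"))
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}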
On Mon, 2009-07-20 at 11:31 -0700, Arjan van de Ven wrote:
> From cb80fe74bab69449c828d7433fbbea807816eee8 Mon Sep 17 00:00:00 2001
> From: Arjan van de Ven <[email protected]>
> Date: Mon, 20 Jul 2009 11:26:58 -0700
> Subject: [PATCH] sched: Provide iowait counters
>
> For counting how long an application has been waiting for (disk) IO,
> there currently is only the HZ sample driven information available, while
> for all other counters in this class, a high resolution version is
> available via CONFIG_SCHEDSTATS.
>
> In order to make an improved bootchart tool possible, we also need
> a higher resolution version of the iowait time.
>
> This patch below adds this scheduler statistic to the kernel.
>
> Signed-off-by: Arjan van de Ven <[email protected]>
Seems sane enough, I'll pick her up. Thanks!
> +++ b/kernel/sched_fair.c
> @@ -633,6 +633,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> se->block_start = 0;
> se->sum_sleep_runtime += delta;
>
> + if (tsk->in_iowait) {
> + se->iowait_sum += delta;
> + se->iowait_count++;
> + }
> +
> /*
> * Blocking time is in units of nanosecs, so shift by 20 to
> * get a milliseconds-range estimation of the amount of
It might be nice to put a tracepoint there as well; if only there were a
way to specify perf counter attributes in the TRACE_EVENT() magic, we
could feed stuff into perf_tpcounter_event().
TP_perf_addr() -- defaults to 0 when not specified
TP_perf_count() -- defaults to 1 when not specified.
Steve, Frederic, is there any way to make that happen?
Failing that we could put an actual swcounter in there I suppose.
That way we could profile applications based on IO-wait, which would be
cool.
Peter Zijlstra wrote:
>
> It might be nice to put a tracepoint there as well, now if there was a
> way to specify perf counter attributes in the TRACE_EVENT() magic so
> that we can feed stuff into perf_tpcounter_event().
>
> TP_perf_addr() -- defaults to 0 when not specified
> TP_perf_count() -- defaults to 1 when not specified.
>
> Steve, Frederic, is there any way to make that happen?
>
> Failing that we could put an actual swcounter in there I suppose.
>
> That way we could profile applications based on IO-wait, which would be
> cool.
it'd be nice if this was also a perfcounter event, no doubt about that
(I already have some usages in mind).... but that'd be a separate patch.
On Mon, 20 Jul 2009, Peter Zijlstra wrote:
>
> > +++ b/kernel/sched_fair.c
> > @@ -633,6 +633,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > se->block_start = 0;
> > se->sum_sleep_runtime += delta;
> >
> > + if (tsk->in_iowait) {
> > + se->iowait_sum += delta;
> > + se->iowait_count++;
> > + }
> > +
> > /*
> > * Blocking time is in units of nanosecs, so shift by 20 to
> > * get a milliseconds-range estimation of the amount of
>
> It might be nice to put a tracepoint there as well, now if there was a
> way to specify perf counter attributes in the TRACE_EVENT() magic so
> that we can feed stuff into perf_tpcounter_event().
>
> TP_perf_addr() -- defaults to 0 when not specified
> TP_perf_count() -- defaults to 1 when not specified.
>
> Steve, Frederic, is there any way to make that happen?
>
> Failing that we could put an actual swcounter in there I suppose.
>
> That way we could profile applications based on IO-wait, which would be
> cool.
How would you imagine an interface for this? Could you come up with
something that you would like to see, and then I could see if we could
implement it.
How would the TRACE_EVENT look exactly? Would there be anything different
in the trace point location itself?
Thanks,
-- Steve
On Mon, 2009-07-20 at 15:42 -0400, Steven Rostedt wrote:
> On Mon, 20 Jul 2009, Peter Zijlstra wrote:
> >
> > > +++ b/kernel/sched_fair.c
> > > @@ -633,6 +633,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > > se->block_start = 0;
> > > se->sum_sleep_runtime += delta;
> > >
> > > + if (tsk->in_iowait) {
> > > + se->iowait_sum += delta;
> > > + se->iowait_count++;
> > > + }
> > > +
> > > /*
> > > * Blocking time is in units of nanosecs, so shift by 20 to
> > > * get a milliseconds-range estimation of the amount of
> >
> > It might be nice to put a tracepoint there as well, now if there was a
> > way to specify perf counter attributes in the TRACE_EVENT() magic so
> > that we can feed stuff into perf_tpcounter_event().
> >
> > TP_perf_addr() -- defaults to 0 when not specified
> > TP_perf_count() -- defaults to 1 when not specified.
> >
> > Steve, Frederic, is there any way to make that happen?
> >
> > Failing that we could put an actual swcounter in there I suppose.
> >
> > That way we could profile applications based on IO-wait, which would be
> > cool.
>
> How would you imagine an interface for this? Could you come up with
> something that you would like to see, and then I could see if we could
> implement it.
>
> How would the TRACE_EVENT look exactly? Would there be anything different
> in the trace point location itself?
Something like:
TRACE_EVENT(sched_iowait,

        TP_PROTO(struct task_struct *p, u64 time),

        TP_ARGS(p, time),

        TP_STRUCT__entry(
                __field(pid_t,  pid     )
                __field(u64,    time    )
        ),

        TP_fast_assign(
                __entry->pid    = p->pid;
                __entry->time   = time;

                __perf_count(time);
        ),

        TP_printk("task %d waited for IO for %Lu ns",
                __entry->pid, __entry->time)
);
Something like the below, except that it's probably borken in interesting
ways..
---
include/trace/ftrace.h | 19 ++++++++++++++-----
kernel/perf_counter.c | 6 +++---
2 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 1867553..7e550a2 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -447,11 +447,20 @@ static inline int ftrace_get_offsets_##call( \
#define TP_FMT(fmt, args...) fmt "\n", ##args
#ifdef CONFIG_EVENT_PROFILE
-#define _TRACE_PROFILE(call, proto, args) \
+
+#undef __perf_addr
+#define __perf_addr(a) addr = (a)
+
+#undef __perf_count
+#define __perf_count(c) count = (c)
+
+#define _TRACE_PROFILE(call, proto, args, assign) \
static void ftrace_profile_##call(proto) \
{ \
- extern void perf_tpcounter_event(int); \
- perf_tpcounter_event(event_##call.id); \
+ extern void perf_tpcounter_event(int, u64, u64); \
+ u64 addr = 0, count = 1; \
+ { assign; } \
+ perf_tpcounter_event(event_##call.id, addr, count); \
} \
\
static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
@@ -476,7 +485,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
.profile_disable = ftrace_profile_disable_##call,
#else
-#define _TRACE_PROFILE(call, proto, args)
+#define _TRACE_PROFILE(call, proto, args, assign)
#define _TRACE_PROFILE_INIT(call)
#endif
@@ -502,7 +511,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args), PARAMS(assign)) \
\
static struct ftrace_event_call event_##call; \
\
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5498890..879a6ce 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3680,17 +3680,17 @@ static const struct pmu perf_ops_task_clock = {
};
#ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count)
{
struct perf_sample_data data = {
.regs = get_irq_regs(),
- .addr = 0,
+ .addr = addr,
};
if (!data.regs)
data.regs = task_pt_regs(current);
- do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+ do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
}
EXPORT_SYMBOL_GPL(perf_tpcounter_event);
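For concreteness, the call site for such a sched_iowait tracepoint would presumably sit in the iowait branch that Arjan's patch adds to enqueue_sleeper() in kernel/sched_fair.c. A sketch (not a posted patch):

        if (tsk->in_iowait) {
                se->iowait_sum += delta;
                se->iowait_count++;
                /* hypothetical hook for the TRACE_EVENT() above */
                trace_sched_iowait(tsk, delta);
        }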
On Mon, 20 Jul 2009, Peter Zijlstra wrote:
> >
> > How would the TRACE_EVENT look exactly? Would there be anything different
> > in the trace point location itself?
>
> Something like:
>
> TRACE_EVENT(sched_iowait,
>
> TP_PROTO(struct task_struct *p, u64 time),
>
> TP_ARGS(p, time),
>
> TP_STRUCT__entry(
> __field(pid_t, pid )
> __field(u64, time)
> ),
>
> TP_fast_assign(
> __entry->pid = p->pid;
> __entry->time = time;
>
> __perf_count(time);
> ),
>
> TP_printk("task %d waited for IO for %Lu ns",
> __entry->pid, __entry->time)
> );
>
> Something like the below, except that its probably borken in interesting
> ways..
>
> ---
> include/trace/ftrace.h | 19 ++++++++++++++-----
> kernel/perf_counter.c | 6 +++---
> 2 files changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
> index 1867553..7e550a2 100644
> --- a/include/trace/ftrace.h
> +++ b/include/trace/ftrace.h
> @@ -447,11 +447,20 @@ static inline int ftrace_get_offsets_##call( \
> #define TP_FMT(fmt, args...) fmt "\n", ##args
>
> #ifdef CONFIG_EVENT_PROFILE
> -#define _TRACE_PROFILE(call, proto, args) \
> +
> +#undef __perf_addr
> +#define __perf_addr(a) addr = (a)
> +
> +#undef __perf_count
> +#define __perf_count(c) count = (c)
> +
> +#define _TRACE_PROFILE(call, proto, args, assign) \
> static void ftrace_profile_##call(proto) \
> { \
> - extern void perf_tpcounter_event(int); \
> - perf_tpcounter_event(event_##call.id); \
> + extern void perf_tpcounter_event(int, u64, u64); \
> + u64 addr = 0, count = 1; \
> + { assign; } \
> + perf_tpcounter_event(event_##call.id, addr, count); \
The problem here is that the assign will also do the:

        TP_fast_assign(
                __entry->pid    = p->pid;
                __entry->time   = time;

part, and thus you will probably get errors when processing the __entry part.
What about doing instead:
TRACE_EVENT(sched_iowait,

        TP_PROTO(struct task_struct *p, u64 time),

        TP_ARGS(p, time),

        TP_STRUCT__entry(
                __field(pid_t,  pid     )
                __field(u64,    time    )
        ),

        TP_fast_assign(
                __entry->pid    = p->pid;
                __entry->time   = time;
        ) __perf_count(time),

        TP_printk("task %d waited for IO for %Lu ns",
                __entry->pid, __entry->time)
);
Then we could simply do:

        #undef __perf_count
        #define __perf_count(a)

        [ do all the ftrace event work ]

        #undef TP_fast_assign
        #define TP_fast_assign(a...)

        #undef __perf_count
        #define __perf_count(c) count = (c);
-- Steve
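The two-pass idea above relies on the standard ftrace.h trick of expanding the same event description several times with different definitions of the helper macros. A toy, self-contained illustration of just that preprocessor pattern (none of this is the real include/trace/ftrace.h):

#include <stdio.h>

/* The event body is written once, as with TRACE_EVENT(). */
#define SCHED_IOWAIT_BODY                               \
        TP_fast_assign(                                 \
                entry_time = time;                      \
        ) __perf_count(time)

/* Pass 1, "ftrace" side: keep the assignment, drop __perf_count(). */
#define TP_fast_assign(...)     __VA_ARGS__
#define __perf_count(c)

static int ftrace_side(int time)
{
        int entry_time = 0;

        SCHED_IOWAIT_BODY;
        return entry_time;      /* the assignment ran */
}

/* Pass 2, "perf" side: drop the assignment, keep only the count. */
#undef TP_fast_assign
#define TP_fast_assign(...)
#undef __perf_count
#define __perf_count(c)         count = (c)

static int perf_side(int time)
{
        int count = 1;          /* default when no __perf_count() given */

        SCHED_IOWAIT_BODY;
        return count;           /* picked up the time argument */
}

int main(void)
{
        printf("entry_time=%d count=%d\n", ftrace_side(42), perf_side(42));
        return 0;
}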
On Mon, 2009-07-20 at 16:26 -0400, Steven Rostedt wrote:
> > +#define _TRACE_PROFILE(call, proto, args, assign) \
> > static void ftrace_profile_##call(proto) \
> > { \
> > - extern void perf_tpcounter_event(int); \
> > - perf_tpcounter_event(event_##call.id); \
> > + extern void perf_tpcounter_event(int, u64, u64); \
> > + u64 addr = 0, count = 1; \
> > + { assign; } \
> > + perf_tpcounter_event(event_##call.id, addr, count); \
>
> The problem here is that the assign also will do the :
>
> TP_fast_assign(
> __entry->pid = p->pid;
> __entry->time = time;
>
> part, thus you will probably get errors in processing the __entry part.
Oh, right, no macros wrapping that..
> What about doing instead:
>
> TRACE_EVENT(sched_iowait,
>
> TP_PROTO(struct task_struct *p, u64 time),
>
> TP_ARGS(p, time),
>
> TP_STRUCT__entry(
> __field(pid_t, pid )
> __field(u64, time)
> ),
>
> TP_fast_assign(
> __entry->pid = p->pid;
> __entry->time = time;
>
> ) __perf_count(time),
>
> TP_printk("task %d waited for IO for %Lu ns",
> __entry->pid, __entry->time)
> );
>
> Then we could simply do:
>
> #undef __perf_count
> #define __perf_count(a)
>
> [ do all the ftrace event work ]
>
> #undef TP_fast_assign
> #define TP_fast_assign(a...)
>
> #undef __perf_count
> #define __perf_count(c) count = (c);
Hmm, should we maybe do something like:

        TP_fast_assign(
                ...
        )
        TP_perf_assign(
                addr = 0;
                count = time;
        ),

So we can extend it over time?
But otherwise, yes, looks nice. Let's make it happen ;-)
On Mon, 20 Jul 2009, Peter Zijlstra wrote:
>
> Hmm, should we maybe do something like:
>
> TP_fast_assign(
> ...
> )
> TP_perf_assign(
> addr = 0;
> count = time;
> ),
>
> So we can extend it over time?
Sure.
>
> But otherwise, yes, looks nice. Lets make it happen ;-)
Want to send me a patch that I can play with ;-)
-- Steve
On Mon, 20 Jul 2009 11:31:47 -0700 Arjan van de Ven <[email protected]> wrote:
> For counting how long an application has been waiting for (disk) IO,
> there currently is only the HZ sample driven information available, while
> for all other counters in this class, a high resolution version is
> available via CONFIG_SCHEDSTATS.
>
> In order to make an improved bootchart tool possible, we also need
> a higher resolution version of the iowait time.
>
> This patch below adds this scheduler statistic to the kernel.
Doesn't this duplicate the delay accounting already available via the
taskstats interface?
Andrew Morton wrote:
> On Mon, 20 Jul 2009 11:31:47 -0700 Arjan van de Ven <[email protected]> wrote:
>
>> For counting how long an application has been waiting for (disk) IO,
>> there currently is only the HZ sample driven information available, while
>> for all other counters in this class, a high resolution version is
>> available via CONFIG_SCHEDSTATS.
>>
>> In order to make an improved bootchart tool possible, we also need
>> a higher resolution version of the iowait time.
>>
>> This patch below adds this scheduler statistic to the kernel.
>
> Doesn't this duplicate the delay accounting already available via the
> taskstats interface?
we have how long we wait; we do not have how long we iowait afaik...
at least not at nanosecond granularity. (We do have the sampled data, but that
is millisecond-sampled data, not very useful for making time-based charts
that show the sequence of events)
On Fri, 24 Jul 2009 21:33:02 -0700 Arjan van de Ven <[email protected]> wrote:
> Andrew Morton wrote:
> > On Mon, 20 Jul 2009 11:31:47 -0700 Arjan van de Ven <[email protected]> wrote:
> >
> >> For counting how long an application has been waiting for (disk) IO,
> >> there currently is only the HZ sample driven information available, while
> >> for all other counters in this class, a high resolution version is
> >> available via CONFIG_SCHEDSTATS.
> >>
> >> In order to make an improved bootchart tool possible, we also need
> >> a higher resolution version of the iowait time.
> >>
> >> This patch below adds this scheduler statistic to the kernel.
> >
> > Doesn't this duplicate the delay accounting already available via the
> > taskstats interface?
>
> we have how long we wait. we do not have how long we iowait afaik...
> at least not in nanosecond granularity. (We do have the sampled data, but that
> is milisecond sampled data, not very useful for making charts based on time
> to show the sequence of events)
See include/linux/sched.h's definition of task_delay_info - u64
blkio_delay is in nanoseconds. It uses
do_posix_clock_monotonic_gettime() internally.
Andrew Morton wrote:
> On Fri, 24 Jul 2009 21:33:02 -0700 Arjan van de Ven <[email protected]> wrote:
>
>> Andrew Morton wrote:
>>> On Mon, 20 Jul 2009 11:31:47 -0700 Arjan van de Ven <[email protected]> wrote:
>>>
>>>> For counting how long an application has been waiting for (disk) IO,
>>>> there currently is only the HZ sample driven information available, while
>>>> for all other counters in this class, a high resolution version is
>>>> available via CONFIG_SCHEDSTATS.
>>>>
>>>> In order to make an improved bootchart tool possible, we also need
>>>> a higher resolution version of the iowait time.
>>>>
>>>> This patch below adds this scheduler statistic to the kernel.
>>> Doesn't this duplicate the delay accounting already available via the
>>> taskstats interface?
>> we have how long we wait. we do not have how long we iowait afaik...
>> at least not in nanosecond granularity. (We do have the sampled data, but that
>> is milisecond sampled data, not very useful for making charts based on time
>> to show the sequence of events)
>
> See include/linux/sched.h's definition of task_delay_info - u64
> blkio_delay is in nanoseconds. It uses
> do_posix_clock_monotonic_gettime() internally.
looks like it does.. too bad we don't expose that data in a /proc/<pid>/delay
field or something, like we do with the scheduler info...
On Fri, 24 Jul 2009 21:48:22 -0700 Arjan van de Ven <[email protected]> wrote:
> Andrew Morton wrote:
> > On Fri, 24 Jul 2009 21:33:02 -0700 Arjan van de Ven <[email protected]> wrote:
> >
> >> Andrew Morton wrote:
> >>> On Mon, 20 Jul 2009 11:31:47 -0700 Arjan van de Ven <[email protected]> wrote:
> >>>
> >>>> For counting how long an application has been waiting for (disk) IO,
> >>>> there currently is only the HZ sample driven information available, while
> >>>> for all other counters in this class, a high resolution version is
> >>>> available via CONFIG_SCHEDSTATS.
> >>>>
> >>>> In order to make an improved bootchart tool possible, we also need
> >>>> a higher resolution version of the iowait time.
> >>>>
> >>>> This patch below adds this scheduler statistic to the kernel.
> >>> Doesn't this duplicate the delay accounting already available via the
> >>> taskstats interface?
> >> we have how long we wait. we do not have how long we iowait afaik...
> >> at least not in nanosecond granularity. (We do have the sampled data, but that
> >> is milisecond sampled data, not very useful for making charts based on time
> >> to show the sequence of events)
> >
> > See include/linux/sched.h's definition of task_delay_info - u64
> > blkio_delay is in nanoseconds. It uses
> > do_posix_clock_monotonic_gettime() internally.
>
> looks like it does.. to bad we don't expose that data in a /proc/<pid>/delay or something field
> like we do with the scheduler info...
>
I thought we did deliver a few of the taskstats counters via procfs,
but maybe I dreamed it. It would have been a rather bad thing to do.
taskstats has a large advantage over /proc-based things: it delivers a
packet to the monitoring process(es) when the monitored task exits. So
with no polling at all it is possible to gather all that information
about the just-completed task. This isn't possible with /proc.
There's a patch on the list now to teach taskstats to emit a packet at
fork- and exit-time too.
The monitored task can also be polled at any time during its execution,
like /proc files.
Please consider switching whatever-you're-working-on over to use
taskstats rather than adding (duplicative) things to /proc (which
require CONFIG_SCHED_DEBUG, btw).
If there's stuff missing from taskstats then we can add it - it's
versioned and upgradeable and is a better interface. It's better
to make taskstats stronger than it is to add /proc/pid fields, methinks.
On Fri, 2009-07-24 at 22:04 -0700, Andrew Morton wrote:
>
> > > See include/linux/sched.h's definition of task_delay_info - u64
> > > blkio_delay is in nanoseconds. It uses
> > > do_posix_clock_monotonic_gettime() internally.
> >
> > looks like it does.. to bad we don't expose that data in
> a /proc/<pid>/delay or something field
> > like we do with the scheduler info...
> >
>
> I thought we did deliver a few of the taskstats counters via procfs,
> but maybe I dreamed it. It would have been a rather bad thing to do.
>
> taskstats has a large advantage over /proc-based things: it delivers a
> packet to the monitoring process(es) when the monitored task exits.
> So
> with no polling at all it is possible to gather all that information
> about the just-completed task. This isn't possible with /proc.
>
> There's a patch on the list now to teach taskstats to emit a packet at
> fork- and exit-time too.
>
> The monitored task can be polled at any time during its execution
> also,
> like /proc files.
>
> Please consider switching whatever-you're-working-on over to use
> taskstats rather than adding (duplicative) things to /proc (which
> require CONFIG_SCHED_DEBUG, btw).
>
> If there's stuff missing from taskstats then we can add it - it's
> versioned and upgradeable and is a better interface. It's better
> to make taskstats stronger than it is to add /proc/pid fields,
> methinks.
The below exposes the information to ftrace and perf counters; it uses
the scheduler accounting (which is often much cheaper than
do_posix_clock_monotonic_gettime, and more 'accurate' in the sense that
it's what the scheduler itself uses).
This allows profiling tasks based on iowait time, for example, something
not possible with taskstats afaik.
Maybe there's a use for taskstats still, maybe not.
---
Subject: sched: wait, sleep and iowait accounting tracepoints
From: Peter Zijlstra <[email protected]>
Date: Thu Jul 23 20:13:26 CEST 2009
Add 3 schedstat tracepoints to help account for wait-time, sleep-time
and iowait-time.
They can also be used as a perf-counter source to profile tasks on
these clocks.
Cc: Steven Rostedt <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Arjan van de Ven <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <new-submission>
---
include/trace/events/sched.h | 95 +++++++++++++++++++++++++++++++++++++++++++
kernel/sched_fair.c | 10 ++++
2 files changed, 104 insertions(+), 1 deletion(-)
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -546,6 +546,11 @@ update_stats_wait_end(struct cfs_rq *cfs
schedstat_set(se->wait_sum, se->wait_sum +
rq_of(cfs_rq)->clock - se->wait_start);
+
+ if (entity_is_task(se)) {
+ trace_sched_stat_wait(task_of(se),
+ rq_of(cfs_rq)->clock - se->wait_start);
+ }
schedstat_set(se->wait_start, 0);
}
static inline void
@@ -636,8 +641,10 @@ static void enqueue_sleeper(struct cfs_r
se->sleep_start = 0;
se->sum_sleep_runtime += delta;
- if (tsk)
+ if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
+ trace_sched_stat_sleep(tsk, delta);
+ }
}
if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -655,6 +662,7 @@ static void enqueue_sleeper(struct cfs_r
if (tsk->in_iowait) {
se->iowait_sum += delta;
se->iowait_count++;
+ trace_sched_stat_iowait(tsk, delta);
}
/*
Index: linux-2.6/include/trace/events/sched.h
===================================================================
--- linux-2.6.orig/include/trace/events/sched.h
+++ linux-2.6/include/trace/events/sched.h
@@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send,
__entry->sig, __entry->comm, __entry->pid)
);
+/*
+ * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
+ * adding sched_stat support to SCHED_FIFO/RR would be welcome.
+ */
+
+/*
+ * Tracepoint for accounting wait time (time the task is runnable
+ * but not actually running due to scheduler contention).
+ */
+TRACE_EVENT(sched_stat_wait,
+
+ TP_PROTO(struct task_struct *tsk, u64 delay),
+
+ TP_ARGS(tsk, delay),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, delay )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->delay = delay;
+ )
+ TP_perf_assign(
+ __perf_count(delay);
+ ),
+
+ TP_printk("task: %s:%d wait: %Lu [ns]",
+ __entry->comm, __entry->pid,
+ (unsigned long long)__entry->delay)
+);
+
+/*
+ * Tracepoint for accounting sleep time (time the task is not runnable,
+ * including iowait, see below).
+ */
+TRACE_EVENT(sched_stat_sleep,
+
+ TP_PROTO(struct task_struct *tsk, u64 delay),
+
+ TP_ARGS(tsk, delay),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, delay )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->delay = delay;
+ )
+ TP_perf_assign(
+ __perf_count(delay);
+ ),
+
+ TP_printk("task: %s:%d sleep: %Lu [ns]",
+ __entry->comm, __entry->pid,
+ (unsigned long long)__entry->delay)
+);
+
+/*
+ * Tracepoint for accounting iowait time (time the task is not runnable
+ * due to waiting on IO to complete).
+ */
+TRACE_EVENT(sched_stat_iowait,
+
+ TP_PROTO(struct task_struct *tsk, u64 delay),
+
+ TP_ARGS(tsk, delay),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, delay )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->delay = delay;
+ )
+ TP_perf_assign(
+ __perf_count(delay);
+ ),
+
+ TP_printk("task: %s:%d iowait: %Lu [ns]",
+ __entry->comm, __entry->pid,
+ (unsigned long long)__entry->delay)
+);
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
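As a userspace consumption sketch (not part of the patch): once these events exist, they can be enabled and streamed through the ftrace debugfs files, assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>
#include <string.h>

#define TRACING "/sys/kernel/debug/tracing"

int main(void)
{
        FILE *f;
        char line[512];

        /* turn on the sched_stat_iowait event added above */
        f = fopen(TRACING "/events/sched/sched_stat_iowait/enable", "w");
        if (!f) {
                perror("enable");
                return 1;
        }
        fputs("1\n", f);
        fclose(f);

        /* stream formatted events as trace_sched_stat_iowait() fires */
        f = fopen(TRACING "/trace_pipe", "r");
        if (!f) {
                perror("trace_pipe");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "sched_stat_iowait"))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}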
On Sat, 25 Jul 2009 08:05:46 +0200 Peter Zijlstra <[email protected]> wrote:
> On Fri, 2009-07-24 at 22:04 -0700, Andrew Morton wrote:
> >
> > > > See include/linux/sched.h's definition of task_delay_info - u64
> > > > blkio_delay is in nanoseconds. It uses
> > > > do_posix_clock_monotonic_gettime() internally.
> > >
> > > looks like it does.. to bad we don't expose that data in
> > a /proc/<pid>/delay or something field
> > > like we do with the scheduler info...
> > >
> >
> > I thought we did deliver a few of the taskstats counters via procfs,
> > but maybe I dreamed it. It would have been a rather bad thing to do.
> >
> > taskstats has a large advantage over /proc-based things: it delivers a
> > packet to the monitoring process(es) when the monitored task exits.
> > So
> > with no polling at all it is possible to gather all that information
> > about the just-completed task. This isn't possible with /proc.
> >
> > There's a patch on the list now to teach taskstats to emit a packet at
> > fork- and exit-time too.
> >
> > The monitored task can be polled at any time during its execution
> > also,
> > like /proc files.
> >
> > Please consider switching whatever-you're-working-on over to use
> > taskstats rather than adding (duplicative) things to /proc (which
> > require CONFIG_SCHED_DEBUG, btw).
> >
> > If there's stuff missing from taskstats then we can add it - it's
> > versioned and upgradeable and is a better interface. It's better
> > to make taskstats stronger than it is to add /proc/pid fields,
> > methinks.
>
> The below exposes the information to ftrace and perf counters, it uses
> the scheduler accounting (which is often much cheaper than
> do_posix_clock_monotonic_gettime, and more 'accurate' in the sense that
> its what the scheduler itself uses).
Well. The do_posix_clock_monotonic_gettime() call is already there,
and this change adds more code on top of Arjan's code, which wouldn't
be needed if he can use taskstats.
> This allows profiling tasks based on iowait time, for example, something
> not possible with taskstats afaik.
>
> Maybe there's a use for taskstats still, maybe not.
>
> ---
> Subject: sched: wait, sleep and iowait accounting tracepoints
> From: Peter Zijlstra <[email protected]>
> Date: Thu Jul 23 20:13:26 CEST 2009
>
> Add 3 schedstat tracepoints to help account for wait-time, sleep-time
> and iowait-time.
>
> They can also be used as a perf-counter source to profile tasks on
> these clocks.
This may be a useful feature, dunno. But it seems to be unrelated to
Arjan's requirement, apart from building on top of it.
What _is_ Arjan's requirement, anyway? I don't think it's really been
spelled out.
Andrew Morton wrote:
>
> What _is_ Arjan's requirement, anyway? I don't think it's really been
> spelled out.
ok long description coming ;)
Actually we're building a tool that conceptually is two different tools operating at different levels,
where tool two is a zoomed-in version of tool one. I'll describe both below as if they are independent tools,
to not confuse things too much.
Tool One
--------
This can be summarized as "a better bootchart", although it's not (just) for measuring boot.
What the tool provides is a graphical overview of all process activity over a period of time
(the period of time needs to be long enough to cover a system boot, so around 10 seconds).
In this overview, time is the horizontal axis, and processes are on the vertical axis.
This overview is basically a horizontal bar per "process" that shows the name of the process. The
bar starts (X axis) when the process starts, and ends when the process ends. Bars of processes that
either start before the measurement period, or continue to live on after the measurement period
just run to the edge of the graph.
Within these bars, we make "boxes" that cover 1 millisecond (typical) of time, and these boxes get colored
depending on what kind of activity has happened.
* Shade of blue for CPU activity; basically we take the process's accumulated execution time in nanoseconds at the beginning
and at the end of the box, scale the delta to the size of the box (1 msec) to get a ratio/percentage, and then make the
shade of blue represent this percentage (e.g. if the process was running for the full msec, it gets very dark blue,
but if it was only running for 50%, it gets a gray-ish blue); see the sketch after this list
* Shade of red for "waiting for IO"; we take the IO wait nanoseconds at the beginning and end, scale this like we do
for CPU, and color the box red appropriately
* Shade of yellow for "waiting for the scheduler to give us a time slice"; the kernel exposes the (accumulated) time
between wakeup and actually running, basically the scheduler delay (due to other processes running etc). Similar to
CPU and IO time, we scale this to a yellow color
(if more than one of these applies during the 1 msec box, we have an arbitration algorithm to resolve this)
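A sketch of the per-box color computation described in the list above (all names here are ours, purely illustrative): the accumulated counter is sampled at the start and end of each 1 msec box, and the delta is scaled to a shade.

#include <stdint.h>
#include <stdio.h>

#define BOX_NS 1000000ULL       /* one 1 msec box, in nanoseconds */

/* 0 = idle/gray, 255 = fully saturated blue/red/yellow */
static uint8_t box_shade(uint64_t counter_start_ns, uint64_t counter_end_ns)
{
        uint64_t delta = counter_end_ns - counter_start_ns;

        if (delta > BOX_NS)     /* clamp: the counter may span boxes */
                delta = BOX_NS;
        return (uint8_t)(delta * 255 / BOX_NS);
}

int main(void)
{
        /* a process that ran 0.5 ms out of the 1 ms box -> mid blue */
        printf("shade %u\n", box_shade(4000000ULL, 4500000ULL));
        return 0;
}

The same box_shade() would be fed the CPU time counters for blue, the IO wait counters for red, and the scheduler-delay counters for yellow.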
In addition, we show a system view (at the top of the diagram for now) that shows
* For each logical CPU, what the utilization is. Again in 1 msec bars, we calculate how busy each logical CPU is
and have a bar whose height scales with utilization
* We show the amount of IO, in megabytes per second in another bar, so that one can see how well the system is doing
on IO throughput (we're considering splitting this into a separate display for read and write)
For the process bars to work, we need to track the names of processes as they change (fork+exec first creates a process that shares
its name with the parent process, and then the exec changes the name to that of the new program, for example). The visualization
switches to a new bar when such a process changes name.
This tool is very useful (for us already, but I expect it's not just us) to see what is going on performance-wise at a system level.
We can see that process A uses CPU, but causes the dbus process to use CPU, which then causes process B to use CPU, etc; we can
track performance bottlenecks/costs beyond just a single process.
Right now we're just polling various /proc/<pid> files every millisecond to collect this information. We will investigate
the task accounting flow to see if we can kill the polling entirely; if we can't, we'll likely need
to stick with polling, since otherwise consolidating the two sources of information gets problematic.
The current "bootchart" tool provides this mostly, but it is using the millisecond sampled data, which is so inaccurate
that major things are missed in terms of system activity, and in addition it's so heavy in how it operates that you can't
sample fine grained enough without impacting the system too much.
Tool Two
--------
This is sort of a zoomed-in version of Tool One. Tool Two does not show utilization percentages, but a nanosecond-accurate time flow
of what happens. It has the same per-process bars as Tool One, but instead of boxes that represent a
millisecond, we start a blue box every time a process gets scheduled, lasting until the process gets unscheduled. Same for the
red/yellow boxes.
Ideally we also get "process A wakes up process B" information, which we can represent as little arrows in the diagram.
Tool Two also gives CPU utilization in a time format rather than as a utilization percentage.
We're using "perf" for collecting this information; for todays mainline we can collect the blue bar (When scheduled in/out) already,
and Peter has been adding the data for iowait etc in an experimental branch.
In case you wonder why you need both tools, and can't just use one: we wondered the same, but it turns out that if you only build
Tool Two, you don't get a good overview of what is going on at a higher level. It's like looking at the world through a microscope all the time;
you just don't get a good feeling for what the world is like, and you can't find which areas are interesting to look at (with the microscope).
So we provide a higher-level view tool (Tool One), with the capability to zoom into what happens in minute (well.. nanosecond) detail for
areas of interest (Tool Two).
[again, while I describe them here as separate tools, that's just for the description; the idea is that the user will see it as just one
unified tool with different zoom levels]
On Sat, 2009-07-25 at 09:42 -0700, Arjan van de Ven wrote:
>
> In case you wonder why you need both tools, and can't just use one: we
> wondered the same, but it turns out that if you only build
> Tool Two, you don't get a good overview of what is going on on a
> higher level. It's like looking at the world via a microscope all the
> time,
I'd be thinking you could compose your ms-based picture from the sample
data. For instance, if you sample on cpu-clock at 100 KHz, then 100
samples get you a full blue slice, 50 get you a 50% blue slice, etc.
Of course, since it's sample-based, you can miss some detail, but I think
you're looking at the large picture anyway.
Would this work?
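To spell out the arithmetic of that suggestion (a toy sketch; the 100 KHz sampling rate is the only input): each 1 msec slice can contain at most 100 cpu-clock samples, so the sample count maps directly to a utilization shade.

#include <stdio.h>

static unsigned int slice_utilization_pct(unsigned int samples_in_slice)
{
        const unsigned int max_per_slice = 100; /* 100 KHz * 1 msec */

        if (samples_in_slice > max_per_slice)
                samples_in_slice = max_per_slice;
        return samples_in_slice * 100 / max_per_slice;
}

int main(void)
{
        /* 100 samples -> 100% (full blue), 50 samples -> 50% */
        printf("%u%% %u%%\n", slice_utilization_pct(100),
               slice_utilization_pct(50));
        return 0;
}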
Peter Zijlstra wrote:
> On Sat, 2009-07-25 at 09:42 -0700, Arjan van de Ven wrote:
>> In case you wonder why you need both tools, and can't just use one: we
>> wondered the same, but it turns out that if you only build
>> Tool Two, you don't get a good overview of what is going on on a
>> higher level. It's like looking at the world via a microscope all the
>> time,
>
> I'd be thinking you could compose your ms based picture on the sample
> data. For instance, if you sample on cpu-clock at 100kHz, then a 100
> samples get you a full blue slice, 50 get you a 50% blue slice, etc.
we can compose part of it from the details (though there are a lot of details),
but not nearly all of it; things like the IO bandwidth we just can't
reconstruct this way.
We could do this for CPU use to a large degree, though, I suppose.
Peter Zijlstra wrote:
> On Sat, 2009-07-25 at 09:42 -0700, Arjan van de Ven wrote:
>> In case you wonder why you need both tools, and can't just use one: we
>> wondered the same, but it turns out that if you only build
>> Tool Two, you don't get a good overview of what is going on on a
>> higher level. It's like looking at the world via a microscope all the
>> time,
>
> I'd be thinking you could compose your ms based picture on the sample
> data. For instance, if you sample on cpu-clock at 100kHz, then a 100
> samples get you a full blue slice, 50 get you a 50% blue slice, etc.
>
> Of course, since its sample based, you can miss some detail, but I think
> you're looking at the large picture anyway.
>
> Would this work?
>
OK, thinking some more: in principle we can make a hybrid that only uses the
perf data, as long as the perf data includes run/iowait/woken events
(and process exit and name change; ideally also exec).
We'll poll /proc only for the IO throughput data, and collect that on a 1 msec
scale.
Commit-ID: 8f0dfc34e9b323a028c2ec41abb7e9de477b7a94
Gitweb: http://git.kernel.org/tip/8f0dfc34e9b323a028c2ec41abb7e9de477b7a94
Author: Arjan van de Ven <[email protected]>
AuthorDate: Mon, 20 Jul 2009 11:26:58 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 2 Sep 2009 08:44:08 +0200
sched: Provide iowait counters
For counting how long an application has been waiting for
(disk) IO, there currently is only the HZ sample driven
information available, while for all other counters in this
class, a high resolution version is available via
CONFIG_SCHEDSTATS.
In order to make an improved bootchart tool possible, we also
need a higher resolution version of the iowait time.
This patch below adds this scheduler statistic to the kernel.
Signed-off-by: Arjan van de Ven <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/sched.h | 4 ++++
kernel/sched.c | 4 ++++
kernel/sched_debug.c | 4 ++++
kernel/sched_fair.c | 5 +++++
4 files changed, 17 insertions(+), 0 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e209ae0..9c96ef2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1111,6 +1111,8 @@ struct sched_entity {
u64 wait_max;
u64 wait_count;
u64 wait_sum;
+ u64 iowait_count;
+ u64 iowait_sum;
u64 sleep_start;
u64 sleep_max;
@@ -1231,6 +1233,8 @@ struct task_struct {
unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
+ unsigned in_iowait:1;
+
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6244d24..38d05a8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6754,7 +6754,9 @@ void __sched io_schedule(void)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ current->in_iowait = 1;
schedule();
+ current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
}
@@ -6767,7 +6769,9 @@ long __sched io_schedule_timeout(long timeout)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ current->in_iowait = 1;
ret = schedule_timeout(timeout);
+ current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
return ret;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b..5ddbd08 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.wait_max);
PN(se.wait_sum);
P(se.wait_count);
+ PN(se.iowait_sum);
+ P(se.iowait_count);
P(sched_info.bkl_count);
P(se.nr_migrations);
P(se.nr_migrations_cold);
@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
p->se.wait_max = 0;
p->se.wait_sum = 0;
p->se.wait_count = 0;
+ p->se.iowait_sum = 0;
+ p->se.iowait_count = 0;
p->se.sleep_max = 0;
p->se.sum_sleep_runtime = 0;
p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 342000b..471fa28 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -652,6 +652,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sum_sleep_runtime += delta;
if (tsk) {
+ if (tsk->in_iowait) {
+ se->iowait_sum += delta;
+ se->iowait_count++;
+ }
+
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the