2007-08-16 07:09:39

by Michael Neuling

[permalink] [raw]
Subject: [PATCH 1/2] Add scaled time to taskstats based process accounting

This adds two items to the taststats struct to account for user and
system time based on scaling the CPU frequency and instruction issue
rates.

Adds account_(user|system)_time_scaled callbacks which architectures
can use to account for time using this mechanism.

Signed-off-by: Michael Neuling <[email protected]>

---

include/linux/kernel_stat.h | 2 ++
include/linux/sched.h | 2 +-
include/linux/taskstats.h | 6 +++++-
kernel/fork.c | 2 ++
kernel/sched.c | 21 +++++++++++++++++++++
kernel/timer.c | 7 +++++--
kernel/tsacct.c | 4 ++++
7 files changed, 40 insertions(+), 4 deletions(-)

Index: linux-2.6-ozlabs/include/linux/kernel_stat.h
===================================================================
--- linux-2.6-ozlabs.orig/include/linux/kernel_stat.h
+++ linux-2.6-ozlabs/include/linux/kernel_stat.h
@@ -52,7 +52,9 @@ static inline int kstat_irqs(int irq)
}

extern void account_user_time(struct task_struct *, cputime_t);
+extern void account_user_time_scaled(struct task_struct *, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t);
+extern void account_system_time_scaled(struct task_struct *, cputime_t);
extern void account_steal_time(struct task_struct *, cputime_t);

#endif /* _LINUX_KERNEL_STAT_H */
Index: linux-2.6-ozlabs/include/linux/sched.h
===================================================================
--- linux-2.6-ozlabs.orig/include/linux/sched.h
+++ linux-2.6-ozlabs/include/linux/sched.h
@@ -1020,7 +1020,7 @@ struct task_struct {
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

unsigned int rt_priority;
- cputime_t utime, stime;
+ cputime_t utime, stime, utimescaled, stimescaled;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
Index: linux-2.6-ozlabs/include/linux/taskstats.h
===================================================================
--- linux-2.6-ozlabs.orig/include/linux/taskstats.h
+++ linux-2.6-ozlabs/include/linux/taskstats.h
@@ -31,7 +31,7 @@
*/


-#define TASKSTATS_VERSION 5
+#define TASKSTATS_VERSION 6
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */

@@ -142,6 +142,10 @@ struct taskstats {
__u64 write_char; /* bytes written */
__u64 read_syscalls; /* read syscalls */
__u64 write_syscalls; /* write syscalls */
+
+ /* time accounting for SMT machines */
+ __u64 ac_utimescaled; /* utime scaled on frequency etc */
+ __u64 ac_stimescaled; /* stime scaled on frequency etc */
/* Extended accounting fields end */

#define TASKSTATS_HAS_IO_ACCOUNTING
Index: linux-2.6-ozlabs/kernel/fork.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/fork.c
+++ linux-2.6-ozlabs/kernel/fork.c
@@ -1045,6 +1045,8 @@ static struct task_struct *copy_process(

p->utime = cputime_zero;
p->stime = cputime_zero;
+ p->utimescaled = cputime_zero;
+ p->stimescaled = cputime_zero;

#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
Index: linux-2.6-ozlabs/kernel/sched.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/sched.c
+++ linux-2.6-ozlabs/kernel/sched.c
@@ -3249,6 +3249,16 @@ void account_user_time(struct task_struc
}

/*
+ * Account scaled user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ */
+void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
+{
+ p->utimescaled = cputime_add(p->utimescaled, cputime);
+}
+
+/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3280,6 +3290,17 @@ void account_system_time(struct task_str
}

/*
+ * Account scaled system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ */
+void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+{
+ p->stimescaled = cputime_add(p->stimescaled, cputime);
+}
+
+/*
* Account for involuntary wait time.
* @p: the process from which the cpu time has been stolen
* @steal: the cpu time spent in involuntary wait
Index: linux-2.6-ozlabs/kernel/timer.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/timer.c
+++ linux-2.6-ozlabs/kernel/timer.c
@@ -826,10 +826,13 @@ void update_process_times(int user_tick)
int cpu = smp_processor_id();

/* Note: this timer irq context must be accounted for as well. */
- if (user_tick)
+ if (user_tick) {
account_user_time(p, jiffies_to_cputime(1));
- else
+ account_user_time_scaled(p, jiffies_to_cputime(1));
+ } else {
account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+ account_system_time_scaled(p, jiffies_to_cputime(1));
+ }
run_local_timers();
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_tick);
Index: linux-2.6-ozlabs/kernel/tsacct.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/tsacct.c
+++ linux-2.6-ozlabs/kernel/tsacct.c
@@ -62,6 +62,10 @@ void bacct_add_tsk(struct taskstats *sta
rcu_read_unlock();
stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+ stats->ac_utimescaled =
+ cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
+ stats->ac_stimescaled =
+ cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;


2007-08-16 07:27:20

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

Hi, Michael,

Thanks for doing this, this is really useful.

Michael Neuling wrote:
> This adds two items to the taststats struct to account for user and
> system time based on scaling the CPU frequency and instruction issue
> rates.
>
> Adds account_(user|system)_time_scaled callbacks which architectures
> can use to account for time using this mechanism.
>
> Signed-off-by: Michael Neuling <[email protected]>
>
> ---
>
> include/linux/kernel_stat.h | 2 ++
> include/linux/sched.h | 2 +-
> include/linux/taskstats.h | 6 +++++-
> kernel/fork.c | 2 ++
> kernel/sched.c | 21 +++++++++++++++++++++
> kernel/timer.c | 7 +++++--
> kernel/tsacct.c | 4 ++++
> 7 files changed, 40 insertions(+), 4 deletions(-)
>
> Index: linux-2.6-ozlabs/include/linux/kernel_stat.h
> ===================================================================
> --- linux-2.6-ozlabs.orig/include/linux/kernel_stat.h
> +++ linux-2.6-ozlabs/include/linux/kernel_stat.h
> @@ -52,7 +52,9 @@ static inline int kstat_irqs(int irq)
> }
>
> extern void account_user_time(struct task_struct *, cputime_t);
> +extern void account_user_time_scaled(struct task_struct *, cputime_t);
> extern void account_system_time(struct task_struct *, int, cputime_t);
> +extern void account_system_time_scaled(struct task_struct *, cputime_t);
> extern void account_steal_time(struct task_struct *, cputime_t);
>
> #endif /* _LINUX_KERNEL_STAT_H */
> Index: linux-2.6-ozlabs/include/linux/sched.h
> ===================================================================
> --- linux-2.6-ozlabs.orig/include/linux/sched.h
> +++ linux-2.6-ozlabs/include/linux/sched.h
> @@ -1020,7 +1020,7 @@ struct task_struct {
> int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
>
> unsigned int rt_priority;
> - cputime_t utime, stime;
> + cputime_t utime, stime, utimescaled, stimescaled;
> unsigned long nvcsw, nivcsw; /* context switch counts */
> struct timespec start_time; /* monotonic time */
> struct timespec real_start_time; /* boot based time */
> Index: linux-2.6-ozlabs/include/linux/taskstats.h
> ===================================================================
> --- linux-2.6-ozlabs.orig/include/linux/taskstats.h
> +++ linux-2.6-ozlabs/include/linux/taskstats.h
> @@ -31,7 +31,7 @@
> */
>
>
> -#define TASKSTATS_VERSION 5
> +#define TASKSTATS_VERSION 6
> #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
> * in linux/sched.h */
>
> @@ -142,6 +142,10 @@ struct taskstats {
> __u64 write_char; /* bytes written */
> __u64 read_syscalls; /* read syscalls */
> __u64 write_syscalls; /* write syscalls */
> +
> + /* time accounting for SMT machines */
> + __u64 ac_utimescaled; /* utime scaled on frequency etc */
> + __u64 ac_stimescaled; /* stime scaled on frequency etc */
> /* Extended accounting fields end */
>

I'd also request for you to add a cpu_scaled_run_real_total for use
by delay accounting. cpu_scaled_run_real_total should be similar in
functionality to cpu_run_real_total.

> #define TASKSTATS_HAS_IO_ACCOUNTING
> Index: linux-2.6-ozlabs/kernel/fork.c
> ===================================================================
> --- linux-2.6-ozlabs.orig/kernel/fork.c
> +++ linux-2.6-ozlabs/kernel/fork.c
> @@ -1045,6 +1045,8 @@ static struct task_struct *copy_process(
>
> p->utime = cputime_zero;
> p->stime = cputime_zero;
> + p->utimescaled = cputime_zero;
> + p->stimescaled = cputime_zero;
>
> #ifdef CONFIG_TASK_XACCT
> p->rchar = 0; /* I/O counter: bytes read */
> Index: linux-2.6-ozlabs/kernel/sched.c
> ===================================================================
> --- linux-2.6-ozlabs.orig/kernel/sched.c
> +++ linux-2.6-ozlabs/kernel/sched.c
> @@ -3249,6 +3249,16 @@ void account_user_time(struct task_struc
> }
>
> /*
> + * Account scaled user cpu time to a process.
> + * @p: the process that the cpu time gets accounted to
> + * @cputime: the cpu time spent in user space since the last update
> + */
> +void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
> +{
> + p->utimescaled = cputime_add(p->utimescaled, cputime);
> +}
> +
> +/*
> * Account system cpu time to a process.
> * @p: the process that the cpu time gets accounted to
> * @hardirq_offset: the offset to subtract from hardirq_count()
> @@ -3280,6 +3290,17 @@ void account_system_time(struct task_str
> }
>
> /*
> + * Account scaled system cpu time to a process.
> + * @p: the process that the cpu time gets accounted to
> + * @hardirq_offset: the offset to subtract from hardirq_count()
> + * @cputime: the cpu time spent in kernel space since the last update
> + */
> +void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
> +{
> + p->stimescaled = cputime_add(p->stimescaled, cputime);
> +}
> +
> +/*
> * Account for involuntary wait time.
> * @p: the process from which the cpu time has been stolen
> * @steal: the cpu time spent in involuntary wait
> Index: linux-2.6-ozlabs/kernel/timer.c
> ===================================================================
> --- linux-2.6-ozlabs.orig/kernel/timer.c
> +++ linux-2.6-ozlabs/kernel/timer.c
> @@ -826,10 +826,13 @@ void update_process_times(int user_tick)
> int cpu = smp_processor_id();
>
> /* Note: this timer irq context must be accounted for as well. */
> - if (user_tick)
> + if (user_tick) {
> account_user_time(p, jiffies_to_cputime(1));
> - else
> + account_user_time_scaled(p, jiffies_to_cputime(1));
> + } else {
> account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
> + account_system_time_scaled(p, jiffies_to_cputime(1));
> + }

I am a little confused here, scaled accounting and regular accounting
go hand in hand?


> run_local_timers();
> if (rcu_pending(cpu))
> rcu_check_callbacks(cpu, user_tick);
> Index: linux-2.6-ozlabs/kernel/tsacct.c
> ===================================================================
> --- linux-2.6-ozlabs.orig/kernel/tsacct.c
> +++ linux-2.6-ozlabs/kernel/tsacct.c
> @@ -62,6 +62,10 @@ void bacct_add_tsk(struct taskstats *sta
> rcu_read_unlock();
> stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
> stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
> + stats->ac_utimescaled =
> + cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
> + stats->ac_stimescaled =
> + cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
> stats->ac_minflt = tsk->min_flt;
> stats->ac_majflt = tsk->maj_flt;
>


--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL

2007-08-16 16:39:09

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

On Thu, Aug 16, 2007 at 05:09:22PM +1000, Michael Neuling wrote:
> This adds two items to the taststats struct to account for user and
> system time based on scaling the CPU frequency and instruction issue
> rates.
>
> Adds account_(user|system)_time_scaled callbacks which architectures
> can use to account for time using this mechanism.

There's something simple here that I just don't understand.

> /*
> + * Account scaled user cpu time to a process.
> + * @p: the process that the cpu time gets accounted to
> + * @cputime: the cpu time spent in user space since the last update
> + */
> +void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
> +{
> + p->utimescaled = cputime_add(p->utimescaled, cputime);
> +}

My gut impression (maybe wrong?) is that the scaled time is,
in a certain sense, "more accurate" than the unscaled time.
In fact, the unscaled time gives me the impression of being
rather meaningless, as it has no particular significance
with respect to the wall-clock, and it also doesn't give
any accurate hint of how much cpu resource was actually
consumed.

If one has a cpu with frequency scaling, then when would
one ever be interested in the non-scaled time? If the answer
is "never", then why not just always use the scaled time,
instead of adding more stuff to the kernel structs?

--linas

2007-08-16 22:23:00

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

Linas Vepstas writes:

> My gut impression (maybe wrong?) is that the scaled time is,
> in a certain sense, "more accurate" than the unscaled time.

The "unscaled" time is just time, as in "how many seconds did this
task spend on the CPU". It's what all the tools (except a certain
proprietary workload manager) expect. Top, ps, etc. get unhappy if
the times reported (user, system, hardirq, softirq, idle, stolen)
don't add up to elapsed wall-clock time.

The "scaled" time is really CPU cycles divided by some arbitrary
factor (the notional CPU frequency). So yes it does give some
indication of how much progress the task should have made, in some
sense.

Both measures are useful. Because the current user API is in terms of
real time rather than cycles, we have to continue reporting real time,
not scaled time, which is why the existing interfaces report unscaled
time, and the scaled time values are reported through a new extension
to the taskstats interface.

Paul.

2007-08-17 00:23:42

by Michael Neuling

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting



In message <[email protected]> you wrote:
> Hi, Michael,
>
> Thanks for doing this, this is really useful.
>
> Michael Neuling wrote:
> > This adds two items to the taststats struct to account for user and
> > system time based on scaling the CPU frequency and instruction issue
> > rates.
> >
> > Adds account_(user|system)_time_scaled callbacks which architectures
> > can use to account for time using this mechanism.
> >
> > Signed-off-by: Michael Neuling <[email protected]>
> >
> > ---
> >
> > include/linux/kernel_stat.h | 2 ++
> > include/linux/sched.h | 2 +-
> > include/linux/taskstats.h | 6 +++++-
> > kernel/fork.c | 2 ++
> > kernel/sched.c | 21 +++++++++++++++++++++
> > kernel/timer.c | 7 +++++--
> > kernel/tsacct.c | 4 ++++
> > 7 files changed, 40 insertions(+), 4 deletions(-)
> >
> > Index: linux-2.6-ozlabs/include/linux/kernel_stat.h
> > ===================================================================
> > --- linux-2.6-ozlabs.orig/include/linux/kernel_stat.h
> > +++ linux-2.6-ozlabs/include/linux/kernel_stat.h
> > @@ -52,7 +52,9 @@ static inline int kstat_irqs(int irq)
> > }
> >
> > extern void account_user_time(struct task_struct *, cputime_t);
> > +extern void account_user_time_scaled(struct task_struct *, cputime_t);
> > extern void account_system_time(struct task_struct *, int, cputime_t);
> > +extern void account_system_time_scaled(struct task_struct *, cputime_t);
> > extern void account_steal_time(struct task_struct *, cputime_t);
> >
> > #endif /* _LINUX_KERNEL_STAT_H */
> > Index: linux-2.6-ozlabs/include/linux/sched.h
> > ===================================================================
> > --- linux-2.6-ozlabs.orig/include/linux/sched.h
> > +++ linux-2.6-ozlabs/include/linux/sched.h
> > @@ -1020,7 +1020,7 @@ struct task_struct {
> > int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
> >
> > unsigned int rt_priority;
> > - cputime_t utime, stime;
> > + cputime_t utime, stime, utimescaled, stimescaled;
> > unsigned long nvcsw, nivcsw; /* context switch counts */
> > struct timespec start_time; /* monotonic time */
> > struct timespec real_start_time; /* boot based time */
> > Index: linux-2.6-ozlabs/include/linux/taskstats.h
> > ===================================================================
> > --- linux-2.6-ozlabs.orig/include/linux/taskstats.h
> > +++ linux-2.6-ozlabs/include/linux/taskstats.h
> > @@ -31,7 +31,7 @@
> > */
> >
> >
> > -#define TASKSTATS_VERSION 5
> > +#define TASKSTATS_VERSION 6
> > #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
> > * in linux/sched.h */
> >
> > @@ -142,6 +142,10 @@ struct taskstats {
> > __u64 write_char; /* bytes written */
> > __u64 read_syscalls; /* read syscalls */
> > __u64 write_syscalls; /* write syscalls */
> > +
> > + /* time accounting for SMT machines */
> > + __u64 ac_utimescaled; /* utime scaled on frequency etc */
> > + __u64 ac_stimescaled; /* stime scaled on frequency etc */
> > /* Extended accounting fields end */
> >
>
> I'd also request for you to add a cpu_scaled_run_real_total for use
> by delay accounting. cpu_scaled_run_real_total should be similar in
> functionality to cpu_run_real_total.

Will do. Should I add cpu_scaled_run_real_total to the end of the
struct taskstat, or next to cpu_run_real_total?

>
> > #define TASKSTATS_HAS_IO_ACCOUNTING
> > Index: linux-2.6-ozlabs/kernel/fork.c
> > ===================================================================
> > --- linux-2.6-ozlabs.orig/kernel/fork.c
> > +++ linux-2.6-ozlabs/kernel/fork.c
> > @@ -1045,6 +1045,8 @@ static struct task_struct *copy_process(
> >
> > p->utime = cputime_zero;
> > p->stime = cputime_zero;
> > + p->utimescaled = cputime_zero;
> > + p->stimescaled = cputime_zero;
> >
> > #ifdef CONFIG_TASK_XACCT
> > p->rchar = 0; /* I/O counter: bytes read */
> > Index: linux-2.6-ozlabs/kernel/sched.c
> > ===================================================================
> > --- linux-2.6-ozlabs.orig/kernel/sched.c
> > +++ linux-2.6-ozlabs/kernel/sched.c
> > @@ -3249,6 +3249,16 @@ void account_user_time(struct task_struc
> > }
> >
> > /*
> > + * Account scaled user cpu time to a process.
> > + * @p: the process that the cpu time gets accounted to
> > + * @cputime: the cpu time spent in user space since the last update
> > + */
> > +void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
> > +{
> > + p->utimescaled = cputime_add(p->utimescaled, cputime);
> > +}
> > +
> > +/*
> > * Account system cpu time to a process.
> > * @p: the process that the cpu time gets accounted to
> > * @hardirq_offset: the offset to subtract from hardirq_count()
> > @@ -3280,6 +3290,17 @@ void account_system_time(struct task_str
> > }
> >
> > /*
> > + * Account scaled system cpu time to a process.
> > + * @p: the process that the cpu time gets accounted to
> > + * @hardirq_offset: the offset to subtract from hardirq_count()
> > + * @cputime: the cpu time spent in kernel space since the last update
> > + */
> > +void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
> > +{
> > + p->stimescaled = cputime_add(p->stimescaled, cputime);
> > +}
> > +
> > +/*
> > * Account for involuntary wait time.
> > * @p: the process from which the cpu time has been stolen
> > * @steal: the cpu time spent in involuntary wait
> > Index: linux-2.6-ozlabs/kernel/timer.c
> > ===================================================================
> > --- linux-2.6-ozlabs.orig/kernel/timer.c
> > +++ linux-2.6-ozlabs/kernel/timer.c
> > @@ -826,10 +826,13 @@ void update_process_times(int user_tick)
> > int cpu = smp_processor_id();
> >
> > /* Note: this timer irq context must be accounted for as well. */
> > - if (user_tick)
> > + if (user_tick) {
> > account_user_time(p, jiffies_to_cputime(1));
> > - else
> > + account_user_time_scaled(p, jiffies_to_cputime(1));
> > + } else {
> > account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
> > + account_system_time_scaled(p, jiffies_to_cputime(1));
> > + }
>
> I am a little confused here, scaled accounting and regular accounting
> go hand in hand?

We need to account for scaled and normal time in this generic code.
All other calls to account_(user|system)_time are in arch code.

>
> > run_local_timers();
> > if (rcu_pending(cpu))
> > rcu_check_callbacks(cpu, user_tick);
> > Index: linux-2.6-ozlabs/kernel/tsacct.c
> > ===================================================================
> > --- linux-2.6-ozlabs.orig/kernel/tsacct.c
> > +++ linux-2.6-ozlabs/kernel/tsacct.c
> > @@ -62,6 +62,10 @@ void bacct_add_tsk(struct taskstats *sta
> > rcu_read_unlock();
> > stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
> > stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
> > + stats->ac_utimescaled =
> > + cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
> > + stats->ac_stimescaled =
> > + cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
> > stats->ac_minflt = tsk->min_flt;
> > stats->ac_majflt = tsk->maj_flt;
> >
>
>
> --
> Warm Regards,
> Balbir Singh
> Linux Technology Center
> IBM, ISTL
>

2007-08-17 01:09:57

by Michael Neuling

[permalink] [raw]
Subject: [PATCH 1/2] Add scaled time to taskstats based process accounting

This adds items to the taststats struct to account for user and system
time based on scaling the CPU frequency and instruction issue rates.

Adds account_(user|system)_time_scaled callbacks which architectures
can use to account for time using this mechanism.

Signed-off-by: Michael Neuling <[email protected]>
---
Updated based on comments from Balbir

include/linux/kernel_stat.h | 2 ++
include/linux/sched.h | 2 +-
include/linux/taskstats.h | 11 +++++++++--
kernel/delayacct.c | 6 ++++++
kernel/fork.c | 2 ++
kernel/sched.c | 21 +++++++++++++++++++++
kernel/timer.c | 7 +++++--
kernel/tsacct.c | 4 ++++
8 files changed, 50 insertions(+), 5 deletions(-)

Index: linux-2.6-ozlabs/include/linux/kernel_stat.h
===================================================================
--- linux-2.6-ozlabs.orig/include/linux/kernel_stat.h
+++ linux-2.6-ozlabs/include/linux/kernel_stat.h
@@ -52,7 +52,9 @@ static inline int kstat_irqs(int irq)
}

extern void account_user_time(struct task_struct *, cputime_t);
+extern void account_user_time_scaled(struct task_struct *, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t);
+extern void account_system_time_scaled(struct task_struct *, cputime_t);
extern void account_steal_time(struct task_struct *, cputime_t);

#endif /* _LINUX_KERNEL_STAT_H */
Index: linux-2.6-ozlabs/include/linux/sched.h
===================================================================
--- linux-2.6-ozlabs.orig/include/linux/sched.h
+++ linux-2.6-ozlabs/include/linux/sched.h
@@ -1020,7 +1020,7 @@ struct task_struct {
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

unsigned int rt_priority;
- cputime_t utime, stime;
+ cputime_t utime, stime, utimescaled, stimescaled;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
Index: linux-2.6-ozlabs/include/linux/taskstats.h
===================================================================
--- linux-2.6-ozlabs.orig/include/linux/taskstats.h
+++ linux-2.6-ozlabs/include/linux/taskstats.h
@@ -31,7 +31,7 @@
*/


-#define TASKSTATS_VERSION 5
+#define TASKSTATS_VERSION 6
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */

@@ -85,9 +85,12 @@ struct taskstats {
* On some architectures, value will adjust for cpu time stolen
* from the kernel in involuntary waits due to virtualization.
* Value is cumulative, in nanoseconds, without a corresponding count
- * and wraps around to zero silently on overflow
+ * and wraps around to zero silently on overflow. The
+ * _scaled_ version accounts for cpus which can scale the
+ * number of instructions executed each cycle.
*/
__u64 cpu_run_real_total;
+ __u64 cpu_scaled_run_real_total;

/* cpu "virtual" running time
* Uses time intervals seen by the kernel i.e. no adjustment
@@ -142,6 +145,10 @@ struct taskstats {
__u64 write_char; /* bytes written */
__u64 read_syscalls; /* read syscalls */
__u64 write_syscalls; /* write syscalls */
+
+ /* time accounting for SMT machines */
+ __u64 ac_utimescaled; /* utime scaled on frequency etc */
+ __u64 ac_stimescaled; /* stime scaled on frequency etc */
/* Extended accounting fields end */

#define TASKSTATS_HAS_IO_ACCOUNTING
Index: linux-2.6-ozlabs/kernel/delayacct.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/delayacct.c
+++ linux-2.6-ozlabs/kernel/delayacct.c
@@ -115,6 +115,12 @@ int __delayacct_add_tsk(struct taskstats
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;

+ tmp = (s64)d->cpu_scaled_run_real_total;
+ cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ tmp += timespec_to_ns(&ts);
+ d->cpu_scaled_run_real_total =
+ (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
+
/*
* No locking available for sched_info (and too expensive to add one)
* Mitigate by taking snapshot of values
Index: linux-2.6-ozlabs/kernel/fork.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/fork.c
+++ linux-2.6-ozlabs/kernel/fork.c
@@ -1045,6 +1045,8 @@ static struct task_struct *copy_process(

p->utime = cputime_zero;
p->stime = cputime_zero;
+ p->utimescaled = cputime_zero;
+ p->stimescaled = cputime_zero;

#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
Index: linux-2.6-ozlabs/kernel/sched.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/sched.c
+++ linux-2.6-ozlabs/kernel/sched.c
@@ -3249,6 +3249,16 @@ void account_user_time(struct task_struc
}

/*
+ * Account scaled user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ */
+void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
+{
+ p->utimescaled = cputime_add(p->utimescaled, cputime);
+}
+
+/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3280,6 +3290,17 @@ void account_system_time(struct task_str
}

/*
+ * Account scaled system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ */
+void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+{
+ p->stimescaled = cputime_add(p->stimescaled, cputime);
+}
+
+/*
* Account for involuntary wait time.
* @p: the process from which the cpu time has been stolen
* @steal: the cpu time spent in involuntary wait
Index: linux-2.6-ozlabs/kernel/timer.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/timer.c
+++ linux-2.6-ozlabs/kernel/timer.c
@@ -826,10 +826,13 @@ void update_process_times(int user_tick)
int cpu = smp_processor_id();

/* Note: this timer irq context must be accounted for as well. */
- if (user_tick)
+ if (user_tick) {
account_user_time(p, jiffies_to_cputime(1));
- else
+ account_user_time_scaled(p, jiffies_to_cputime(1));
+ } else {
account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+ account_system_time_scaled(p, jiffies_to_cputime(1));
+ }
run_local_timers();
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_tick);
Index: linux-2.6-ozlabs/kernel/tsacct.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/tsacct.c
+++ linux-2.6-ozlabs/kernel/tsacct.c
@@ -62,6 +62,10 @@ void bacct_add_tsk(struct taskstats *sta
rcu_read_unlock();
stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+ stats->ac_utimescaled =
+ cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
+ stats->ac_stimescaled =
+ cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;

2007-08-17 04:48:29

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

Michael Neuling wrote:
>> I'd also request for you to add a cpu_scaled_run_real_total for use
>> by delay accounting. cpu_scaled_run_real_total should be similar in
>> functionality to cpu_run_real_total.
>
> Will do. Should I add cpu_scaled_run_real_total to the end of the
> struct taskstat, or next to cpu_run_real_total?
>

Please add it to the end, that helps maintain binary compatibility
across all versions of taskstats.

>>> /* Note: this timer irq context must be accounted for as well. */
>>> - if (user_tick)
>>> + if (user_tick) {
>>> account_user_time(p, jiffies_to_cputime(1));
>>> - else
>>> + account_user_time_scaled(p, jiffies_to_cputime(1));
>>> + } else {
>>> account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
>>> + account_system_time_scaled(p, jiffies_to_cputime(1));
>>> + }
>> I am a little confused here, scaled accounting and regular accounting
>> go hand in hand?
>
> We need to account for scaled and normal time in this generic code.
> All other calls to account_(user|system)_time are in arch code.
>

So the assumption here is that we ran at full frequency during
this time, is my understanding correct?

--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL

2007-08-17 04:57:25

by Michael Neuling

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

In message <[email protected]> you wrote:
> Michael Neuling wrote:
> >> I'd also request for you to add a cpu_scaled_run_real_total for use
> >> by delay accounting. cpu_scaled_run_real_total should be similar in
> >> functionality to cpu_run_real_total.
> >
> > Will do. Should I add cpu_scaled_run_real_total to the end of the
> > struct taskstat, or next to cpu_run_real_total?
> >
>
> Please add it to the end, that helps maintain binary compatibility
> across all versions of taskstats.

OK

>
> >>> /* Note: this timer irq context must be accounted for as well. */
> >>> - if (user_tick)
> >>> + if (user_tick) {
> >>> account_user_time(p, jiffies_to_cputime(1));
> >>> - else
> >>> + account_user_time_scaled(p, jiffies_to_cputime(1));
> >>> + } else {
> >>> account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
> >>> + account_system_time_scaled(p, jiffies_to_cputime(1));
> >>> + }
> >> I am a little confused here, scaled accounting and regular accounting
> >> go hand in hand?
> >
> > We need to account for scaled and normal time in this generic code.
> > All other calls to account_(user|system)_time are in arch code.
> >
>
> So the assumption here is that we ran at full frequency during
> this time, is my understanding correct?

Yes.

I guess we could keep a per CPU last scaling factor for this case
(similar to what we are storing in the POWERPC paca)

Mikey

2007-08-17 17:11:11

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

On Fri, Aug 17, 2007 at 08:22:40AM +1000, Paul Mackerras wrote:
> Linas Vepstas writes:
>
> > My gut impression (maybe wrong?) is that the scaled time is,
> > in a certain sense, "more accurate" than the unscaled time.
>
> The "unscaled" time is just time, as in "how many seconds did this
> task spend on the CPU". It's what all the tools (except a certain
> proprietary workload manager) expect. Top, ps, etc. get unhappy if
> the times reported (user, system, hardirq, softirq, idle, stolen)
> don't add up to elapsed wall-clock time.

OK, so to keep the tools happy, the total time needs to add up
to wall-clock time. Which tells me that the "scaled idle time"
should be defined as "wall clock time minus the other stuff".

> The "scaled" time is really CPU cycles divided by some arbitrary
> factor (the notional CPU frequency). So yes it does give some
> indication of how much progress the task should have made, in some
> sense.

Yes, good, that's what I was expecting. As a sysadmin and/or
back-of-the-envelope performance person, I would certainly like
to have ps and top report the scaled time. When I do "performance
tuning", I almost always can get away with quick-n-dirty use of
vmstat and top, and only rarely have to descend into more complex
tools. I'd hate to loose this quick-n-dirty utility, which,
again ... my gut impression is that these numbers suddenly turn
mostly meaningless.

That is, if I run the same task 3 times over the next few hours,
will vmstat/top/ps report more or less he same figures? I'm
concerned that they won't ... that I'll see different values
come out, depending on whether the chip is overheating, or whether
some other partition is stealing, or whatever causes this thing to
dynamically scale.

> Both measures are useful. Because the current user API is in terms of
> real time rather than cycles, we have to continue reporting real time,
> not scaled time, which is why the existing interfaces report unscaled
> time, and the scaled time values are reported through a new extension
> to the taskstats interface.

This begs the question of "what is the real, actual elapsed time?"
... currently, the "real time" depends very much on how often your
process got scheduled -- but, if your process is scheduled but
(due to scaling) isn't "actually running", should that count towards
the "real time"?

---
I supose that its inevitable that this stuff will get more complex;
I'm just trying to make sure we don't end up doing this backwards,
and deciding to change it around later.

I already notice that "stolen time" is causing confusion in some
areas. Its disconcerting to have lots of cores, and lots of threads
per core, only to find that some of your time has been "stolen".
I'm still wondering ... was this the right way to report this?

--linas

2007-08-17 19:00:52

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

On Fri, 17 Aug 2007 11:09:41 +1000
Michael Neuling <[email protected]> wrote:

> This adds items to the taststats struct to account for user and system
> time based on scaling the CPU frequency and instruction issue rates.
>
> Adds account_(user|system)_time_scaled callbacks which architectures
> can use to account for time using this mechanism.
>
> ...
>
> Index: linux-2.6-ozlabs/include/linux/kernel_stat.h
> ===================================================================
> --- linux-2.6-ozlabs.orig/include/linux/kernel_stat.h
> +++ linux-2.6-ozlabs/include/linux/kernel_stat.h
> @@ -52,7 +52,9 @@ static inline int kstat_irqs(int irq)
> }
>
> extern void account_user_time(struct task_struct *, cputime_t);
> +extern void account_user_time_scaled(struct task_struct *, cputime_t);
> extern void account_system_time(struct task_struct *, int, cputime_t);
> +extern void account_system_time_scaled(struct task_struct *, cputime_t);
> extern void account_steal_time(struct task_struct *, cputime_t);
>
> #endif /* _LINUX_KERNEL_STAT_H */
> Index: linux-2.6-ozlabs/include/linux/sched.h
> ===================================================================
> --- linux-2.6-ozlabs.orig/include/linux/sched.h
> +++ linux-2.6-ozlabs/include/linux/sched.h
> @@ -1020,7 +1020,7 @@ struct task_struct {
> int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
>
> unsigned int rt_priority;
> - cputime_t utime, stime;
> + cputime_t utime, stime, utimescaled, stimescaled;

Adding 8 or 16 bytes to the task_struct for all architectures for something
which only powerpc uses?

Is there any prospect that other CPUs can use this?

> unsigned long nvcsw, nivcsw; /* context switch counts */
> struct timespec start_time; /* monotonic time */
> struct timespec real_start_time; /* boot based time */
> Index: linux-2.6-ozlabs/include/linux/taskstats.h
> ===================================================================
> --- linux-2.6-ozlabs.orig/include/linux/taskstats.h
> +++ linux-2.6-ozlabs/include/linux/taskstats.h
> @@ -31,7 +31,7 @@
> */
>
>
> -#define TASKSTATS_VERSION 5
> +#define TASKSTATS_VERSION 6
> #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
> * in linux/sched.h */
>
> @@ -85,9 +85,12 @@ struct taskstats {
> * On some architectures, value will adjust for cpu time stolen
> * from the kernel in involuntary waits due to virtualization.
> * Value is cumulative, in nanoseconds, without a corresponding count
> - * and wraps around to zero silently on overflow
> + * and wraps around to zero silently on overflow. The
> + * _scaled_ version accounts for cpus which can scale the
> + * number of instructions executed each cycle.
> */
> __u64 cpu_run_real_total;
> + __u64 cpu_scaled_run_real_total;
>
> /* cpu "virtual" running time
> * Uses time intervals seen by the kernel i.e. no adjustment
> @@ -142,6 +145,10 @@ struct taskstats {
> __u64 write_char; /* bytes written */
> __u64 read_syscalls; /* read syscalls */
> __u64 write_syscalls; /* write syscalls */
> +
> + /* time accounting for SMT machines */
> + __u64 ac_utimescaled; /* utime scaled on frequency etc */
> + __u64 ac_stimescaled; /* stime scaled on frequency etc */
> /* Extended accounting fields end */

umm, should we be adding new fields in the middle of this message? I
thought we should only add to the end, for back-compatibility, but maybe I
misremember.


2007-08-17 19:08:39

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

Andrew Morton wrote:
> On Fri, 17 Aug 2007 11:09:41 +1000
> Michael Neuling <[email protected]> wrote:
>
>> This adds items to the taststats struct to account for user and system
>> time based on scaling the CPU frequency and instruction issue rates.
>>
>> Adds account_(user|system)_time_scaled callbacks which architectures
>> can use to account for time using this mechanism.
>>
>> ...
>>
>> Index: linux-2.6-ozlabs/include/linux/kernel_stat.h
>> ===================================================================
>> --- linux-2.6-ozlabs.orig/include/linux/kernel_stat.h
>> +++ linux-2.6-ozlabs/include/linux/kernel_stat.h
>> @@ -52,7 +52,9 @@ static inline int kstat_irqs(int irq)
>> }
>>
>> extern void account_user_time(struct task_struct *, cputime_t);
>> +extern void account_user_time_scaled(struct task_struct *, cputime_t);
>> extern void account_system_time(struct task_struct *, int, cputime_t);
>> +extern void account_system_time_scaled(struct task_struct *, cputime_t);
>> extern void account_steal_time(struct task_struct *, cputime_t);
>>
>> #endif /* _LINUX_KERNEL_STAT_H */
>> Index: linux-2.6-ozlabs/include/linux/sched.h
>> ===================================================================
>> --- linux-2.6-ozlabs.orig/include/linux/sched.h
>> +++ linux-2.6-ozlabs/include/linux/sched.h
>> @@ -1020,7 +1020,7 @@ struct task_struct {
>> int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
>>
>> unsigned int rt_priority;
>> - cputime_t utime, stime;
>> + cputime_t utime, stime, utimescaled, stimescaled;
>
> Adding 8 or 16 bytes to the task_struct for all architectures for something
> which only powerpc uses?
>
> Is there any prospect that other CPUs can use this?
>
>> unsigned long nvcsw, nivcsw; /* context switch counts */
>> struct timespec start_time; /* monotonic time */
>> struct timespec real_start_time; /* boot based time */
>> Index: linux-2.6-ozlabs/include/linux/taskstats.h
>> ===================================================================
>> --- linux-2.6-ozlabs.orig/include/linux/taskstats.h
>> +++ linux-2.6-ozlabs/include/linux/taskstats.h
>> @@ -31,7 +31,7 @@
>> */
>>
>>
>> -#define TASKSTATS_VERSION 5
>> +#define TASKSTATS_VERSION 6
>> #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
>> * in linux/sched.h */
>>
>> @@ -85,9 +85,12 @@ struct taskstats {
>> * On some architectures, value will adjust for cpu time stolen
>> * from the kernel in involuntary waits due to virtualization.
>> * Value is cumulative, in nanoseconds, without a corresponding count
>> - * and wraps around to zero silently on overflow
>> + * and wraps around to zero silently on overflow. The
>> + * _scaled_ version accounts for cpus which can scale the
>> + * number of instructions executed each cycle.
>> */
>> __u64 cpu_run_real_total;
>> + __u64 cpu_scaled_run_real_total;
>>
>> /* cpu "virtual" running time
>> * Uses time intervals seen by the kernel i.e. no adjustment
>> @@ -142,6 +145,10 @@ struct taskstats {
>> __u64 write_char; /* bytes written */
>> __u64 read_syscalls; /* read syscalls */
>> __u64 write_syscalls; /* write syscalls */
>> +
>> + /* time accounting for SMT machines */
>> + __u64 ac_utimescaled; /* utime scaled on frequency etc */
>> + __u64 ac_stimescaled; /* stime scaled on frequency etc */
>> /* Extended accounting fields end */
>
> umm, should we be adding new fields in the middle of this message? I
> thought we should only add to the end, for back-compatibility, but maybe I
> misremember.
>


You remember correctly, I've asked Michael to make those changes.


--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL

2007-08-19 08:57:20

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting

Andrew Morton wrote:
>>
>> unsigned int rt_priority;
>> - cputime_t utime, stime;
>> + cputime_t utime, stime, utimescaled, stimescaled;
>
> Adding 8 or 16 bytes to the task_struct for all architectures for something
> which only powerpc uses?
>
> Is there any prospect that other CPUs can use this?
>

Hi, Andrew,

There is definitely the prospect for other architectures to use this
feature

x86 provides the APERF and MPERF model specific registers.
The ratio of APERF to MPERF gives the current scaled load on the
system (acpi-cpufreq, get_measured_perf()) I have been looking at
exploiting this functionality for x-series, but ran into a problem;
as per the specification, APERF and MPERF are to be reset to 0
upon reading them. As a result, I am still figuring out a good
way to share the data amongst the ondemand governor and utimescaled
statistics.

I think for now, we can

1. Put utimescaled and stimescaled under an #ifdef for ARCH_POWERPC
2. Add utimescaled and stimescaled and add a big fat comment stating
that work for other architectures is on it's way.

In either case, I think the functionality is useful and can be
exploited by other architectures. The powerpc port is complete and
I think the implementation would provide a good reference for
other implementations to follow.

--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL

2007-08-19 13:12:43

by Michael Neuling

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add scaled time to taskstats based process accounting



In message <[email protected]> you wrote:
> Andrew Morton wrote:
> >>
> >> unsigned int rt_priority;
> >> - cputime_t utime, stime;
> >> + cputime_t utime, stime, utimescaled, stimescaled;
> >
> > Adding 8 or 16 bytes to the task_struct for all architectures for something
> > which only powerpc uses?
> >
> > Is there any prospect that other CPUs can use this?
> >
>
> Hi, Andrew,
>
> There is definitely the prospect for other architectures to use this
> feature
>
> x86 provides the APERF and MPERF model specific registers.
> The ratio of APERF to MPERF gives the current scaled load on the
> system (acpi-cpufreq, get_measured_perf()) I have been looking at
> exploiting this functionality for x-series, but ran into a problem;
> as per the specification, APERF and MPERF are to be reset to 0
> upon reading them. As a result, I am still figuring out a good
> way to share the data amongst the ondemand governor and utimescaled
> statistics.
>
> I think for now, we can
>
> 1. Put utimescaled and stimescaled under an #ifdef for ARCH_POWERPC

... or even #ifdef TASKSTATS

> 2. Add utimescaled and stimescaled and add a big fat comment stating
> that work for other architectures is on it's way.
>
> In either case, I think the functionality is useful and can be
> exploited by other architectures. The powerpc port is complete and
> I think the implementation would provide a good reference for
> other implementations to follow.
>
> --
> Warm Regards,
> Balbir Singh
> Linux Technology Center
> IBM, ISTL
>