2010-11-11 17:08:24

by Michael Holzheu

Subject: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

From: Michael Holzheu <[email protected]>

Currently steal time is only accounted for the whole system. With this
patch we add steal time to the per task CPU time accounting.
The triplet "user time", "system time" and "steal time" represents
all consumed CPU time on hypervisor based systems.

Signed-off-by: Michael Holzheu <[email protected]>
---
arch/s390/kernel/vtime.c    |   19 +++++++++++--------
fs/proc/array.c             |    6 +++---
include/linux/kernel_stat.h |    2 +-
include/linux/sched.h       |   14 ++++++++------
include/linux/taskstats.h   |    1 +
kernel/exit.c               |    9 +++++++--
kernel/fork.c               |    1 +
kernel/posix-cpu-timers.c   |    3 +++
kernel/sched.c              |   26 ++++++++++++++++++++------
kernel/sys.c                |   10 +++++-----
kernel/tsacct.c             |    1 +
11 files changed, 61 insertions(+), 31 deletions(-)

--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -56,31 +56,34 @@ static void do_account_vtime(struct task
{
struct thread_info *ti = task_thread_info(tsk);
__u64 timer, clock, user, system, steal;
+ unsigned char clk[16];

timer = S390_lowcore.last_update_timer;
clock = S390_lowcore.last_update_clock;
asm volatile (" STPT %0\n" /* Store current cpu timer value */
- " STCK %1" /* Store current tod clock value */
+ " STCKE 0(%2)" /* Store current tod clock value */
: "=m" (S390_lowcore.last_update_timer),
- "=m" (S390_lowcore.last_update_clock) );
+ "=m" (clk) : "a" (clk));
+ S390_lowcore.last_update_clock = *(__u64 *) &clk[1];
+ tsk->acct_time = ((clock - sched_clock_base_cc) * 125) >> 9;
S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock;

user = S390_lowcore.user_timer - ti->user_timer;
- S390_lowcore.steal_timer -= user;
ti->user_timer = S390_lowcore.user_timer;
account_user_time(tsk, user, user);

system = S390_lowcore.system_timer - ti->system_timer;
- S390_lowcore.steal_timer -= system;
ti->system_timer = S390_lowcore.system_timer;
account_system_time(tsk, hardirq_offset, system, system);

steal = S390_lowcore.steal_timer;
- if ((s64) steal > 0) {
- S390_lowcore.steal_timer = 0;
- account_steal_time(steal);
- }
+ S390_lowcore.steal_timer = 0;
+ if (steal >= user + system)
+ steal -= user + system;
+ else
+ steal = 0;
+ account_steal_time(tsk, steal);
}

void account_vtime(struct task_struct *prev, struct task_struct *next)
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -375,7 +375,7 @@ static int do_task_stat(struct seq_file
unsigned long long start_time;
unsigned long cmin_flt = 0, cmaj_flt = 0;
unsigned long min_flt = 0, maj_flt = 0;
- cputime_t cutime, cstime, utime, stime;
+ cputime_t cutime, cstime, utime, stime, sttime;
cputime_t cgtime, gtime;
unsigned long rsslim = 0;
char tcomm[sizeof(task->comm)];
@@ -432,7 +432,7 @@ static int do_task_stat(struct seq_file

min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- thread_group_times(task, &utime, &stime);
+ thread_group_times(task, &utime, &stime, &sttime);
gtime = cputime_add(gtime, sig->gtime);
}

@@ -448,7 +448,7 @@ static int do_task_stat(struct seq_file
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_times(task, &utime, &stime);
+ task_times(task, &utime, &stime, &sttime);
gtime = task->gtime;
}

--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -116,7 +116,7 @@ extern unsigned long long task_delta_exe

extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
-extern void account_steal_time(cputime_t);
+extern void account_steal_time(struct task_struct *, cputime_t);
extern void account_idle_time(cputime_t);

extern void account_process_tick(struct task_struct *, int user);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -470,6 +470,7 @@ struct cpu_itimer {
struct task_cputime {
cputime_t utime;
cputime_t stime;
+ cputime_t sttime;
unsigned long long sum_exec_runtime;
};
/* Alternate field names when used to cache expirations. */
@@ -481,6 +482,7 @@ struct task_cputime {
(struct task_cputime) { \
.utime = cputime_zero, \
.stime = cputime_zero, \
+ .sttime = cputime_zero, \
.sum_exec_runtime = 0, \
}

@@ -582,11 +584,11 @@ struct signal_struct {
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
*/
- cputime_t utime, stime, cutime, cstime;
+ cputime_t utime, stime, sttime, cutime, cstime, csttime;
cputime_t gtime;
cputime_t cgtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- cputime_t prev_utime, prev_stime;
+ cputime_t prev_utime, prev_stime, prev_sttime;
#endif
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
@@ -1294,10 +1296,10 @@ struct task_struct {
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

- cputime_t utime, stime, utimescaled, stimescaled;
+ cputime_t utime, stime, sttime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- cputime_t prev_utime, prev_stime;
+ cputime_t prev_utime, prev_stime, prev_sttime;
#endif
unsigned long long acct_time; /* Time for last accounting */
unsigned long nvcsw, nivcsw; /* context switch counts */
@@ -1694,8 +1696,8 @@ static inline void put_task_struct(struc
__put_task_struct(t);
}

-extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
-extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st, cputime_t *stt);
+extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st, cputime_t *stt);

/*
* Per process flags
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -168,6 +168,7 @@ struct taskstats {
/* Timestamp where data has been collected in ns since boot time */
__u64 time_ns;
__u32 ac_tgid; /* Thread group ID */
+ __u64 ac_sttime; /* Steal CPU time [usec] */
};


--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -124,6 +124,7 @@ static void __exit_signal(struct task_st
*/
sig->utime = cputime_add(sig->utime, tsk->utime);
sig->stime = cputime_add(sig->stime, tsk->stime);
+ sig->sttime = cputime_add(sig->sttime, tsk->sttime);
sig->gtime = cputime_add(sig->gtime, tsk->gtime);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
@@ -1228,7 +1229,7 @@ static int wait_task_zombie(struct wait_
struct signal_struct *psig;
struct signal_struct *sig;
unsigned long maxrss;
- cputime_t tgutime, tgstime;
+ cputime_t tgutime, tgstime, tgsttime;

/*
* The resource counters for the group leader are in its
@@ -1249,7 +1250,7 @@ static int wait_task_zombie(struct wait_
* group, which consolidates times for all threads in the
* group including the group leader.
*/
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_times(p, &tgutime, &tgstime, &tgsttime);
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
@@ -1261,6 +1262,10 @@ static int wait_task_zombie(struct wait_
cputime_add(psig->cstime,
cputime_add(tgstime,
sig->cstime));
+ psig->csttime =
+ cputime_add(psig->csttime,
+ cputime_add(tgsttime,
+ sig->csttime));
psig->cgtime =
cputime_add(psig->cgtime,
cputime_add(p->gtime,
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1062,6 +1062,7 @@ static struct task_struct *copy_process(

p->utime = cputime_zero;
p->stime = cputime_zero;
+ p->sttime = cputime_zero;
p->gtime = cputime_zero;
p->utimescaled = cputime_zero;
p->stimescaled = cputime_zero;
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -237,6 +237,7 @@ void thread_group_cputime(struct task_st

times->utime = sig->utime;
times->stime = sig->stime;
+ times->sttime = sig->sttime;
times->sum_exec_runtime = sig->sum_sched_runtime;

rcu_read_lock();
@@ -248,6 +249,7 @@ void thread_group_cputime(struct task_st
do {
times->utime = cputime_add(times->utime, t->utime);
times->stime = cputime_add(times->stime, t->stime);
+ times->sttime = cputime_add(times->sttime, t->sttime);
times->sum_exec_runtime += t->se.sum_exec_runtime;
} while_each_thread(tsk, t);
out:
@@ -1276,6 +1278,7 @@ static inline int fastpath_timer_check(s
struct task_cputime task_sample = {
.utime = tsk->utime,
.stime = tsk->stime,
+ .sttime = tsk->sttime,
.sum_exec_runtime = tsk->se.sum_exec_runtime
};

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3542,11 +3542,15 @@ void account_system_time(struct task_str
* Account for involuntary wait time.
* @steal: the cpu time spent in involuntary wait
*/
-void account_steal_time(cputime_t cputime)
+void account_steal_time(struct task_struct *p, cputime_t cputime)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
cputime64_t cputime64 = cputime_to_cputime64(cputime);

+ /* Add steal time to process. */
+ p->sttime = cputime_add(p->sttime, cputime);
+
+ /* Add steal time to cpustat. */
cpustat->steal = cputime64_add(cpustat->steal, cputime64);
}

@@ -3594,7 +3598,7 @@ void account_process_tick(struct task_st
*/
void account_steal_ticks(unsigned long ticks)
{
- account_steal_time(jiffies_to_cputime(ticks));
+ account_steal_time(current, jiffies_to_cputime(ticks));
}

/*
@@ -3612,13 +3616,16 @@ void account_idle_ticks(unsigned long ti
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st,
+ cputime_t *stt)
{
*ut = p->utime;
*st = p->stime;
+ *stt = p->sttime;
}

-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st,
+ cputime_t *stt)
{
struct task_cputime cputime;

@@ -3626,6 +3633,7 @@ void thread_group_times(struct task_stru

*ut = cputime.utime;
*st = cputime.stime;
+ *stt = cputime.sttime;
}
#else

@@ -3633,7 +3641,8 @@ void thread_group_times(struct task_stru
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif

-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st,
+ cputime_t *stt)
{
cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);

@@ -3656,15 +3665,18 @@ void task_times(struct task_struct *p, c
*/
p->prev_utime = max(p->prev_utime, utime);
p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
+ p->prev_sttime = cputime_zero;

*ut = p->prev_utime;
*st = p->prev_stime;
+ *stt = p->prev_sttime;
}

/*
* Must be called with siglock held.
*/
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st,
+ cputime_t *stt)
{
struct signal_struct *sig = p->signal;
struct task_cputime cputime;
@@ -3687,9 +3699,11 @@ void thread_group_times(struct task_stru
sig->prev_utime = max(sig->prev_utime, utime);
sig->prev_stime = max(sig->prev_stime,
cputime_sub(rtime, sig->prev_utime));
+ sig->prev_sttime = cputime_zero;

*ut = sig->prev_utime;
*st = sig->prev_stime;
+ *stt = sig->prev_sttime;
}
#endif

--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,10 +880,10 @@ change_okay:

void do_sys_times(struct tms *tms)
{
- cputime_t tgutime, tgstime, cutime, cstime;
+ cputime_t tgutime, tgstime, tgsttime, cutime, cstime;

spin_lock_irq(&current->sighand->siglock);
- thread_group_times(current, &tgutime, &tgstime);
+ thread_group_times(current, &tgutime, &tgstime, &tgsttime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
spin_unlock_irq(&current->sighand->siglock);
@@ -1488,14 +1488,14 @@ static void k_getrusage(struct task_stru
{
struct task_struct *t;
unsigned long flags;
- cputime_t tgutime, tgstime, utime, stime;
+ cputime_t tgutime, tgstime, tgsttime, utime, stime, sttime;
unsigned long maxrss = 0;

memset((char *) r, 0, sizeof *r);
utime = stime = cputime_zero;

if (who == RUSAGE_THREAD) {
- task_times(current, &utime, &stime);
+ task_times(current, &utime, &stime, &sttime);
accumulate_thread_rusage(p, r);
maxrss = p->signal->maxrss;
goto out;
@@ -1521,7 +1521,7 @@ static void k_getrusage(struct task_stru
break;

case RUSAGE_SELF:
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_times(p, &tgutime, &tgstime, &tgsttime);
utime = cputime_add(utime, tgutime);
stime = cputime_add(stime, tgstime);
r->ru_nvcsw += p->signal->nvcsw;
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -66,6 +66,7 @@ void bacct_add_tsk(struct taskstats *sta
rcu_read_unlock();
stats->ac_utime = cputime_to_usecs(tsk->utime);
stats->ac_stime = cputime_to_usecs(tsk->stime);
+ stats->ac_sttime = cputime_to_usecs(tsk->sttime);
stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
stats->ac_minflt = tsk->min_flt;
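
A usage sketch (not part of the patch): once a struct taskstats for a pid has
been fetched over the taskstats netlink interface, e.g. with the getdelays.c
sample from the kernel documentation, the new field reads just like the
existing ac_utime/ac_stime pair. This assumes a linux/taskstats.h from a
kernel with this patch applied, so that ac_sttime exists:

#include <stdio.h>
#include <linux/taskstats.h>

/* Print the per-task cpu time triplet; all fields are in microseconds.
 * ac_sttime is only present with this patch applied. */
static void print_cpu_triplet(const struct taskstats *ts)
{
	printf("user  : %llu us\n", (unsigned long long)ts->ac_utime);
	printf("system: %llu us\n", (unsigned long long)ts->ac_stime);
	printf("steal : %llu us\n", (unsigned long long)ts->ac_sttime);
}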


2010-11-13 19:38:46

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Thu, 2010-11-11 at 18:03 +0100, Michael Holzheu wrote:
> From: Michael Holzheu <[email protected]>
>
> Currently steal time is only accounted for the whole system. With this
> patch we add steal time to the per task CPU time accounting.
> The triplet "user time", "system time" and "steal time" represents
> all consumed CPU time on hypervisor based systems.

Does that really make sense? It's not like the hypervisor really knows
anything about tasks and won't steal from one? It's really a vcpu
feature.

What added benefit will all this extra accounting give?

2010-11-15 14:51:04

by Martin Schwidefsky

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Sat, 13 Nov 2010 20:38:02 +0100
Peter Zijlstra <[email protected]> wrote:

> On Thu, 2010-11-11 at 18:03 +0100, Michael Holzheu wrote:
> > From: Michael Holzheu <[email protected]>
> >
> > Currently steal time is only accounted for the whole system. With this
> > patch we add steal time to the per task CPU time accounting.
> > The triplet "user time", "system time" and "steal time" represents
> > all consumed CPU time on hypervisor based systems.
>
> Does that really make sense? It's not like the hypervisor really knows
> anything about tasks and won't steal from one? It's really a vcpu
> feature.
>
> What added benefit will all this extra accounting give?

Currently the Linux kernel keeps track of used cpu cycles per task;
steal time is reported only per cpu. With the patch steal cycles are
reported per task just like used cpu cycles, giving the complete picture
on a per-task basis. Without the patch you don't know if the task has
been waiting or got its cycles stolen. A matter of granularity.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2010-11-15 15:11:44

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 2010-11-15 at 15:50 +0100, Martin Schwidefsky wrote:
> On Sat, 13 Nov 2010 20:38:02 +0100
> Peter Zijlstra <[email protected]> wrote:
>
> > On Thu, 2010-11-11 at 18:03 +0100, Michael Holzheu wrote:
> > > From: Michael Holzheu <[email protected]>
> > >
> > > Currently steal time is only accounted for the whole system. With this
> > > patch we add steal time to the per task CPU time accounting.
> > > The triplet "user time", "system time" and "steal time" represents
> > > all consumed CPU time on hypervisor based systems.
> >
> > Does that really make sense? It's not like the hypervisor really knows
> > anything about tasks and won't steal from one? It's really a vcpu
> > feature.
> >
> > What added benefit will all this extra accounting give?
>
> Currently the Linux kernel keeps track of used cpu cycles per task;
> steal time is reported only per cpu. With the patch steal cycles are
> reported per task just like used cpu cycles, giving the complete picture
> on a per-task basis. Without the patch you don't know if the task has
> been waiting or got its cycles stolen. A matter of granularity.

That doesn't answer my question at all. Why do you want to know? Also,
once we change the scheduler to not account steal time to tasks like it
currently does (as Jeremy has been proposing to do several times now),
this should become totally redundant as it will always be 0, no?

Thing is, all I'm seeing is overhead here; the vast majority of systems
simply don't have any steal time at all. So again, what does this buy us
except a gazillion wasted bytes and cycles?

2010-11-15 17:42:14

by Martin Schwidefsky

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 15 Nov 2010 16:11:23 +0100
Peter Zijlstra <[email protected]> wrote:

> On Mon, 2010-11-15 at 15:50 +0100, Martin Schwidefsky wrote:
> > On Sat, 13 Nov 2010 20:38:02 +0100
> > Peter Zijlstra <[email protected]> wrote:
> >
> > > On Thu, 2010-11-11 at 18:03 +0100, Michael Holzheu wrote:
> > > > From: Michael Holzheu <[email protected]>
> > > >
> > > > Currently steal time is only accounted for the whole system. With this
> > > > patch we add steal time to the per task CPU time accounting.
> > > > The triplet "user time", "system time" and "steal time" represents
> > > > all consumed CPU time on hypervisor based systems.
> > >
> > > Does that really make sense? It's not like the hypervisor really knows
> > > anything about tasks and won't steal from one? It's really a vcpu
> > > feature.
> > >
> > > What added benefit will all this extra accounting give?
> >
> > Currently the Linux kernel keeps track of used cpu cycles per task;
> > steal time is reported only per cpu. With the patch steal cycles are
> > reported per task just like used cpu cycles, giving the complete picture
> > on a per-task basis. Without the patch you don't know if the task has
> > been waiting or got its cycles stolen. A matter of granularity.
>
> That doesn't answer my question at all. Why do you want to know? Also,
> once we change the scheduler to not account steal time to tasks like it
> currently does (as Jeremy has been proposing to do several times now),
> this should become totally redundant as it will always be 0, no?

At least on s390 and powerpc we already do not account steal time to tasks;
the user and system times are "real" cpu time. I do not know if that is true for
ia64 as well, which is the third architecture with VIRT_CPU_ACCOUNTING=y.
The steal time of a task tells us how much more progress a task could have
made if the hypervisor did not steal cpu. Now you could argue that the
steal time for a cpu is good enough for that purpose, but steal time is not
necessarily uniform over all tasks. And we already do calculate this number;
we just do not store it right now.
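
As a rough illustration of what that number means for a task, a minimal
sketch assuming the microsecond taskstats fields this series exposes (the
helper name is made up):

/* Fraction of extra runtime the task lost to the hypervisor.
 * E.g. user=300ms, system=100ms, steal=100ms -> 0.25, i.e. the task
 * could have made about 25% more progress without steal. */
static double steal_ratio(unsigned long long ac_utime,
			  unsigned long long ac_stime,
			  unsigned long long ac_sttime)
{
	unsigned long long serviced = ac_utime + ac_stime;

	return serviced ? (double)ac_sttime / (double)serviced : 0.0;
}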

> Thing is, all I'm seeing is overhead here; the vast majority of systems
> simply don't have any steal time at all. So again, what does this buy us
> except a gazillion wasted bytes and cycles?

There are 40 bytes more in the task structure and a few instructions more
in account_steal_time. I would not call that gazillions of wasted bytes and
cycles. The overhead is minimal. Would you prefer another #ifdef orgy to
avoid the overhead for VIRT_CPU_ACCOUNTING=n? We can certainly do that.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2010-11-15 17:46:21

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 2010-11-15 at 18:42 +0100, Martin Schwidefsky wrote:
> I do not know if that is true for
> ia64 as well, which is the third architecture with VIRT_CPU_ACCOUNTING=y.

IIRC ia64 uses VIRT_CPU_ACCOUNTING to get high-res task accounting; it
was done before CFS came around and used ns-based accounting for all
tasks.

2010-11-15 17:50:19

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 2010-11-15 at 18:42 +0100, Martin Schwidefsky wrote:
> Would you prefer another #ifdef orgy to
> avoid the overhead for VIRT_CPU_ACCOUNTING=n? We can certainly do
> that.
>
That still doesn't answer the question: why do you want this? What is
per-task steal time accounting good for? What problem is being solved?

2010-11-15 17:50:44

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 2010-11-15 at 18:42 +0100, Martin Schwidefsky wrote:
> There are 40 bytes more in the task structure and a few instructions more
> in account_steal_time. I would not call that gazillions of wasted bytes and
> cycles.

Taken over all machines running Linux on real hardware that's gobs of
memory wasted and lots of cycles spent adding 0s.

2010-11-15 17:51:36

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 2010-11-15 at 18:42 +0100, Martin Schwidefsky wrote:
> The steal time of a task tells us how much more progress a task could have
> made if the hypervisor did not steal cpu. Now you could argue that the
> steal time for a cpu is good enough for that purpose, but steal time is not
> necessarily uniform over all tasks. And we already do calculate this number;
> we just do not store it right now.

If you make the scheduler take steal time into account like Jeremy
proposed then you schedule on serviced time and the steal time gain is
proportional to the existing service distribution.

Still, once you know, then what are you going to do about it? Are you
going to keep the hypervisor from scheduling when that one task is
running?

What good is knowing something you cannot do anything about?

2010-11-15 17:59:29

by Martin Schwidefsky

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 15 Nov 2010 18:50:41 +0100
Peter Zijlstra <[email protected]> wrote:

> On Mon, 2010-11-15 at 18:42 +0100, Martin Schwidefsky wrote:
> > The steal time of a task tells us how much more progress a task could have
> > made if the hypervisor did not steal cpu. Now you could argue that the
> > steal time for a cpu is good enough for that purpose, but steal time is not
> > necessarily uniform over all tasks. And we already do calculate this number;
> > we just do not store it right now.
>
> If you make the scheduler take steal time into account like Jeremy
> proposed then you schedule on serviced time and the steal time gain is
> proportional to the existing service distribution.
>
> Still, once you know, then what are you going to do about it? Are you
> going to keep the hypervisor from scheduling when that one task is
> running?
>
> What good is knowing something you cannot do anything about?

Steal time per task is at least good for performance problem analysis.
Sometimes knowing what is not the cause of a performance problem can help you
tremendously. If a task is slow and has no steal time, well then the hypervisor
is likely not the culprit. On the other hand, if you do see lots of steal time
for a task while the rest of the system doesn't cause any, that can tell
you something as well. That task might hit a specific function which causes
hypervisor overhead. The usefulness depends on the situation; it is another
data point which may or may not help you.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2010-11-15 18:10:45

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 2010-11-15 at 18:59 +0100, Martin Schwidefsky wrote:
> Steal time per task is at least good for performance problem analysis.
> Sometimes knowing what is not the cause of a performance problem can help you
> tremendously. If a task is slow and has no steal time, well then the hypervisor
> is likely not the culprit. On the other hand, if you do see lots of steal time
> for a task while the rest of the system doesn't cause any, that can tell
> you something as well. That task might hit a specific function which causes
> hypervisor overhead. The usefulness depends on the situation; it is another
> data point which may or may not help you.

If performance analysis is the only reason, why not add a tracepoint on
vcpu enter that reports the duration the vcpu was out for and use perf
to gather said data? It can tell you what process was running and what
instruction it was at when the vcpu went away.

No need to add 40 bytes per task for that.
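
A sketch of the kind of tracepoint being suggested; the event and its field
are hypothetical (nothing like it exists in the tree), and it presumes the
guest can learn, on vcpu re-entry, how long the vcpu was gone:

/* Hypothetical event in the usual include/trace/events/ style.  Fired on
 * vcpu re-entry with the time the vcpu was away; perf then attributes
 * each sample to the task and instruction that were active. */
TRACE_EVENT(vcpu_steal,

	TP_PROTO(u64 stolen_ns),

	TP_ARGS(stolen_ns),

	TP_STRUCT__entry(
		__field(u64, stolen_ns)
	),

	TP_fast_assign(
		__entry->stolen_ns = stolen_ns;
	),

	TP_printk("stolen %llu ns", (unsigned long long)__entry->stolen_ns)
);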

2010-11-16 08:51:08

by Martin Schwidefsky

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Mon, 15 Nov 2010 19:08:44 +0100
Peter Zijlstra <[email protected]> wrote:

> On Mon, 2010-11-15 at 18:59 +0100, Martin Schwidefsky wrote:
> > Steal time per task is at least good for performance problem analysis.
> > Sometimes knowing what is not the cause of a performance problem can help you
> > tremendously. If a task is slow and has no steal time, well then the hypervisor
> > is likely not the culprit. On the other hand, if you do see lots of steal time
> > for a task while the rest of the system doesn't cause any, that can tell
> > you something as well. That task might hit a specific function which causes
> > hypervisor overhead. The usefulness depends on the situation; it is another
> > data point which may or may not help you.
>
> If performance analysis is the only reason, why not add a tracepoint on
> vcpu enter that reports the duration the vcpu was out for and use perf
> to gather said data? It can tell you what process was running and what
> instruction it was at when the vcpu went away.
>
> No need to add 40 bytes per task for that.

Which vcpu enter? We usually have z/VM as our hypervisor and want to be able
to do performance analysis with the data we gather inside the guest. There
is no vcpu enter we could put a tracepoint on. We would have to put tracepoints
on every possible interaction point with z/VM to get this data. To me it seems
a lot simpler to add the per-task steal time.

And if it is really the additional 40 bytes on x86 that bother you so much,
we could put them behind #ifdef CONFIG_VIRT_CPU_ACCOUNTING. There already
is one in the task_struct for prev_utime and prev_stime.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2010-11-16 12:16:25

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Tue, 2010-11-16 at 09:51 +0100, Martin Schwidefsky wrote:
> On Mon, 15 Nov 2010 19:08:44 +0100
> Peter Zijlstra <[email protected]> wrote:
>
> > On Mon, 2010-11-15 at 18:59 +0100, Martin Schwidefsky wrote:
> > > Steal time per task is at least good for performance problem analysis.
> > > Sometimes knowing what is not the cause of a performance problem can help you
> > > tremendously. If a task is slow and has no steal time, well then the hypervisor
> > > is likely not the culprit. On the other hand, if you do see lots of steal time
> > > for a task while the rest of the system doesn't cause any, that can tell
> > > you something as well. That task might hit a specific function which causes
> > > hypervisor overhead. The usefulness depends on the situation; it is another
> > > data point which may or may not help you.
> >
> > If performance analysis is the only reason, why not add a tracepoint on
> > vcpu enter that reports the duration the vcpu was out for and use perf
> > to gather said data? It can tell you what process was running and what
> > instruction it was at when the vcpu went away.
> >
> > No need to add 40 bytes per task for that.
>
> Which vcpu enter? We usually have z/VM as our hypervisor and want to be able
> to do performance analysis with the data we gather inside the guest. There
> is no vcpu enter we could put a tracepoint on. We would have to put tracepoints
> on every possible interaction point with z/VM to get this data. To me it seems
> a lot simpler to add the per-task steal time.

Oh, you guys don't have a hypercall wrapper to exploit? Because from
what I heard from the kvm/xen/lguest people I gathered they could in
fact do something like I proposed.

In fact, kvm seems to already have these tracepoints: kvm_exit/kvm_entry
and it has a separate explicit hypercall tracepoint as well:
kvm_hypercall.

Except that the per-task steal time gives you a lot less detail; being
able to profile on vcpu exit/enter gives you a much more powerful
performance tool. Aside from being able to measure the steal-time, it
allows you to instantly find hypercalls (both explicit and
implicit), so you can also measure the hypercall-induced steal-time as
well.

> And if it is really the additional 40 bytes on x86 that bother you so much,
> we could put them behind #ifdef CONFIG_VIRT_CPU_ACCOUNTING. There already
> is one in the task_struct for prev_utime and prev_stime.

Making it configurable would definitely help the embedded people; not
sure about VIRT_CPU_ACCOUNTING though. I bet the x86 virt weird^Wpeople
would like it too -- if only to strive for feature parity if nothing
else :/

It's just that I'm not at all convinced it's the best approach to solve
the problem posed, and once it's committed we're stuck with it due to
ABI.

We should be very careful not to die a death of a thousand cuts with all
this accounting madness; there's way too much weird-ass process
accounting junk that adds ABI constraints as it is.

I think it's definitely worth investing extra time to implement these
tracepoints if at all possible on your architecture before committing
yourself to something like this.

2010-11-16 15:33:33

by Martin Schwidefsky

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Tue, 16 Nov 2010 13:16:08 +0100
Peter Zijlstra <[email protected]> wrote:

> On Tue, 2010-11-16 at 09:51 +0100, Martin Schwidefsky wrote:
> > On Mon, 15 Nov 2010 19:08:44 +0100
> > Peter Zijlstra <[email protected]> wrote:
> >
> > > On Mon, 2010-11-15 at 18:59 +0100, Martin Schwidefsky wrote:
> > > > Steal time per task is at least good for performance problem analysis.
> > > > Sometimes knowing what is not the cause of a performance problem can help you
> > > > tremendously. If a task is slow and has no steal time, well then the hypervisor
> > > > is likely not the culprit. On the other hand, if you do see lots of steal time
> > > > for a task while the rest of the system doesn't cause any, that can tell
> > > > you something as well. That task might hit a specific function which causes
> > > > hypervisor overhead. The usefulness depends on the situation; it is another
> > > > data point which may or may not help you.
> > >
> > > If performance analysis is the only reason, why not add a tracepoint on
> > > vcpu enter that reports the duration the vcpu was out for and use perf
> > > to gather said data? It can tell you what process was running and what
> > > instruction it was at when the vcpu went away.
> > >
> > > No need to add 40 bytes per task for that.
> >
> > Which vcpu enter? We usually have z/VM as our hypervisor and want to be able
> > to do performance analysis with the data we gather inside the guest. There
> > is no vcpu enter we could put a tracepoint on. We would have to put tracepoints
> > on every possible interaction point with z/VM to get this data. To me it seems
> > a lot simpler to add the per-task steal time.
>
> Oh, you guys don't have a hypercall wrapper to exploit? Because from
> what I heard from the kvm/xen/lguest people I gathered they could in
> fact do something like I proposed.

We could do something along the lines of a hypercall wrapper (we would call
it a diagnose wrapper, same thing). The diagnoses we have on s390 do vastly
different things, so it is not easy to have a common diagnose wrapper.
It would be easier to add a tracepoint to each diagnose inline assembly.
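
For instance, a sketch only (the trace_s390_diag_* tracepoints are made-up
names), around the diagnose 0x44 voluntary time-slice yield:

/* Hypothetical per-diagnose tracepoints around one diagnose call;
 * diag 0,0,0x44 yields the remaining time slice to the hypervisor. */
static inline void diag44_yield(void)
{
	trace_s390_diag_enter(0x44);
	asm volatile("diag 0,0,0x44");
	trace_s390_diag_exit(0x44);
}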

> In fact, kvm seems to already have these tracepoints: kvm_exit/kvm_entry
> > and it has a separate explicit hypercall tracepoint as well:
> kvm_hypercall.

But the kvm tracepoints are used when Linux is the hypervisor, no? For our
situation that would be a tracepoint in z/VM - or the equivalent. This is
out of scope of this patch.

> Except that the per-task steal time gives you a lot less detail; being
> able to profile on vcpu exit/enter gives you a much more powerful
> performance tool. Aside from being able to measure the steal-time, it
> allows you to instantly find hypercalls (both explicit and
> implicit), so you can also measure the hypercall-induced steal-time as
> well.

Yes and no. The tracepoint idea looks interesting in itself. But that does
not completely replace the per-task steal time. The hypervisor can take
away the cpu anytime; it is still interesting to know which task was hit
hardest by that. You could view the cpu time lost by a hypercall as
"synchronous" steal time for the task, and the remaining delta to the total
per-task steal time as "asynchronous" steal time.
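
In code the decomposition would look roughly like this; illustrative only,
do_hypercall() stands in for any explicit diagnose/hypercall wrapper, and
sttime is the per-task field added by this patch (steal is folded into it
at the next accounting point):

/* "Synchronous" steal: accrued across an explicit hypercall. */
static cputime_t traced_hypercall(void)
{
	cputime_t before = current->sttime;

	do_hypercall();

	return cputime_sub(current->sttime, before);
}

/* "Asynchronous" steal is whatever remains of the task's total:
 *   async = current->sttime - sum of all synchronous deltas */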

> > And if it is really the additional 40 bytes on x86 that bother you so much,
> > we could put them behind #ifdef CONFIG_VIRT_CPU_ACCOUNTING. There already
> > is one in the task_struct for prev_utime and prev_stime.
>
> Making it configurable would definitely help the embedded people; not
> sure about VIRT_CPU_ACCOUNTING though. I bet the x86 virt weird^Wpeople
> would like it too -- if only to strive for feature parity if nothing
> else :/
>
> It's just that I'm not at all convinced it's the best approach to solve
> the problem posed, and once it's committed we're stuck with it due to
> ABI.
>
> We should be very careful not to die a death of a thousand cuts with all
> this accounting madness; there's way too much weird-ass process
> accounting junk that adds ABI constraints as it is.
>
> I think it's definitely worth investing extra time to implement these
> tracepoints if at all possible on your architecture before committing
> yourself to something like this.

I don't think it is either per-task steal time or tracepoints. Ideally
we'd have both. But I understand that you want to be careful about
committing to an ABI. From my viewpoint per-task steal time is a logical
extension; the data it is based on is already there.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2010-11-16 15:45:40

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Tue, 2010-11-16 at 16:33 +0100, Martin Schwidefsky wrote:
> > Oh, you guys don't have a hypercall wrapper to exploit? Because from
> > what I heard from the kvm/xen/lguest people I gathered they could in
> > fact do something like I proposed.
>
> We could do something along the lines of a hypercall wrapper (we would call
> it a diagnose wrapper, same thing). The diagnoses we have on s390 do vastly
> different things, so it is not easy to have a common diagnose wrapper.
> It would be easier to add a tracepoint to each diagnose inline assembly.

Right, bummer that.

> > In fact, kvm seems to already have these tracepoints: kvm_exit/kvm_entry
> > and it has a separate explicit hypercall tracepoint as well:
> > kvm_hypercall.
>
> But the kvm tracepoints are used when Linux is the hypervisor, no? For our
> situation that would be a tracepoint in z/VM - or the equivalent. This is
> out of scope of this patch.

Ah crud, you might be right... Avi, could a kvm guest generate events on
vcpu enter/exit?

> > Except that the per-task steal time gives you a lot less detail; being
> > able to profile on vcpu exit/enter gives you a much more powerful
> > performance tool. Aside from being able to measure the steal-time, it
> > allows you to instantly find hypercalls (both explicit and
> > implicit), so you can also measure the hypercall-induced steal-time as
> > well.
>
> Yes and no. The tracepoint idea looks interesting in itself. But that does
> not completely replace the per-task steal time. The hypervisor can take
> away the cpu anytime; it is still interesting to know which task was hit
> hardest by that. You could view the cpu time lost by a hypercall as
> "synchronous" steal time for the task, and the remaining delta to the total
> per-task steal time as "asynchronous" steal time.

Right, so there is no way the guest knows about the vcpu getting
scheduled; it can only infer it from hardware clocks after the
fact?

2010-11-16 16:05:56

by Martin Schwidefsky

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Tue, 16 Nov 2010 16:45:29 +0100
Peter Zijlstra <[email protected]> wrote:

> > > Except that the per-task steal time gives you a lot less detail; being
> > > able to profile on vcpu exit/enter gives you a much more powerful
> > > performance tool. Aside from being able to measure the steal-time, it
> > > allows you to instantly find hypercalls (both explicit and
> > > implicit), so you can also measure the hypercall-induced steal-time as
> > > well.
> >
> > Yes and no. The tracepoint idea looks interesting in itself. But that does
> > not completely replace the per-task steal time. The hypervisor can take
> > away the cpu anytime; it is still interesting to know which task was hit
> > hardest by that. You could view the cpu time lost by a hypercall as
> > "synchronous" steal time for the task, and the remaining delta to the total
> > per-task steal time as "asynchronous" steal time.
>
> Right, so there is no way the guest knows about the vcpu getting
> scheduled; it can only infer it from hardware clocks after the
> fact?

Correct.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2010-11-16 16:39:42

by Avi Kivity

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On 11/16/2010 05:45 PM, Peter Zijlstra wrote:
>
>>> In fact, kvm seems to already have these tracepoints: kvm_exit/kvm_entry
>>> and it has a separate explicit hypercall tracepoint as well:
>>> kvm_hypercall.
>> But the kvm tracepoints are used when Linux is the hypervisor, no? For our
>> situation that would be a tracepoint in z/VM - or the equivalent. This is
>> out of scope of this patch.
> Ah crud, you might be right... Avi, could a kvm guest generate events on
> vcpu enter/exit?

No. Hypercalls are voluntary and known, but most exits are involuntary
and unknown to the guest. Any memory access can generate a page fault,
and any host interrupt will exit the guest.

2010-11-16 16:43:45

by Peter Zijlstra

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On Tue, 2010-11-16 at 18:38 +0200, Avi Kivity wrote:
> On 11/16/2010 05:45 PM, Peter Zijlstra wrote:
> >
> >>> In fact, kvm seems to already have these tracepoints: kvm_exit/kvm_entry
> >>> and it has a separate explicit hypercall tracepoint as well:
> >>> kvm_hypercall.
> >> But the kvm tracepoints are used when Linux is the hypervisor, no? For our
> >> situation that would be a tracepoint in z/VM - or the equivalent. This is
> >> out of scope of this patch.
> > Ah crud, you might be right... Avi, could a kvm guest generate events on
> > vcpu enter/exit?
>
> No. Hypercalls are voluntary and known, but most exits are involuntary
> and unknown to the guest. Any memory access can generate a page fault,
> and any host interrupt will exit the guest.

Right, but could we not make the guest jump to a special stub on vcpu
enter? I guess we could, simply because we have the hypervisor under
control.

2010-11-16 16:58:12

by Avi Kivity

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On 11/16/2010 06:43 PM, Peter Zijlstra wrote:
>
>> No. Hypercalls are voluntary and known, but most exits are involuntary
>> and unknown to the guest. Any memory access can generate a page fault,
>> and any host interrupt will exit the guest.
> Right, but could we not make the guest jump to a special stub on vcpu
> enter? I guess we could, simply because we have the hypervisor under
> control.

No, some entries inject an interrupt or exception, so the next rip value
is unknown (without doing a lot of extra slow calculations).

2010-11-16 17:07:57

by Avi Kivity

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On 11/16/2010 06:56 PM, Avi Kivity wrote:
> On 11/16/2010 06:43 PM, Peter Zijlstra wrote:
>>
>>> No. Hypercalls are voluntary and known, but most exits are involuntary
>>> and unknown to the guest. Any memory access can generate a page fault,
>>> and any host interrupt will exit the guest.
>> Right, but could we not make the guest jump to a special stub on vcpu
>> enter? I guess we could, simply because we have the hypervisor under
>> control.
>
> No, some entries inject an interrupt or exception, so the next rip
> value is unknown (without doing a lot of extra slow calculations).

Also, consider an exit within the stub (if the stub is paged out, for
example).

--
error compiling committee.c: too many arguments to function

2010-11-16 18:39:30

by Jeremy Fitzhardinge

Subject: Re: [RFC][PATCH v2 4/7] taskstats: Add per task steal time accounting

On 11/16/2010 08:05 AM, Martin Schwidefsky wrote:
>>> Yes and no. The tracepoint idea looks interesting in itself. But that does
>>> not completely replace the per-task steal time. The hypervisor can take
>>> away the cpu anytime; it is still interesting to know which task was hit
>>> hardest by that. You could view the cpu time lost by a hypercall as
>>> "synchronous" steal time for the task, and the remaining delta to the total
>>> per-task steal time as "asynchronous" steal time.
>> Right, so there is no way the guest knows about the vcpu getting
>> scheduled; it can only infer it from hardware clocks after the
>> fact?
> Correct.

Yes, same for Xen.

J