Hi Ingo,
Here is v4 of the cpuacct statistics patch to obtain per-cgroup
system and user times, with appropriate tags and a complete changelog.
This applies against the latest -tip plus the cpuacct hierarchy fix v2
which I posted just now. Could you please include this in -tip?
Changelog:
v4
- Remove comments in cpuacct_update_stats() which explained why rcu_read_lock()
was needed (as per Peter Zijlstra's review comments).
- Don't say that percpu_counter_read() is broken in Documentation/cpuacct.txt
as per KAMEZAWA Hiroyuki's review comments.
v3
- Fix a small race in the cpuacct hierarchy walk.
v2
- stime and utime now exported in clock_t units instead of msecs.
- Addressed the code review comments from Balbir and Li Zefan.
- Moved to -tip tree.
v1
- Moved the stime/utime accounting to cpuacct controller.
Earlier versions
- http://lkml.org/lkml/2009/2/25/129
Regards,
Bharata.
cpuacct: Add stime and utime statistics
Add per-cgroup cpuacct controller statistics, namely the system and user
time consumed by the group's tasks.
Signed-off-by: Bharata B Rao <[email protected]>
Signed-off-by: Balaji Rao <[email protected]>
CC: Dhaval Giani <[email protected]>
CC: Balbir Singh <[email protected]>
CC: Li Zefan <[email protected]>
CC: Paul Menage <[email protected]>
CC: Andrew Morton <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: KAMEZAWA Hiroyuki <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Acked-by: Balbir Singh <[email protected]>
Tested-by: Balbir Singh <[email protected]>
---
Documentation/cgroups/cpuacct.txt | 18 +++++++
kernel/sched.c | 88 +++++++++++++++++++++++++++++++++++---
2 files changed, 100 insertions(+), 6 deletions(-)
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,21 @@ The above steps create a new group g1 an
process (bash) into it. CPU time consumed by this bash and its children
can be obtained from g1/cpuacct.usage and the same is accumulated in
/cgroups/cpuacct.usage also.
+
+The cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+utime: Time spent by tasks of the cgroup in user mode.
+stime: Time spent by tasks of the cgroup in kernel mode.
+
+utime and stime are in USER_HZ units.
+
+The cpuacct controller uses the percpu_counter interface to collect utime
+and stime. This causes two side effects:
+
+- It is theoretically possible to see wrong values for stime and utime.
+ This is because percpu_counter_read() on 32bit systems isn't safe
+ against concurrent writes.
+- It is possible to see slightly outdated values for stime and utime
+ due to the batch processing nature of percpu_counter.
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1442,10 +1442,22 @@ iter_move_one_task(struct rq *this_rq, i
struct rq_iterator *iterator);
#endif
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_UTIME, /* ... user mode */
+ CPUACCT_STAT_STIME, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val) {}
#endif
static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4375,6 +4387,8 @@ void account_user_time(struct task_struc
cpustat->nice = cputime64_add(cpustat->nice, tmp);
else
cpustat->user = cputime64_add(cpustat->user, tmp);
+
+ cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
@@ -4436,6 +4450,8 @@ void account_system_time(struct task_str
else
cpustat->system = cputime64_add(cpustat->system, tmp);
+ cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime);
+
/* Account for system time used */
acct_update_integrals(p);
}
@@ -9724,6 +9740,7 @@ struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
+ struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};
@@ -9748,20 +9765,33 @@ static struct cgroup_subsys_state *cpuac
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ int i;
if (!ca)
- return ERR_PTR(-ENOMEM);
+ goto out;
ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage) {
- kfree(ca);
- return ERR_PTR(-ENOMEM);
- }
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ if (percpu_counter_init(&ca->cpustat[i], 0))
+ goto out_free_counters;
if (cgrp->parent)
ca->parent = cgroup_ca(cgrp->parent);
return &ca->css;
+
+out_free_counters:
+	while (--i >= 0)
+		percpu_counter_destroy(&ca->cpustat[i]);
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
}
/* destroy an existing cpu accounting group */
@@ -9769,7 +9799,10 @@ static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ percpu_counter_destroy(&ca->cpustat[i]);
free_percpu(ca->cpuusage);
kfree(ca);
}
@@ -9856,6 +9889,25 @@ static int cpuacct_percpu_seq_read(struc
return 0;
}
+static const char *cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_UTIME] = "utime",
+ [CPUACCT_STAT_STIME] = "stime",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+ s64 val = percpu_counter_read(&ca->cpustat[i]);
+ val = cputime_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[i], val);
+ }
+ return 0;
+}
+
static struct cftype files[] = {
{
.name = "usage",
@@ -9866,7 +9918,10 @@ static struct cftype files[] = {
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
},
-
+ {
+ .name = "stat",
+ .read_map = cpuacct_stats_show,
+ },
};
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9900,6 +9955,27 @@ static void cpuacct_charge(struct task_s
rcu_read_unlock();
}
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val)
+{
+ struct cpuacct *ca;
+
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+
+ rcu_read_lock();
+ ca = task_ca(tsk);
+
+ do {
+ percpu_counter_add(&ca->cpustat[idx], val);
+ ca = ca->parent;
+ } while (ca);
+ rcu_read_unlock();
+}
+
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.create = cpuacct_create,
On Mon, 23 Mar 2009 10:05:38 +0530
Bharata B Rao <[email protected]> wrote:
> [snip]
>
> v4
> - Don't say that percpu_counter_read() is broken in Documentation/cpuacct.txt
> as per KAMEZAWA Hiroyuki's review comments.
>
Broken -> isn't safe? No difference ;)

Can't this help you?
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
Index: mmotm-2.6.29-Mar21/include/linux/percpu_counter.h
===================================================================
--- mmotm-2.6.29-Mar21.orig/include/linux/percpu_counter.h
+++ mmotm-2.6.29-Mar21/include/linux/percpu_counter.h
@@ -62,6 +62,16 @@ static inline s64 percpu_counter_read(st
return fbc->count;
}
+static inline s64 percpu_counter_read_safe(struct percpu_counter *fbc)
+{
+ s64 ret;
+
+ spin_lock(&fbc->lock);
+ ret = fbc->count;
+ spin_unlock(&fbc->lock);
+ return ret;
+}
+
/*
* It is possible for the percpu_counter_read() to return a small negative
* number for some counter which should never be negative.
@@ -114,6 +124,11 @@ static inline s64 percpu_counter_read(st
return fbc->count;
}
+static inline s64 percpu_counter_read_safe(struct percpu_counter *fbc)
+{
+ return fbc->count;
+}
+
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
return fbc->count;
On Mon, Mar 23, 2009 at 01:55:28PM +0900, KAMEZAWA Hiroyuki wrote:
> On Mon, 23 Mar 2009 10:05:38 +0530
> Bharata B Rao <[email protected]> wrote:
>
> > Hi Ingo,
> >
> > Here is the v4 of the cpuacct statistics patch to obtain per-cgroup
> > system and user times with appropriate tags and complete changelog.
> > This applies against the latest -tip plus the cpuacct hierarchy fix v2
> > which I posted just now. Could you please include this in -tip ?
> >
> > Changelog:
> >
> > v4
> > - Remove comments in cpuacct_update_stats() which explained why rcu_read_lock()
> > was needed (as per Peter Zijlstra's review comments).
> > - Don't say that percpu_counter_read() is broken in Documentation/cpuacct.txt
> > as per KAMEZAWA Hiroyuki's review comments.
> >
> Broken -> isn't safe? No difference ;)

From your comment last time, I thought you meant calling it broken would
be harsh.

> Can't this help you?
>
> [percpu_counter_read_safe() patch snipped]
>
Yes, it helps, but for the record let me note that it makes the read side
~20% slower.
[I measured the time spent in cpuacct_stats_show(), which is part of the
read routine for cpuacct.stat, using a kretprobe. For 100 reads (i.e. 100
executions of cpuacct_stats_show()) I get the following numbers:

percpu_counter_read()      - 3166367 ns
percpu_counter_read_safe() - 3793546 ns, which is ~20% slower.]
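
For reference, the measurement module was roughly along the lines of the
kernel's kretprobe example -- a minimal sketch (assuming the symbol
cpuacct_stats_show is visible to kprobes; the printk-per-call reporting
here is simplified compared to what I actually summed up):

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>

/* Per-instance data: timestamp taken at function entry. */
struct probe_data {
	ktime_t entry;
};

static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct probe_data *d = (struct probe_data *)ri->data;

	d->entry = ktime_get();
	return 0;
}

static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct probe_data *d = (struct probe_data *)ri->data;
	s64 delta = ktime_to_ns(ktime_sub(ktime_get(), d->entry));

	printk(KERN_INFO "cpuacct_stats_show: %lld ns\n", delta);
	return 0;
}

static struct kretprobe stats_probe = {
	.entry_handler	= entry_handler,
	.handler	= ret_handler,
	.data_size	= sizeof(struct probe_data),
	.kp.symbol_name	= "cpuacct_stats_show",
};

static int __init probe_init(void)
{
	return register_kretprobe(&stats_probe);
}

static void __exit probe_exit(void)
{
	unregister_kretprobe(&stats_probe);
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");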
I would prefer that this patch be included in its current form and that
percpu_counter_read() be fixed separately, given that there has been an
attempt to fix it in the past (http://lkml.org/lkml/2008/8/27/303).

Moreover, I don't know how acceptable it is to work around the problem in
percpu_counter_read() by introducing another variant,
percpu_counter_read_safe().
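
For completeness, using the proposed helper would be a one-line change on
the read side. A sketch (untested, assuming your percpu_counter_read_safe()
patch above is applied):

static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
		struct cgroup_map_cb *cb)
{
	struct cpuacct *ca = cgroup_ca(cgrp);
	int i;

	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
		/* Locked read: consistent against concurrent writers
		 * even on 32-bit systems. */
		s64 val = percpu_counter_read_safe(&ca->cpustat[i]);

		val = cputime_to_clock_t(val);
		cb->fill(cb, cpuacct_stat_desc[i], val);
	}
	return 0;
}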
Regards,
Bharata.
On Mon, 23 Mar 2009 12:06:03 +0530
Bharata B Rao <[email protected]> wrote:
> [snip]
>
> Yes, it helps, but for the record let me note that it makes the read side
> ~20% slower.
>
> [kretprobe measurement details snipped]
>
> I would prefer that this patch be included in its current form and that
> percpu_counter_read() be fixed separately, given that there has been an
> attempt to fix it in the past (http://lkml.org/lkml/2008/8/27/303).
>
> Moreover, I don't know how acceptable it is to work around the problem in
> percpu_counter_read() by introducing another variant,
> percpu_counter_read_safe().
>
Hmm, ok, how about this way?

Remove this:

+- It is possible to see slightly outdated values for stime and utime
+  due to the batch processing nature of percpu_counter.

Anyway, we get the overhead of vfs_read(), interrupts, et al. ;)

And put the other issue into a TODO list:

+TODO:
+- It is theoretically possible to see wrong values for stime and utime.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes. Needs to be fixed.
Thanks,
-Kame
On Mon, Mar 23, 2009 at 04:01:12PM +0900, KAMEZAWA Hiroyuki wrote:
>
> Remove this:
>
> +- It is possible to see slightly outdated values for stime and utime
> + due to the batch processing nature of percpu_counter.
This explains why the stime and utime values for the top cgroup differ
from the ones seen in /proc/stat. Hence I would prefer to keep a mention
of it in the documentation.
[root@llm11 cgroups]# cat cpuacct.stat ; cat /proc/stat | grep cpu
utime 3110
stime 8410
cpu 3151 2 8336 4051594 2189 0 133 0 0
...
Regards,
Bharata.
On Mon, 23 Mar 2009 13:17:50 +0530
Bharata B Rao <[email protected]> wrote:
> [snip]
>
> This explains why the stime and utime values for the top cgroup differ
> from the ones seen in /proc/stat. Hence I would prefer to keep a mention
> of it in the documentation.
>
I see.
Thanks,
-Kame
On Mon, 23 Mar 2009 10:05:38 +0530 Bharata B Rao <[email protected]> wrote:
> +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
> + struct cgroup_map_cb *cb)
> +{
> + struct cpuacct *ca = cgroup_ca(cgrp);
> + int i;
> +
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
> + s64 val = percpu_counter_read(&ca->cpustat[i]);
> + val = cputime_to_clock_t(val);
> + cb->fill(cb, cpuacct_stat_desc[i], val);
> + }
> + return 0;
> +}
I'd have expected `val' to have type clock_t. But clock_t is 32-bit on
32-bit x86.
Is it correct to pass a 64-bit value to a function which takes a 32-bit
value and to then copy the 32-bit return value into a 64-bit variable?
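
To make the concern concrete, a toy user-space sketch of those conversions
(illustrative only; clock32_t stands in for clock_t on 32-bit x86, and the
out-of-range conversion behaves as truncation on typical systems):

#include <stdio.h>
#include <stdint.h>

typedef int32_t clock32_t;

/* Stand-in for cputime_to_clock_t(): takes and returns 32 bits. */
static clock32_t to_clock32(clock32_t x)
{
	return x;
}

int main(void)
{
	int64_t val = 0x100000001LL;	/* counter grown past 32 bits */
	int64_t out = to_clock32(val);	/* argument silently truncated */

	/* prints in=4294967297 out=1 */
	printf("in=%lld out=%lld\n", (long long)val, (long long)out);
	return 0;
}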
On Mon, Mar 23, 2009 at 04:21:23AM -0700, Andrew Morton wrote:
> On Mon, 23 Mar 2009 10:05:38 +0530 Bharata B Rao <[email protected]> wrote:
>
> > [cpuacct_stats_show() hunk snipped]
>
> I'd have expected `val' to have type clock_t. But clock_t is 32-bit on
> 32-bit x86.
>
> Is it correct to pass a 64-bit value to a function which takes a 32-bit
> value and to then copy the 32-bit return value into a 64-bit variable?
Yes, it doesn't look neat. Since the accumulated stat (val here) is of a
64-bit type, I could use cputime64_to_clock_t(val) in the above code.
Storing the resulting 32-bit clock_t in a 64-bit variable (which will be
used by cb->fill) should be fine, I believe. Let me know if I got this
right.
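
Something like this, i.e. only the conversion changes -- an untested
sketch of the revised hunk:

static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
		struct cgroup_map_cb *cb)
{
	struct cpuacct *ca = cgroup_ca(cgrp);
	int i;

	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
		s64 val = percpu_counter_read(&ca->cpustat[i]);

		/* 64-bit-aware conversion; the result still fits the
		 * s64 that cb->fill() takes. */
		val = cputime64_to_clock_t(val);
		cb->fill(cb, cpuacct_stat_desc[i], val);
	}
	return 0;
}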
Regards,
Bharata.