2009-03-10 12:42:17

by Bharata B Rao

[permalink] [raw]
Subject: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

Hi,

Based on the comments received during my last post
(http://lkml.org/lkml/2009/2/25/129), here is a fresh attempt
to get per-cgroup utime/stime statistics as part of cpuacct controller.

This patch adds a new file cpuacct.stat which displays two stats:
utime and stime. I wasn't too sure about the usefulness of providing
per-cgroup guest and steal times and hence not including them here.

Note that I am using percpu_counter for collecting these two stats.
Since percpu_counter subsystem doesn't protect the readside, readers could
theoretically obtain incorrect values for these stats on 32bit systems.
I hope occasional wrong values are not too much of a concern for
statistics like this. If it is a problem, we have to either fix
percpu_counter or do it all by ourselves as Kamezawa attempted
for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14)

Regards,
Bharata.

cpuacct: Add stime and utime statistics

Add per-cgroup cpuacct controller statistics like the system and user
time consumed by the group of tasks.

Signed-off-by: Bharata B Rao <[email protected]>
Signed-off-by: Balaji Rao <[email protected]>
---
Documentation/cgroups/cpuacct.txt | 8 +++
kernel/sched.c | 87 +++++++++++++++++++++++++++++++++++---
2 files changed, 89 insertions(+), 6 deletions(-)

--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,11 @@ The above steps create a new group g1 an
process (bash) into it. CPU time consumed by this bash and its children
can be obtained from g1/cpuacct.usage and the same is accumulated in
/cgroups/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+utime: Time in milliseconds spent by tasks of the cgroup in user mode.
+stime: Time in milliseconds spent by tasks of the cgroup in kernel mode.
+
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1393,10 +1393,22 @@ iter_move_one_task(struct rq *this_rq, i
struct rq_iterator *iterator);
#endif

+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_UTIME, /* ... user mode */
+ CPUACCT_STAT_STIME, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, int val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, int val) {}
#endif

static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4182,6 +4194,8 @@ void account_user_time(struct task_struc
cpustat->nice = cputime64_add(cpustat->nice, tmp);
else
cpustat->user = cputime64_add(cpustat->user, tmp);
+
+ cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime_to_msecs(cputime));
/* Account for user time used */
acct_update_integrals(p);
}
@@ -4243,6 +4257,8 @@ void account_system_time(struct task_str
else
cpustat->system = cputime64_add(cpustat->system, tmp);

+ cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime_to_msecs(cputime));
+
/* Account for system time used */
acct_update_integrals(p);
}
@@ -9438,6 +9454,7 @@ struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
+ struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};

@@ -9462,20 +9479,33 @@ static struct cgroup_subsys_state *cpuac
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ int i;

if (!ca)
- return ERR_PTR(-ENOMEM);
+ goto out1;

ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage) {
- kfree(ca);
- return ERR_PTR(-ENOMEM);
- }
+ if (!ca->cpuusage)
+ goto out2;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ if (percpu_counter_init(&ca->cpustat[i], 0))
+ goto out3;

if (cgrp->parent)
ca->parent = cgroup_ca(cgrp->parent);

return &ca->css;
+
+out3:
+ while (i-- > 0)
+ percpu_counter_destroy(&ca->cpustat[i]);
+ free_percpu(ca->cpuusage);
+out2:
+ kfree(ca);
+out1:
+ return ERR_PTR(-ENOMEM);
}

/* destroy an existing cpu accounting group */
@@ -9483,7 +9513,10 @@ static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;

+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ percpu_counter_destroy(&ca->cpustat[i]);
free_percpu(ca->cpuusage);
kfree(ca);
}
@@ -9570,6 +9603,28 @@ static int cpuacct_percpu_seq_read(struc
return 0;
}

+static const struct cpuacct_stat_desc {
+ const char *msg;
+ u64 unit;
+} cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_UTIME] = { "utime", 1, },
+ [CPUACCT_STAT_STIME] = { "stime", 1, },
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+ s64 val = percpu_counter_read(&ca->cpustat[i]);
+ val *= cpuacct_stat_desc[i].unit;
+ cb->fill(cb, cpuacct_stat_desc[i].msg, val);
+ }
+ return 0;
+}
+
static struct cftype files[] = {
{
.name = "usage",
@@ -9580,7 +9635,10 @@ static struct cftype files[] = {
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
},
-
+ {
+ .name = "stat",
+ .read_map = cpuacct_stats_show,
+ },
};

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9610,6 +9668,23 @@ static void cpuacct_charge(struct task_s
}
}

+/*
+ * Account the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, int val)
+{
+ struct cpuacct *ca;
+
+ if (!cpuacct_subsys.active)
+ return;
+
+ ca = task_ca(tsk);
+
+ for (; ca; ca = ca->parent)
+ percpu_counter_add(&ca->cpustat[idx], val);
+}
+
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.create = cpuacct_create,


2009-03-11 00:39:49

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

On Tue, 10 Mar 2009 18:12:08 +0530
Bharata B Rao <[email protected]> wrote:

> Hi,
>
> Based on the comments received during my last post
> (http://lkml.org/lkml/2009/2/25/129), here is a fresh attempt
> to get per-cgroup utime/stime statistics as part of cpuacct controller.
>
> This patch adds a new file cpuacct.stat which displays two stats:
> utime and stime. I wasn't too sure about the usefulness of providing
> per-cgroup guest and steal times and hence not including them here.
>
> Note that I am using percpu_counter for collecting these two stats.
> Since percpu_counter subsystem doesn't protect the readside, readers could
> theoritically obtain incorrect values for these stats on 32bit systems.

Using percpu_counter_read() means that .. but is it okay to ignore "batch"
number ? (see FBC_BATCH)


> I hope occasional wrong values is not too much of a concern for
> statistics like this. If it is a problem, we have to either fix
> percpu_counter or do it all by ourselves as Kamezawa attempted
> for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14)
>
Hmm, percpu_counter_sum() is bad ?

BTW, I'm not sure but don't we need special handling if
CONFIG_VIRT_CPU_ACCOUNTING=y ?


Thanks,
-Kame


> Regards,
> Bharata.
>
> cpuacct: Add stime and utime statistics
>
> Add per-cgroup cpuacct controller statistics like the system and user
> time consumed by the group of tasks.
>
> Signed-off-by: Bharata B Rao <[email protected]>
> Signed-off-by: Balaji Rao <[email protected]>
> ---
> Documentation/cgroups/cpuacct.txt | 8 +++
> kernel/sched.c | 87 +++++++++++++++++++++++++++++++++++---
> 2 files changed, 89 insertions(+), 6 deletions(-)
>
> --- a/Documentation/cgroups/cpuacct.txt
> +++ b/Documentation/cgroups/cpuacct.txt
> @@ -30,3 +30,11 @@ The above steps create a new group g1 an
> process (bash) into it. CPU time consumed by this bash and its children
> can be obtained from g1/cpuacct.usage and the same is accumulated in
> /cgroups/cpuacct.usage also.
> +
> +cpuacct.stat file lists a few statistics which further divide the
> +CPU time obtained by the cgroup into user and system times. Currently
> +the following statistics are supported:
> +
> +utime: Time in milliseconds spent by tasks of the cgroup in user mode.
> +stime: Time in milliseconds spent by tasks of the cgroup in kernel mode.
> +
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -1393,10 +1393,22 @@ iter_move_one_task(struct rq *this_rq, i
> struct rq_iterator *iterator);
> #endif
>
> +/* Time spent by the tasks of the cpu accounting group executing in ... */
> +enum cpuacct_stat_index {
> + CPUACCT_STAT_UTIME, /* ... user mode */
> + CPUACCT_STAT_STIME, /* ... kernel mode */
> +
> + CPUACCT_STAT_NSTATS,
> +};
> +
> #ifdef CONFIG_CGROUP_CPUACCT
> static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, int val);
> #else
> static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, int val) {}
> #endif
>
> static inline void inc_cpu_load(struct rq *rq, unsigned long load)
> @@ -4182,6 +4194,8 @@ void account_user_time(struct task_struc
> cpustat->nice = cputime64_add(cpustat->nice, tmp);
> else
> cpustat->user = cputime64_add(cpustat->user, tmp);
> +
> + cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime_to_msecs(cputime));
> /* Account for user time used */
> acct_update_integrals(p);
> }
> @@ -4243,6 +4257,8 @@ void account_system_time(struct task_str
> else
> cpustat->system = cputime64_add(cpustat->system, tmp);
>
> + cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime_to_msecs(cputime));
> +
> /* Account for system time used */
> acct_update_integrals(p);
> }
> @@ -9438,6 +9454,7 @@ struct cpuacct {
> struct cgroup_subsys_state css;
> /* cpuusage holds pointer to a u64-type object on every cpu */
> u64 *cpuusage;
> + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
> struct cpuacct *parent;
> };
>
> @@ -9462,20 +9479,33 @@ static struct cgroup_subsys_state *cpuac
> struct cgroup_subsys *ss, struct cgroup *cgrp)
> {
> struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
> + int i;
>
> if (!ca)
> - return ERR_PTR(-ENOMEM);
> + goto out1;
>
> ca->cpuusage = alloc_percpu(u64);
> - if (!ca->cpuusage) {
> - kfree(ca);
> - return ERR_PTR(-ENOMEM);
> - }
> + if (!ca->cpuusage)
> + goto out2;
> +
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> + if (percpu_counter_init(&ca->cpustat[i], 0))
> + goto out3;
>
> if (cgrp->parent)
> ca->parent = cgroup_ca(cgrp->parent);
>
> return &ca->css;
> +
> +out3:
> + i--;
> + while (i-- >= 0)
> + percpu_counter_destroy(&ca->cpustat[i]);
> + free_percpu(ca->cpuusage);
> +out2:
> + kfree(ca);
> +out1:
> + return ERR_PTR(-ENOMEM);
> }
>
> /* destroy an existing cpu accounting group */
> @@ -9483,7 +9513,10 @@ static void
> cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
> {
> struct cpuacct *ca = cgroup_ca(cgrp);
> + int i;
>
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> + percpu_counter_destroy(&ca->cpustat[i]);
> free_percpu(ca->cpuusage);
> kfree(ca);
> }
> @@ -9570,6 +9603,28 @@ static int cpuacct_percpu_seq_read(struc
> return 0;
> }
>
> +static const struct cpuacct_stat_desc {
> + const char *msg;
> + u64 unit;
> +} cpuacct_stat_desc[] = {
> + [CPUACCT_STAT_UTIME] = { "utime", 1, },
> + [CPUACCT_STAT_STIME] = { "stime", 1, },
> +};
> +
> +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
> + struct cgroup_map_cb *cb)
> +{
> + struct cpuacct *ca = cgroup_ca(cgrp);
> + int i;
> +
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
> + s64 val = percpu_counter_read(&ca->cpustat[i]);
> + val *= cpuacct_stat_desc[i].unit;
> + cb->fill(cb, cpuacct_stat_desc[i].msg, val);
> + }
> + return 0;
> +}
> +
> static struct cftype files[] = {
> {
> .name = "usage",
> @@ -9580,7 +9635,10 @@ static struct cftype files[] = {
> .name = "usage_percpu",
> .read_seq_string = cpuacct_percpu_seq_read,
> },
> -
> + {
> + .name = "stat",
> + .read_map = cpuacct_stats_show,
> + },
> };
>
> static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> @@ -9610,6 +9668,23 @@ static void cpuacct_charge(struct task_s
> }
> }
>
> +/*
> + * Account the system/user time to the task's accounting group.
> + */
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, int val)
> +{
> + struct cpuacct *ca;
> +
> + if (!cpuacct_subsys.active)
> + return;
> +
> + ca = task_ca(tsk);
> +
> + for (; ca; ca = ca->parent)
> + percpu_counter_add(&ca->cpustat[idx], val);
> +}
> +
> struct cgroup_subsys cpuacct_subsys = {
> .name = "cpuacct",
> .create = cpuacct_create,
>

2009-03-11 01:15:52

by Li Zefan

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

> +static const struct cpuacct_stat_desc {
> + const char *msg;
> + u64 unit;
> +} cpuacct_stat_desc[] = {
> + [CPUACCT_STAT_UTIME] = { "utime", 1, },
> + [CPUACCT_STAT_STIME] = { "stime", 1, },
> +};
> +

I think we can just remove 'unit'.

> +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
> + struct cgroup_map_cb *cb)
> +{
> + struct cpuacct *ca = cgroup_ca(cgrp);
> + int i;
> +
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
> + s64 val = percpu_counter_read(&ca->cpustat[i]);
> + val *= cpuacct_stat_desc[i].unit;
> + cb->fill(cb, cpuacct_stat_desc[i].msg, val);
> + }
> + return 0;
> +}
> +
...
> +/*
> + * Account the system/user time to the task's accounting group.
> + */
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, int val)
> +{
> + struct cpuacct *ca;
> +
> + if (!cpuacct_subsys.active)

if (unlikely(!cpuacct_subsys.active)))

> + return;
> +
> + ca = task_ca(tsk);
> +
> + for (; ca; ca = ca->parent)
> + percpu_counter_add(&ca->cpustat[idx], val);
> +}
> +

We can reduce one NULL check:

do {
percpu_counter_add();
ca = ca->parent;
} while (ca);

2009-03-11 08:53:10

by Bharata B Rao

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

On Wed, Mar 11, 2009 at 09:38:12AM +0900, KAMEZAWA Hiroyuki wrote:
> On Tue, 10 Mar 2009 18:12:08 +0530
> Bharata B Rao <[email protected]> wrote:
>
> > Hi,
> >
> > Based on the comments received during my last post
> > (http://lkml.org/lkml/2009/2/25/129), here is a fresh attempt
> > to get per-cgroup utime/stime statistics as part of cpuacct controller.
> >
> > This patch adds a new file cpuacct.stat which displays two stats:
> > utime and stime. I wasn't too sure about the usefulness of providing
> > per-cgroup guest and steal times and hence not including them here.
> >
> > Note that I am using percpu_counter for collecting these two stats.
> > Since percpu_counter subsystem doesn't protect the readside, readers could
> > theoritically obtain incorrect values for these stats on 32bit systems.
>
> Using percpu_counter_read() means that .. but is it okay to ignore "batch"
> number ? (see FBC_BATCH)

I would think it might be ok with the understanding that read is not
a frequent operation. The default value of percpu_counter_batch is 32.
Ideally it should have been possible to set this value independently
for each percpu_counter. That way, users could have chosen an appropriate
batch value for their counter based on the usage pattern of their
counters.

>
>
> > I hope occasional wrong values is not too much of a concern for
> > statistics like this. If it is a problem, we have to either fix
> > percpu_counter or do it all by ourselves as Kamezawa attempted
> > for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14)
> >
> Hmm, percpu_counter_sum() is bad ?

It is slow and it doesn't do exactly what we want. It just adds the
32bit percpu counters to the global 64bit counter under lock and returns
the result. But it doesn't clear the 32bit percpu counters after accumulating
them in the 64bit counter.

If it is ok to be a bit slower on the read side, we could have something
like percpu_counter_read_slow() which would do what percpu_counter_sum()
does and in addition clear the 32bit percpu counters. Will this be
acceptable ? It slows down the read side, but will give accurate count.
This might slow down the write side also(due to contention b/n readers
and writers), but I guess due to batching the effect might not be too
pronounced. Should we be going this way ?

>
> BTW, I'm not sure but don't we need special handling if
> CONFIG_VIRT_CPU_ACCOUNTING=y ?

AFAICS no. Architectures which define CONFIG_VIRT_CPU_ACCOUNTING end up calling
account_{system,user}_time() where we have placed our hooks for
cpuacct charging. So even on such architectures we should be able to
get correct per-cgroup stime and utime.

Regards,
Bharata.

2009-03-11 08:55:37

by Bharata B Rao

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

On Wed, Mar 11, 2009 at 09:15:48AM +0800, Li Zefan wrote:
> > +static const struct cpuacct_stat_desc {
> > + const char *msg;
> > + u64 unit;
> > +} cpuacct_stat_desc[] = {
> > + [CPUACCT_STAT_UTIME] = { "utime", 1, },
> > + [CPUACCT_STAT_STIME] = { "stime", 1, },
> > +};
> > +
>
> I think we can just remove 'unit'.

Ok, they are redundant. Will do.

>
> > +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
> > + struct cgroup_map_cb *cb)
> > +{
> > + struct cpuacct *ca = cgroup_ca(cgrp);
> > + int i;
> > +
> > + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
> > + s64 val = percpu_counter_read(&ca->cpustat[i]);
> > + val *= cpuacct_stat_desc[i].unit;
> > + cb->fill(cb, cpuacct_stat_desc[i].msg, val);
> > + }
> > + return 0;
> > +}
> > +
> ...
> > +/*
> > + * Account the system/user time to the task's accounting group.
> > + */
> > +static void cpuacct_update_stats(struct task_struct *tsk,
> > + enum cpuacct_stat_index idx, int val)
> > +{
> > + struct cpuacct *ca;
> > +
> > + if (!cpuacct_subsys.active)
>
> if (unlikely(!cpuacct_subsys.active)))

Ok.

>
> > + return;
> > +
> > + ca = task_ca(tsk);
> > +
> > + for (; ca; ca = ca->parent)
> > + percpu_counter_add(&ca->cpustat[idx], val);
> > +}
> > +
>
> We can reduce one NULL check:
>
> do (
> percpu_counter_add();
> ca = ca->parent;
> } while (ca);

Sure. Will address all these in the next iteration.

Regards,
Bharata.

2009-03-11 09:14:42

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

On Wed, 11 Mar 2009 14:23:16 +0530
Bharata B Rao <[email protected]> wrote:

> On Wed, Mar 11, 2009 at 09:38:12AM +0900, KAMEZAWA Hiroyuki wrote:
> > On Tue, 10 Mar 2009 18:12:08 +0530
> > Bharata B Rao <[email protected]> wrote:
> >
> > > Hi,
> > >
> > > Based on the comments received during my last post
> > > (http://lkml.org/lkml/2009/2/25/129), here is a fresh attempt
> > > to get per-cgroup utime/stime statistics as part of cpuacct controller.
> > >
> > > This patch adds a new file cpuacct.stat which displays two stats:
> > > utime and stime. I wasn't too sure about the usefulness of providing
> > > per-cgroup guest and steal times and hence not including them here.
> > >
> > > Note that I am using percpu_counter for collecting these two stats.
> > > Since percpu_counter subsystem doesn't protect the readside, readers could
> > > theoritically obtain incorrect values for these stats on 32bit systems.
> >
> > Using percpu_counter_read() means that .. but is it okay to ignore "batch"
> > number ? (see FBC_BATCH)
>
> I would think it might be ok with the understanding that read is not
> a frequent operation. The default value of percpu_counter_batch is 32.
> Ideally it should have been possible to set this value independently
> for each percpu_counter. That way, users could have chosen an appropriate
> batch value for their counter based on the usage pattern of their
> counters.
>
Hmm, in my point of view, stime/utime's unit is mili second and it's enough
big to be expected as "correct" value.
If read is not frequent, I love precise value.


> >
> >
> > > I hope occasional wrong values is not too much of a concern for
> > > statistics like this. If it is a problem, we have to either fix
> > > percpu_counter or do it all by ourselves as Kamezawa attempted
> > > for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14)
> > >
> > Hmm, percpu_counter_sum() is bad ?
>
> It is slow and it doesn't do exactly what we want. It just adds the
> 32bit percpu counters to the global 64bit counter under lock and returns
> the result. But it doesn't clear the 32bit percpu counters after accummulating
> them in the 64bit counter.
>
> If it is ok to be a bit slower on the read side, we could have something
> like percpu_counter_read_slow() which would do what percpu_counter_sum()
> does and in addition clear the 32bit percpu counters. Will this be
> acceptable ? It slows down the read side, but will give accurate count.
> This might slow down the write side also(due to contention b/n readers
> and writers), but I guess due to batching the effect might not be too
> pronounced. Should we be going this way ?
>
I like precise one. Maybe measuring overhead and comparing them and making a
decision is a usual way to go.
This accounting is once-a-tick event. (right?) So, how about measuring read-side
over head ?

> >
> > BTW, I'm not sure but don't we need special handling if
> > CONFIG_VIRT_CPU_ACCOUNTING=y ?
>
> AFAICS no. Architectures which define CONFIG_VIRT_CPU_ACCOUNTING end up calling
> account_{system,user}_time() where we have placed our hooks for
> cpuacct charging. So even on such architectures we should be able to
> get correct per-cgroup stime and utime.
>
ok,

Thanks,
-Kame

2009-03-11 15:17:33

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

* Bharata B Rao <[email protected]> [2009-03-10 18:12:08]:

> Hi,
>
> Based on the comments received during my last post
> (http://lkml.org/lkml/2009/2/25/129), here is a fresh attempt
> to get per-cgroup utime/stime statistics as part of cpuacct controller.
>
> This patch adds a new file cpuacct.stat which displays two stats:
> utime and stime. I wasn't too sure about the usefulness of providing
> per-cgroup guest and steal times and hence not including them here.
>
> Note that I am using percpu_counter for collecting these two stats.
> Since percpu_counter subsystem doesn't protect the readside, readers could
> theoritically obtain incorrect values for these stats on 32bit systems.
> I hope occasional wrong values is not too much of a concern for
> statistics like this. If it is a problem, we have to either fix
> percpu_counter or do it all by ourselves as Kamezawa attempted
> for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14)
>
> Regards,
> Bharata.
>
> cpuacct: Add stime and utime statistics
>
> Add per-cgroup cpuacct controller statistics like the system and user
> time consumed by the group of tasks.
>
> Signed-off-by: Bharata B Rao <[email protected]>
> Signed-off-by: Balaji Rao <[email protected]>
> ---
> Documentation/cgroups/cpuacct.txt | 8 +++
> kernel/sched.c | 87 +++++++++++++++++++++++++++++++++++---
> 2 files changed, 89 insertions(+), 6 deletions(-)
>
> --- a/Documentation/cgroups/cpuacct.txt
> +++ b/Documentation/cgroups/cpuacct.txt
> @@ -30,3 +30,11 @@ The above steps create a new group g1 an
> process (bash) into it. CPU time consumed by this bash and its children
> can be obtained from g1/cpuacct.usage and the same is accumulated in
> /cgroups/cpuacct.usage also.
> +
> +cpuacct.stat file lists a few statistics which further divide the
> +CPU time obtained by the cgroup into user and system times. Currently
> +the following statistics are supported:
> +
> +utime: Time in milliseconds spent by tasks of the cgroup in user mode.
> +stime: Time in milliseconds spent by tasks of the cgroup in kernel mode.
> +

Hi, Bharata,

I did a quick run of the patch on my machine. The patch applied and
compile cleanly, here are a few comments?

1. We could consider enhancing the patch to account for irq, softirq,
etc time like cpustat does. Not right away, but iteratively
2. The accounting is converted to milliseconds, I would much rather
export it in cputime to be consistent with other cpu accounting.
I remember we used to return nanosecond accurate accounting and then
moved to cputime based accounting for cpuacct.
3. How do we deal with CPU hotplug. Since we use a per-cpu counter,
any hotplug would mean that the data related to the offlined CPU is
lost. That is how the current CPU accounting system seems to work.

> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -1393,10 +1393,22 @@ iter_move_one_task(struct rq *this_rq, i
> struct rq_iterator *iterator);
> #endif
>
> +/* Time spent by the tasks of the cpu accounting group executing in ... */
> +enum cpuacct_stat_index {
> + CPUACCT_STAT_UTIME, /* ... user mode */
> + CPUACCT_STAT_STIME, /* ... kernel mode */
> +
> + CPUACCT_STAT_NSTATS,
> +};
> +
> #ifdef CONFIG_CGROUP_CPUACCT
> static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, int val);
> #else
> static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, int val) {}
> #endif
>
> static inline void inc_cpu_load(struct rq *rq, unsigned long load)
> @@ -4182,6 +4194,8 @@ void account_user_time(struct task_struc
> cpustat->nice = cputime64_add(cpustat->nice, tmp);
> else
> cpustat->user = cputime64_add(cpustat->user, tmp);
> +
> + cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime_to_msecs(cputime));
> /* Account for user time used */
> acct_update_integrals(p);
> }
> @@ -4243,6 +4257,8 @@ void account_system_time(struct task_str
> else
> cpustat->system = cputime64_add(cpustat->system, tmp);
>
> + cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime_to_msecs(cputime));
> +
> /* Account for system time used */
> acct_update_integrals(p);
> }
> @@ -9438,6 +9454,7 @@ struct cpuacct {
> struct cgroup_subsys_state css;
> /* cpuusage holds pointer to a u64-type object on every cpu */
> u64 *cpuusage;
> + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
> struct cpuacct *parent;
> };
>
> @@ -9462,20 +9479,33 @@ static struct cgroup_subsys_state *cpuac
> struct cgroup_subsys *ss, struct cgroup *cgrp)
> {
> struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
> + int i;
>
> if (!ca)
> - return ERR_PTR(-ENOMEM);
> + goto out1;
>
> ca->cpuusage = alloc_percpu(u64);
> - if (!ca->cpuusage) {
> - kfree(ca);
> - return ERR_PTR(-ENOMEM);
> - }
> + if (!ca->cpuusage)
> + goto out2;
> +
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> + if (percpu_counter_init(&ca->cpustat[i], 0))
> + goto out3;
>
> if (cgrp->parent)
> ca->parent = cgroup_ca(cgrp->parent);
>
> return &ca->css;
> +
> +out3:
> + i--;
> + while (i-- >= 0)
> + percpu_counter_destroy(&ca->cpustat[i]);
> + free_percpu(ca->cpuusage);
> +out2:
> + kfree(ca);
> +out1:
> + return ERR_PTR(-ENOMEM);
> }
>

Don't like out* as labels, please let us have more meaningful labels.

> /* destroy an existing cpu accounting group */
> @@ -9483,7 +9513,10 @@ static void
> cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
> {
> struct cpuacct *ca = cgroup_ca(cgrp);
> + int i;
>
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> + percpu_counter_destroy(&ca->cpustat[i]);
> free_percpu(ca->cpuusage);
> kfree(ca);
> }
> @@ -9570,6 +9603,28 @@ static int cpuacct_percpu_seq_read(struc
> return 0;
> }
>
> +static const struct cpuacct_stat_desc {
> + const char *msg;
> + u64 unit;
> +} cpuacct_stat_desc[] = {
> + [CPUACCT_STAT_UTIME] = { "utime", 1, },
> + [CPUACCT_STAT_STIME] = { "stime", 1, },
> +};
> +
> +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
> + struct cgroup_map_cb *cb)
> +{
> + struct cpuacct *ca = cgroup_ca(cgrp);
> + int i;
> +
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
> + s64 val = percpu_counter_read(&ca->cpustat[i]);
> + val *= cpuacct_stat_desc[i].unit;
> + cb->fill(cb, cpuacct_stat_desc[i].msg, val);
> + }
> + return 0;
> +}
> +
> static struct cftype files[] = {
> {
> .name = "usage",
> @@ -9580,7 +9635,10 @@ static struct cftype files[] = {
> .name = "usage_percpu",
> .read_seq_string = cpuacct_percpu_seq_read,
> },
> -
> + {
> + .name = "stat",
> + .read_map = cpuacct_stats_show,
> + },
> };
>
> static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> @@ -9610,6 +9668,23 @@ static void cpuacct_charge(struct task_s
> }
> }
>
> +/*
> + * Account the system/user time to the task's accounting group.
> + */
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, int val)
> +{
> + struct cpuacct *ca;
> +
> + if (!cpuacct_subsys.active)
> + return;
> +
> + ca = task_ca(tsk);
> +
> + for (; ca; ca = ca->parent)
> + percpu_counter_add(&ca->cpustat[idx], val);
> +}
> +
> struct cgroup_subsys cpuacct_subsys = {
> .name = "cpuacct",
> .create = cpuacct_create,
>

--
Balbir

2009-03-11 15:34:55

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

* KAMEZAWA Hiroyuki <[email protected]> [2009-03-11 09:38:12]:

> On Tue, 10 Mar 2009 18:12:08 +0530
> Bharata B Rao <[email protected]> wrote:
>
> > Hi,
> >
> > Based on the comments received during my last post
> > (http://lkml.org/lkml/2009/2/25/129), here is a fresh attempt
> > to get per-cgroup utime/stime statistics as part of cpuacct controller.
> >
> > This patch adds a new file cpuacct.stat which displays two stats:
> > utime and stime. I wasn't too sure about the usefulness of providing
> > per-cgroup guest and steal times and hence not including them here.
> >
> > Note that I am using percpu_counter for collecting these two stats.
> > Since percpu_counter subsystem doesn't protect the readside, readers could
> > theoritically obtain incorrect values for these stats on 32bit systems.
>
> Using percpu_counter_read() means that .. but is it okay to ignore "batch"
> number ? (see FBC_BATCH)
>

FBC_BATCH? Thats gone..no? We have dynamic batches now, IIRC. Could
you please elaborate on your comment?

>
> > I hope occasional wrong values is not too much of a concern for
> > statistics like this. If it is a problem, we have to either fix
> > percpu_counter or do it all by ourselves as Kamezawa attempted
> > for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14)
> >
> Hmm, percpu_counter_sum() is bad ?
>

Yes, but we need to sum somewhere.. user space summing will not be
atomic, we'll get several snapshots of per CPU data and summing it
might not yield the correct answers.

> BTW, I'm not sure but don't we need special handling if
> CONFIG_VIRT_CPU_ACCOUNTING=y ?

Good point. Bharata, with CONFIG_VIRT_CPU_ACCOUNTING, utime and stime
is accounted for within the architecture.

--
Balbir

2009-03-11 16:44:39

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

* Balbir Singh <[email protected]> [2009-03-11 20:46:57]:

> * Bharata B Rao <[email protected]> [2009-03-10 18:12:08]:
>
> > Hi,
> >
> > Based on the comments received during my last post
> > (http://lkml.org/lkml/2009/2/25/129), here is a fresh attempt
> > to get per-cgroup utime/stime statistics as part of cpuacct controller.
> >
> > This patch adds a new file cpuacct.stat which displays two stats:
> > utime and stime. I wasn't too sure about the usefulness of providing
> > per-cgroup guest and steal times and hence not including them here.
> >
> > Note that I am using percpu_counter for collecting these two stats.
> > Since percpu_counter subsystem doesn't protect the readside, readers could
> > theoritically obtain incorrect values for these stats on 32bit systems.
> > I hope occasional wrong values is not too much of a concern for
> > statistics like this. If it is a problem, we have to either fix
> > percpu_counter or do it all by ourselves as Kamezawa attempted
> > for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14)
> >
> > Regards,
> > Bharata.
> >
> > cpuacct: Add stime and utime statistics
> >
> > Add per-cgroup cpuacct controller statistics like the system and user
> > time consumed by the group of tasks.
> >
> > Signed-off-by: Bharata B Rao <[email protected]>
> > Signed-off-by: Balaji Rao <[email protected]>
> > ---
> > Documentation/cgroups/cpuacct.txt | 8 +++
> > kernel/sched.c | 87 +++++++++++++++++++++++++++++++++++---
> > 2 files changed, 89 insertions(+), 6 deletions(-)
> >
> > --- a/Documentation/cgroups/cpuacct.txt
> > +++ b/Documentation/cgroups/cpuacct.txt
> > @@ -30,3 +30,11 @@ The above steps create a new group g1 an
> > process (bash) into it. CPU time consumed by this bash and its children
> > can be obtained from g1/cpuacct.usage and the same is accumulated in
> > /cgroups/cpuacct.usage also.
> > +
> > +cpuacct.stat file lists a few statistics which further divide the
> > +CPU time obtained by the cgroup into user and system times. Currently
> > +the following statistics are supported:
> > +
> > +utime: Time in milliseconds spent by tasks of the cgroup in user mode.
> > +stime: Time in milliseconds spent by tasks of the cgroup in kernel mode.
> > +
>
> Hi, Bharata,
>
> I did a quick run of the patch on my machine. The patch applied and
> compile cleanly, here are a few comments?
>
> 1. We could consider enhancing the patch to account for irq, softirq,
> etc time like cpustat does. Not right away, but iteratively
> 2. The accounting is converted to milliseconds, I would much rather
> export it in cputime to be consistent with other cpu accounting.
> I remember we used to return nanosecond accurate accounting and then
> moved to cputime based accounting for cpuacct.
> 3. How do we deal with CPU hotplug. Since we use a per-cpu counter,
> any hotplug would mean that the data related to the offlined CPU is
> lost. That is how the current CPU accounting system seems to work.
>

Bharata,

I tried the diff below and saw what I was looking for

diff --git a/kernel/sched.c b/kernel/sched.c
index 015155d..fadd17f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4195,7 +4195,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
else
cpustat->user = cputime64_add(cpustat->user, tmp);

- cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime_to_msecs(cputime));
+ cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
@@ -4257,7 +4257,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
cpustat->system = cputime64_add(cpustat->system, tmp);

- cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime_to_msecs(cputime));
+ cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime);

/* Account for system time used */
acct_update_integrals(p);
@@ -9611,6 +9611,7 @@ static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
s64 val = percpu_counter_read(&ca->cpustat[i]);
val *= cpuacct_stat_desc[i].unit;
+ val = cputime_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[i].msg, val);
}
return 0;

1. The data is returned in clock_t
2. CPU hotplug seems to be already correctly handled by per_cpu
counters


--
Balbir

2009-03-12 00:15:45

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

On Wed, 11 Mar 2009 21:04:34 +0530
Balbir Singh <[email protected]> wrote:
> > > This patch adds a new file cpuacct.stat which displays two stats:
> > > utime and stime. I wasn't too sure about the usefulness of providing
> > > per-cgroup guest and steal times and hence not including them here.
> > >
> > > Note that I am using percpu_counter for collecting these two stats.
> > > Since percpu_counter subsystem doesn't protect the readside, readers could
> > theoretically obtain incorrect values for these stats on 32bit systems.
> >
> > Using percpu_counter_read() means that .. but is it okay to ignore "batch"
> > number ? (see FBC_BATCH)
> >
>
> FBC_BATCH? Thats gone..no? We have dynamic batches now, IIRC. Could
> you please elaborate on your comment?
>
Ah, ok. it's now extern int percpu_counter_batch.

-Kame

> >
> > > I hope occasional wrong values are not too much of a concern for
> > > statistics like this. If it is a problem, we have to either fix
> > > percpu_counter or do it all by ourselves as Kamezawa attempted
> > > for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14)
> > >
> > Hmm, percpu_counter_sum() is bad ?
> >
>
> Yes, but we need to sum somewhere.. user space summing will not be
> atomic, we'll get several snapshots of per CPU data and summing it
> might not yield the correct answers.
>
> > BTW, I'm not sure but don't we need special handling if
> > CONFIG_VIRT_CPU_ACCOUNTING=y ?
>
> Good point. Bharata, with CONFIG_VIRT_CPU_ACCOUNTING, utime and stime
> is accounted for within the architecture.
>
> --
> Balbir
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2009-03-12 04:29:40

by Bharata B Rao

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

On Wed, Mar 11, 2009 at 09:04:34PM +0530, Balbir Singh wrote:
> * KAMEZAWA Hiroyuki <[email protected]> [2009-03-11 09:38:12]:
>
> > BTW, I'm not sure but don't we need special handling if
> > CONFIG_VIRT_CPU_ACCOUNTING=y ?
>
> Good point. Bharata, with CONFIG_VIRT_CPU_ACCOUNTING, utime and stime
> is accounted for within the architecture.

True, but as I replied to Kamezawa in the other thread, these
architectures are still dependent on generic implementations of
account_{system,user}_time() which feed to per-process and system-wide
accounting. And this is where we have hooks for per-cgroup accounting.
So I don't see why we need to handle these archs specially. Do I miss
something ?

Regards,
Bharata.

2009-03-12 04:35:38

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

* Bharata B Rao <[email protected]> [2009-03-12 09:59:51]:

> On Wed, Mar 11, 2009 at 09:04:34PM +0530, Balbir Singh wrote:
> > * KAMEZAWA Hiroyuki <[email protected]> [2009-03-11 09:38:12]:
> >
> > > BTW, I'm not sure but don't we need special handling if
> > > CONFIG_VIRT_CPU_ACCOUNTING=y ?
> >
> > Good point. Bharata, with CONFIG_VIRT_CPU_ACCOUNTING, utime and stime
> > is accounted for within the architecture.
>
> True, but as I replied to Kamezawa in the other thread, these
> architectures are still dependent on generic implementations of
> account_{system,user}_time() which feed to per-process and system-wide
> accounting. And this is where we have hooks for per-cgroup accounting.

Fair enough...

> So I don't see why we need to handle these archs specially. Do I miss
> something ?

May be we should cc the patch to linux-arch as well to make sure
we handled all architectures correctly.

--
Balbir

2009-03-12 04:44:21

by Bharata B Rao

[permalink] [raw]
Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1

On Wed, Mar 11, 2009 at 08:46:57PM +0530, Balbir Singh wrote:
> * Bharata B Rao <[email protected]> [2009-03-10 18:12:08]:
>
> Hi, Bharata,
>
> I did a quick run of the patch on my machine. The patch applied and
> compiled cleanly; here are a few comments:
>
> 1. We could consider enhancing the patch to account for irq, softirq,
> etc time like cpustat does. Not right away, but iteratively

Yes that should not be a problem. We can add those easily if needed.

> 2. The accounting is converted to milliseconds, I would much rather
> export it in cputime to be consistent with other cpu accounting.
> I remember we used to return nanosecond accurate accounting and then
> moved to cputime based accounting for cpuacct.

Ok had thought about it but was hesitant to do because that makes
percpu_counters to work directly on cputime_t type. But I guess that is fine.

> 3. How do we deal with CPU hotplug. Since we use a per-cpu counter,
> any hotplug would mean that the data related to the offlined CPU is
> lost. That is how the current CPU accounting system seems to work.

Not a problem as percpu_counter handles hotplug events as you noted
in the other thread.

> >
> > @@ -9462,20 +9479,33 @@ static struct cgroup_subsys_state *cpuac
> > struct cgroup_subsys *ss, struct cgroup *cgrp)
> > {
> > struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
> > + int i;
> >
> > if (!ca)
> > - return ERR_PTR(-ENOMEM);
> > + goto out1;
> >
> > ca->cpuusage = alloc_percpu(u64);
> > - if (!ca->cpuusage) {
> > - kfree(ca);
> > - return ERR_PTR(-ENOMEM);
> > - }
> > + if (!ca->cpuusage)
> > + goto out2;
> > +
> > + for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> > + if (percpu_counter_init(&ca->cpustat[i], 0))
> > + goto out3;
> >
> > if (cgrp->parent)
> > ca->parent = cgroup_ca(cgrp->parent);
> >
> > return &ca->css;
> > +
> > +out3:
> > + i--;
> > + while (i-- >= 0)
> > + percpu_counter_destroy(&ca->cpustat[i]);
> > + free_percpu(ca->cpuusage);
> > +out2:
> > + kfree(ca);
> > +out1:
> > + return ERR_PTR(-ENOMEM);
> > }
> >
>
> Don't like out* as labels, please let us have more meaningful labels.

Ok, I can change them if you don't like out* labels :)

Regards,
Bharata.