Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754503AbZCKAjt (ORCPT ); Tue, 10 Mar 2009 20:39:49 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753263AbZCKAjk (ORCPT ); Tue, 10 Mar 2009 20:39:40 -0400 Received: from fgwmail7.fujitsu.co.jp ([192.51.44.37]:33269 "EHLO fgwmail7.fujitsu.co.jp" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751002AbZCKAji (ORCPT ); Tue, 10 Mar 2009 20:39:38 -0400 Date: Wed, 11 Mar 2009 09:38:12 +0900 From: KAMEZAWA Hiroyuki To: bharata@linux.vnet.ibm.com Cc: linux-kernel@vger.kernel.org, Balaji Rao , Dhaval Giani , Balbir Singh , Li Zefan , Paul Menage , Andrew Morton , Ingo Molnar , Peter Zijlstra Subject: Re: [RFC PATCH] cpuacct: per-cgroup utime/stime statistics - v1 Message-Id: <20090311093812.298a0b21.kamezawa.hiroyu@jp.fujitsu.com> In-Reply-To: <20090310124208.GC3902@in.ibm.com> References: <20090310124208.GC3902@in.ibm.com> Organization: FUJITSU Co. LTD. X-Mailer: Sylpheed 2.5.0 (GTK+ 2.10.14; i686-pc-mingw32) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 7187 Lines: 238 On Tue, 10 Mar 2009 18:12:08 +0530 Bharata B Rao wrote: > Hi, > > Based on the comments received during my last post > (http://lkml.org/lkml/2009/2/25/129), here is a fresh attempt > to get per-cgroup utime/stime statistics as part of cpuacct controller. > > This patch adds a new file cpuacct.stat which displays two stats: > utime and stime. I wasn't too sure about the usefulness of providing > per-cgroup guest and steal times and hence not including them here. > > Note that I am using percpu_counter for collecting these two stats. > Since percpu_counter subsystem doesn't protect the readside, readers could > theoretically obtain incorrect values for these stats on 32bit systems. Using percpu_counter_read() means that .. 
but is it okay to ignore the "batch" number? (see FBC_BATCH) > I hope occasional wrong values are not too much of a concern for > statistics like this. If it is a problem, we have to either fix > percpu_counter or do it all by ourselves as Kamezawa attempted > for cpuacct.usage (http://lkml.org/lkml/2009/3/4/14) > Hmm, is percpu_counter_sum() bad? BTW, I'm not sure but don't we need special handling if CONFIG_VIRT_CPU_ACCOUNTING=y? Thanks, -Kame > Regards, > Bharata. > > cpuacct: Add stime and utime statistics > > Add per-cgroup cpuacct controller statistics like the system and user > time consumed by the group of tasks. > > Signed-off-by: Bharata B Rao > Signed-off-by: Balaji Rao > --- > Documentation/cgroups/cpuacct.txt | 8 +++ > kernel/sched.c | 87 +++++++++++++++++++++++++++++++++++--- > 2 files changed, 89 insertions(+), 6 deletions(-) > > --- a/Documentation/cgroups/cpuacct.txt > +++ b/Documentation/cgroups/cpuacct.txt > @@ -30,3 +30,11 @@ The above steps create a new group g1 an > process (bash) into it. CPU time consumed by this bash and its children > can be obtained from g1/cpuacct.usage and the same is accumulated in > /cgroups/cpuacct.usage also. > + > +cpuacct.stat file lists a few statistics which further divide the > +CPU time obtained by the cgroup into user and system times. Currently > +the following statistics are supported: > + > +utime: Time in milliseconds spent by tasks of the cgroup in user mode. > +stime: Time in milliseconds spent by tasks of the cgroup in kernel mode. > + > --- a/kernel/sched.c > +++ b/kernel/sched.c > @@ -1393,10 +1393,22 @@ iter_move_one_task(struct rq *this_rq, i > struct rq_iterator *iterator); > #endif > > +/* Time spent by the tasks of the cpu accounting group executing in ... */ > +enum cpuacct_stat_index { > + CPUACCT_STAT_UTIME, /* ... user mode */ > + CPUACCT_STAT_STIME, /* ... 
kernel mode */ > + > + CPUACCT_STAT_NSTATS, > +}; > + > #ifdef CONFIG_CGROUP_CPUACCT > static void cpuacct_charge(struct task_struct *tsk, u64 cputime); > +static void cpuacct_update_stats(struct task_struct *tsk, > + enum cpuacct_stat_index idx, int val); > #else > static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} > +static void cpuacct_update_stats(struct task_struct *tsk, > + enum cpuacct_stat_index idx, int val) {} > #endif > > static inline void inc_cpu_load(struct rq *rq, unsigned long load) > @@ -4182,6 +4194,8 @@ void account_user_time(struct task_struc > cpustat->nice = cputime64_add(cpustat->nice, tmp); > else > cpustat->user = cputime64_add(cpustat->user, tmp); > + > + cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime_to_msecs(cputime)); > /* Account for user time used */ > acct_update_integrals(p); > } > @@ -4243,6 +4257,8 @@ void account_system_time(struct task_str > else > cpustat->system = cputime64_add(cpustat->system, tmp); > > + cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime_to_msecs(cputime)); > + > /* Account for system time used */ > acct_update_integrals(p); > } > @@ -9438,6 +9454,7 @@ struct cpuacct { > struct cgroup_subsys_state css; > /* cpuusage holds pointer to a u64-type object on every cpu */ > u64 *cpuusage; > + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; > struct cpuacct *parent; > }; > > @@ -9462,20 +9479,33 @@ static struct cgroup_subsys_state *cpuac > struct cgroup_subsys *ss, struct cgroup *cgrp) > { > struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); > + int i; > > if (!ca) > - return ERR_PTR(-ENOMEM); > + goto out1; > > ca->cpuusage = alloc_percpu(u64); > - if (!ca->cpuusage) { > - kfree(ca); > - return ERR_PTR(-ENOMEM); > - } > + if (!ca->cpuusage) > + goto out2; > + > + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) > + if (percpu_counter_init(&ca->cpustat[i], 0)) > + goto out3; > > if (cgrp->parent) > ca->parent = cgroup_ca(cgrp->parent); > > return &ca->css; > + > +out3: > + i--; > + 
while (i-- >= 0) > + percpu_counter_destroy(&ca->cpustat[i]); > + free_percpu(ca->cpuusage); > +out2: > + kfree(ca); > +out1: > + return ERR_PTR(-ENOMEM); > } > > /* destroy an existing cpu accounting group */ > @@ -9483,7 +9513,10 @@ static void > cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) > { > struct cpuacct *ca = cgroup_ca(cgrp); > + int i; > > + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) > + percpu_counter_destroy(&ca->cpustat[i]); > free_percpu(ca->cpuusage); > kfree(ca); > } > @@ -9570,6 +9603,28 @@ static int cpuacct_percpu_seq_read(struc > return 0; > } > > +static const struct cpuacct_stat_desc { > + const char *msg; > + u64 unit; > +} cpuacct_stat_desc[] = { > + [CPUACCT_STAT_UTIME] = { "utime", 1, }, > + [CPUACCT_STAT_STIME] = { "stime", 1, }, > +}; > + > +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, > + struct cgroup_map_cb *cb) > +{ > + struct cpuacct *ca = cgroup_ca(cgrp); > + int i; > + > + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { > + s64 val = percpu_counter_read(&ca->cpustat[i]); > + val *= cpuacct_stat_desc[i].unit; > + cb->fill(cb, cpuacct_stat_desc[i].msg, val); > + } > + return 0; > +} > + > static struct cftype files[] = { > { > .name = "usage", > @@ -9580,7 +9635,10 @@ static struct cftype files[] = { > .name = "usage_percpu", > .read_seq_string = cpuacct_percpu_seq_read, > }, > - > + { > + .name = "stat", > + .read_map = cpuacct_stats_show, > + }, > }; > > static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) > @@ -9610,6 +9668,23 @@ static void cpuacct_charge(struct task_s > } > } > > +/* > + * Account the system/user time to the task's accounting group. 
> + */ > +static void cpuacct_update_stats(struct task_struct *tsk, > + enum cpuacct_stat_index idx, int val) > +{ > + struct cpuacct *ca; > + > + if (!cpuacct_subsys.active) > + return; > + > + ca = task_ca(tsk); > + > + for (; ca; ca = ca->parent) > + percpu_counter_add(&ca->cpustat[idx], val); > +} > + > struct cgroup_subsys cpuacct_subsys = { > .name = "cpuacct", > .create = cpuacct_create, > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/