2009-03-12 11:09:25

by Bharata B Rao

Subject: [RFC PATCH -tip] cpuacct: per-cgroup utime/stime statistics - v2

Here is the 2nd version of the cpuacct statistics patch. Copying
linux-arch list also this time to check if archs which define
CONFIG_VIRT_CPU_ACCOUNTING are ok with this change.

Changes for v2:
- stime and utime now exported in clock_t units instead of msecs.
- Addressed the code review comments from Balbir and Li Zefan.
- Moved to -tip tree.

v1:
- http://lkml.org/lkml/2009/3/10/150


cpuacct: Add stime and utime statistics

Add per-cgroup cpuacct controller statistics like the system and user
time consumed by the group of tasks.

Signed-off-by: Bharata B Rao <[email protected]>
Signed-off-by: Balaji Rao <[email protected]>
---
Documentation/cgroups/cpuacct.txt | 17 +++++++
kernel/sched.c | 86 +++++++++++++++++++++++++++++++++++---
2 files changed, 97 insertions(+), 6 deletions(-)

--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,20 @@ The above steps create a new group g1 an
process (bash) into it. CPU time consumed by this bash and its children
can be obtained from g1/cpuacct.usage and the same is accumulated in
/cgroups/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+utime: Time spent by tasks of the cgroup in user mode.
+stime: Time spent by tasks of the cgroup in kernel mode.
+
+utime and stime are in USER_HZ units.
+
+cpuacct controller uses percpu_counter interface to collect utime and
+stime. This causes two side effects:
+
+- It is theoretically possible to see wrong values for stime and utime.
+ This is because percpu_counter_read() on 32bit systems is broken.
+- It is possible to see slightly outdated values for stime and utime
+ due to the batch processing nature of percpu_counter.
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1434,10 +1434,22 @@ iter_move_one_task(struct rq *this_rq, i
struct rq_iterator *iterator);
#endif

+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_UTIME, /* ... user mode */
+ CPUACCT_STAT_STIME, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val) {}
#endif

static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4379,6 +4391,8 @@ void account_user_time(struct task_struc
cpustat->nice = cputime64_add(cpustat->nice, tmp);
else
cpustat->user = cputime64_add(cpustat->user, tmp);
+
+ cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
@@ -4440,6 +4454,8 @@ void account_system_time(struct task_str
else
cpustat->system = cputime64_add(cpustat->system, tmp);

+ cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime);
+
/* Account for system time used */
acct_update_integrals(p);
}
@@ -9723,6 +9739,7 @@ struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
+ struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};

@@ -9747,20 +9764,33 @@ static struct cgroup_subsys_state *cpuac
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ int i;

if (!ca)
- return ERR_PTR(-ENOMEM);
+ goto out;

ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage) {
- kfree(ca);
- return ERR_PTR(-ENOMEM);
- }
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ if (percpu_counter_init(&ca->cpustat[i], 0))
+ goto out_free_counters;

if (cgrp->parent)
ca->parent = cgroup_ca(cgrp->parent);

return &ca->css;
+
+out_free_counters:
+ i--;
+ while (i >= 0)
+ percpu_counter_destroy(&ca->cpustat[i--]);
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
}

/* destroy an existing cpu accounting group */
@@ -9768,7 +9798,10 @@ static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;

+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ percpu_counter_destroy(&ca->cpustat[i]);
free_percpu(ca->cpuusage);
kfree(ca);
}
@@ -9855,6 +9888,25 @@ static int cpuacct_percpu_seq_read(struc
return 0;
}

+static const char *cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_UTIME] = "utime",
+ [CPUACCT_STAT_STIME] = "stime",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+ s64 val = percpu_counter_read(&ca->cpustat[i]);
+ val = cputime_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[i], val);
+ }
+ return 0;
+}
+
static struct cftype files[] = {
{
.name = "usage",
@@ -9865,7 +9917,10 @@ static struct cftype files[] = {
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
},
-
+ {
+ .name = "stat",
+ .read_map = cpuacct_stats_show,
+ },
};

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9895,6 +9950,25 @@ static void cpuacct_charge(struct task_s
}
}

+/*
+ * Account the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val)
+{
+ struct cpuacct *ca;
+
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+
+ ca = task_ca(tsk);
+
+ do {
+ percpu_counter_add(&ca->cpustat[idx], val);
+ ca = ca->parent;
+ } while (ca);
+}
+
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.create = cpuacct_create,

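As a usage illustration (not part of the patch), here is a minimal user-space
sketch that reads the new cpuacct.stat file of the g1 group created in the
documentation example above. The /cgroups mount point and the group name are
assumptions taken from that example; the values are in USER_HZ as described.

/* Hypothetical reader for /cgroups/g1/cpuacct.stat (illustration only). */
#include <stdio.h>

int main(void)
{
	FILE *fp = fopen("/cgroups/g1/cpuacct.stat", "r");
	char name[32];
	long long val;

	if (!fp) {
		perror("fopen");
		return 1;
	}
	/* Each line is "<statistic> <value>", value in USER_HZ units. */
	while (fscanf(fp, "%31s %lld", name, &val) == 2)
		printf("%s = %lld\n", name, val);
	fclose(fp);
	return 0;
}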

2009-03-16 01:36:56

by Kamezawa Hiroyuki

Subject: Re: [RFC PATCH -tip] cpuacct: per-cgroup utime/stime statistics - v2

On Thu, 12 Mar 2009 16:39:24 +0530
Bharata B Rao <[email protected]> wrote:

> Here is the 2nd version of the cpuacct statistics patch. Copying
> linux-arch list also this time to check if archs which define
> CONFIG_VIRT_CPU_ACCOUNTING are ok with this change.
>
> Changes for v2:
> - stime and utime now exported in clock_t units instead of msecs.
> - Addressed the code review comments from Balbir and Li Zefan.
> - Moved to -tip tree.
>
> v1:
> - http://lkml.org/lkml/2009/3/10/150
>
>
> cpuacct: Add stime and utime statistics
>
> Add per-cgroup cpuacct controller statistics like the system and user
> time consumed by the group of tasks.
>
> Signed-off-by: Bharata B Rao <[email protected]>
> Signed-off-by: Balaji Rao <[email protected]>
> ---
> Documentation/cgroups/cpuacct.txt | 17 +++++++
> kernel/sched.c | 86 +++++++++++++++++++++++++++++++++++---
> 2 files changed, 97 insertions(+), 6 deletions(-)
>
> --- a/Documentation/cgroups/cpuacct.txt
> +++ b/Documentation/cgroups/cpuacct.txt
> @@ -30,3 +30,20 @@ The above steps create a new group g1 an
> process (bash) into it. CPU time consumed by this bash and its children
> can be obtained from g1/cpuacct.usage and the same is accumulated in
> /cgroups/cpuacct.usage also.
> +
> +cpuacct.stat file lists a few statistics which further divide the
> +CPU time obtained by the cgroup into user and system times. Currently
> +the following statistics are supported:
> +
> +utime: Time spent by tasks of the cgroup in user mode.
> +stime: Time spent by tasks of the cgroup in kernel mode.
> +
> +utime and stime are in USER_HZ units.
> +
> +cpuacct controller uses percpu_counter interface to collect utime and
> +stime. This causes two side effects:
> +
> +- It is theoretically possible to see wrong values for stime and utime.
> + This is because percpu_counter_read() on 32bit systems is broken.
> +- It is possible to see slightly outdated values for stime and utime
> + due to the batch processing nature of percpu_counter.
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -1434,10 +1434,22 @@ iter_move_one_task(struct rq *this_rq, i
> struct rq_iterator *iterator);
> #endif
>
> +/* Time spent by the tasks of the cpu accounting group executing in ... */
> +enum cpuacct_stat_index {
> + CPUACCT_STAT_UTIME, /* ... user mode */
> + CPUACCT_STAT_STIME, /* ... kernel mode */
> +
> + CPUACCT_STAT_NSTATS,
> +};
> +
> #ifdef CONFIG_CGROUP_CPUACCT
> static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, cputime_t val);
> #else
> static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, cputime_t val) {}
> #endif
"static inline"
if no "inline", the complier will show warning as
"this function is defined but not used.."




> static inline void inc_cpu_load(struct rq *rq, unsigned long load)
> @@ -4379,6 +4391,8 @@ void account_user_time(struct task_struc
> cpustat->nice = cputime64_add(cpustat->nice, tmp);
> else
> cpustat->user = cputime64_add(cpustat->user, tmp);
> +
> + cpuacct_update_stats(p, CPUACCT_STAT_UTIME, cputime);
> /* Account for user time used */
> acct_update_integrals(p);
> }
> @@ -4440,6 +4454,8 @@ void account_system_time(struct task_str
> else
> cpustat->system = cputime64_add(cpustat->system, tmp);
>
> + cpuacct_update_stats(p, CPUACCT_STAT_STIME, cputime);
> +
> /* Account for system time used */
> acct_update_integrals(p);
> }
> @@ -9723,6 +9739,7 @@ struct cpuacct {
> struct cgroup_subsys_state css;
> /* cpuusage holds pointer to a u64-type object on every cpu */
> u64 *cpuusage;
> + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
> struct cpuacct *parent;
> };
>
> @@ -9747,20 +9764,33 @@ static struct cgroup_subsys_state *cpuac
> struct cgroup_subsys *ss, struct cgroup *cgrp)
> {
> struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
> + int i;
>
> if (!ca)
> - return ERR_PTR(-ENOMEM);
> + goto out;
>
> ca->cpuusage = alloc_percpu(u64);
> - if (!ca->cpuusage) {
> - kfree(ca);
> - return ERR_PTR(-ENOMEM);
> - }
> + if (!ca->cpuusage)
> + goto out_free_ca;
> +
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> + if (percpu_counter_init(&ca->cpustat[i], 0))
> + goto out_free_counters;
>
> if (cgrp->parent)
> ca->parent = cgroup_ca(cgrp->parent);
>
> return &ca->css;
> +
> +out_free_counters:
> + i--;
> + while (i >= 0)
> + percpu_counter_destroy(&ca->cpustat[i--]);
> + free_percpu(ca->cpuusage);
> +out_free_ca:
> + kfree(ca);
> +out:
> + return ERR_PTR(-ENOMEM);
> }
>
> /* destroy an existing cpu accounting group */
> @@ -9768,7 +9798,10 @@ static void
> cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
> {
> struct cpuacct *ca = cgroup_ca(cgrp);
> + int i;
>
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> + percpu_counter_destroy(&ca->cpustat[i]);
> free_percpu(ca->cpuusage);
> kfree(ca);
> }
> @@ -9855,6 +9888,25 @@ static int cpuacct_percpu_seq_read(struc
> return 0;
> }
>
> +static const char *cpuacct_stat_desc[] = {
> + [CPUACCT_STAT_UTIME] = "utime",
> + [CPUACCT_STAT_STIME] = "stime",
> +};
> +
> +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
> + struct cgroup_map_cb *cb)
> +{
> + struct cpuacct *ca = cgroup_ca(cgrp);
> + int i;
> +
> + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
> + s64 val = percpu_counter_read(&ca->cpustat[i]);
> + val = cputime_to_clock_t(val);
> + cb->fill(cb, cpuacct_stat_desc[i], val);
> + }
> + return 0;
> +}
> +
> static struct cftype files[] = {
> {
> .name = "usage",
> @@ -9865,7 +9917,10 @@ static struct cftype files[] = {
> .name = "usage_percpu",
> .read_seq_string = cpuacct_percpu_seq_read,
> },
> -
> + {
> + .name = "stat",
> + .read_map = cpuacct_stats_show,
> + },
> };
>
> static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> @@ -9895,6 +9950,25 @@ static void cpuacct_charge(struct task_s
> }
> }
>
> +/*
> + * Account the system/user time to the task's accounting group.
> + */
> +static void cpuacct_update_stats(struct task_struct *tsk,
> + enum cpuacct_stat_index idx, cputime_t val)
> +{
> + struct cpuacct *ca;
> +
> + if (unlikely(!cpuacct_subsys.active))
> + return;
> +
> + ca = task_ca(tsk);
> +
> + do {
> + percpu_counter_add(&ca->cpustat[idx], val);
> + ca = ca->parent;
> + } while (ca);
> +}
> +

IIUC, to make sure accessing "ca" to be safe, we need some condition.
(task_lock() or some other.....

Could you add "why this is safe ?" as comment ?

-Kame



> struct cgroup_subsys cpuacct_subsys = {
> .name = "cpuacct",
> .create = cpuacct_create,
>

2009-03-16 04:37:51

by Bharata B Rao

Subject: Re: [RFC PATCH -tip] cpuacct: per-cgroup utime/stime statistics - v2

On Mon, Mar 16, 2009 at 10:35:17AM +0900, KAMEZAWA Hiroyuki wrote:
> On Thu, 12 Mar 2009 16:39:24 +0530
> Bharata B Rao <[email protected]> wrote:
>
> > #ifdef CONFIG_CGROUP_CPUACCT
> > static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
> > +static void cpuacct_update_stats(struct task_struct *tsk,
> > + enum cpuacct_stat_index idx, cputime_t val);
> > #else
> > static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
> > +static void cpuacct_update_stats(struct task_struct *tsk,
> > + enum cpuacct_stat_index idx, cputime_t val) {}
> > #endif
> "static inline"
> if no "inline", the complier will show warning as
> "this function is defined but not used.."

Ok.

> >
> > +/*
> > + * Account the system/user time to the task's accounting group.
> > + */
> > +static void cpuacct_update_stats(struct task_struct *tsk,
> > + enum cpuacct_stat_index idx, cputime_t val)
> > +{
> > + struct cpuacct *ca;
> > +
> > + if (unlikely(!cpuacct_subsys.active))
> > + return;
> > +
> > + ca = task_ca(tsk);
> > +
> > + do {
> > + percpu_counter_add(&ca->cpustat[idx], val);
> > + ca = ca->parent;
> > + } while (ca);
> > +}
> > +
>
> IIUC, to make sure accessing "ca" to be safe, we need some condition.
> (task_lock() or some other.....

task_lock() protects tsk->cgroups->subsys[]. So can we hold task_lock()
to protect this walk ? But we do this cpuacct hierarchy walk for the
current task here. So can a current task's ca or ca's parents disappear
from under us ?

Regards,
Bharata.

2009-03-16 05:03:31

by Kamezawa Hiroyuki

Subject: Re: [RFC PATCH -tip] cpuacct: per-cgroup utime/stime statistics - v2

On Mon, 16 Mar 2009 10:07:54 +0530
Bharata B Rao <[email protected]> wrote:

> > > +static void cpuacct_update_stats(struct task_struct *tsk,
> > > + enum cpuacct_stat_index idx, cputime_t val)
> > > +{
> > > + struct cpuacct *ca;
> > > +
> > > + if (unlikely(!cpuacct_subsys.active))
> > > + return;
> > > +
> > > + ca = task_ca(tsk);
> > > +
> > > + do {
> > > + percpu_counter_add(&ca->cpustat[idx], val);
> > > + ca = ca->parent;
> > > + } while (ca);
> > > +}
> > > +
> >
> > IIUC, to make sure accessing "ca" to be safe, we need some condition.
> > (task_lock() or some other.....
>
> task_lock() protects tsk->cgroups->subsys[]. So can we hold task_lock()
> to protect this walk ?
If necessary (see below)

> But we do this cpuacct hierarchy walk for the
> current task here. So can a current task's ca or ca's parents disappear
> from under us ?

Ok, I think the following points are correct:
1. All updates are called with preemption disabled.
2. If "ca" is pointed to by a task, it will not disappear.
3. If one of the children is alive, the parent is alive.
4. "attach" can happen and tsk may be moved to another cgroup,
because we don't take task_lock.
So ca = task_ca() may not be the correct one.

What we should explain here is how we handle case "4".

> > > + ca = task_ca(tsk);
> > > + ----------------------------------(*1)
> > > + do {
> > > + percpu_counter_add(&ca->cpustat[idx], val);
> > > + ca = ca->parent;
> > > + } while (ca);
----------------------------------(*2)

For accounting, we don't have any problem even if the task's cgroup changes
between (*1) and (*2). But considering that "ca" can become an empty cgroup
after (*1), in _theory_ we cannot say that "ca" is a valid pointer after (*1).

Hmm, if necessary, would adding rcu_read_lock() before (*1) be reasonable?
(I say "if necessary" because I'm not sure.)

Thanks,
-Kame





2009-03-16 07:13:43

by Li Zefan

Subject: Re: [RFC PATCH -tip] cpuacct: per-cgroup utime/stime statistics - v2

>>> +/*
>>> + * Account the system/user time to the task's accounting group.
>>> + */
>>> +static void cpuacct_update_stats(struct task_struct *tsk,
>>> + enum cpuacct_stat_index idx, cputime_t val)
>>> +{
>>> + struct cpuacct *ca;
>>> +
>>> + if (unlikely(!cpuacct_subsys.active))
>>> + return;
>>> +
>>> + ca = task_ca(tsk);
>>> +
>>> + do {
>>> + percpu_counter_add(&ca->cpustat[idx], val);
>>> + ca = ca->parent;
>>> + } while (ca);
>>> +}
>>> +
>> IIUC, to make sure accessing "ca" to be safe, we need some condition.
>> (task_lock() or some other.....
>
> task_lock() protects tsk->cgroups->subsys[]. So can we hold task_lock()
> to protect this walk ? But we do this cpuacct hierarchy walk for the
> current task here. So can a current task's ca or ca's parents disappear
> from under us ?
>

task_ca() should be protected by task_lock() or rcu_read_lock(), otherwise
there is a very small race:

ca = task_ca(tsk)
move @tsk to another cgroup
rmdir old_cgrp (thus ca is freed)
ca->cpustat <--- accessing freed memory

As KAMEZAWA-san said all updates are called under preempt-disabled, and
classic and tree rcu's rcu_read_lock does preempt disable only, so above
code is ok, except for rcupreempt.

2009-03-16 08:47:48

by Bharata B Rao

Subject: Re: [RFC PATCH -tip] cpuacct: per-cgroup utime/stime statistics - v2

On Mon, Mar 16, 2009 at 03:13:38PM +0800, Li Zefan wrote:
> >>> +/*
> >>> + * Account the system/user time to the task's accounting group.
> >>> + */
> >>> +static void cpuacct_update_stats(struct task_struct *tsk,
> >>> + enum cpuacct_stat_index idx, cputime_t val)
> >>> +{
> >>> + struct cpuacct *ca;
> >>> +
> >>> + if (unlikely(!cpuacct_subsys.active))
> >>> + return;
> >>> +
> >>> + ca = task_ca(tsk);
> >>> +
> >>> + do {
> >>> + percpu_counter_add(&ca->cpustat[idx], val);
> >>> + ca = ca->parent;
> >>> + } while (ca);
> >>> +}
> >>> +
> >> IIUC, to make sure accessing "ca" to be safe, we need some condition.
> >> (task_lock() or some other.....
> >
> > task_lock() protects tsk->cgroups->subsys[]. So can we hold task_lock()
> > to protect this walk ? But we do this cpuacct hierarchy walk for the
> > current task here. So can a current task's ca or ca's parents disappear
> > from under us ?
> >
>
> task_ca() should be protected by task_lock() or rcu_read_lock(), otherwise
> there is a very small race:
>
> ca = task_ca(tsk)
> move @tsk to another cgroup
> rmdir old_cgrp (thus ca is freed)
> ca->cpustat <--- accessing freed memory
>
> As KAMEZAWA-san said all updates are called under preempt-disabled, and
> classic and tree rcu's rcu_read_lock does preempt disable only, so above
> code is ok, except for rcupreempt.

So I will protect task_ca() and ca hierarchy walk with explicit
rcu_read_lock() to be fully safe.

By the same logic, hierarchy walk in cpuacct_charge() is also
not safe with rcupreempt. It is under preempt disabled section due
to rq->lock. Does cpuacct_charge() also need a fix then ?

Regards,
Bharata.

2009-03-16 08:58:54

by Li Zefan

Subject: Re: [RFC PATCH -tip] cpuacct: per-cgroup utime/stime statistics - v2

Bharata B Rao wrote:
> On Mon, Mar 16, 2009 at 03:13:38PM +0800, Li Zefan wrote:
>>>>> +/*
>>>>> + * Account the system/user time to the task's accounting group.
>>>>> + */
>>>>> +static void cpuacct_update_stats(struct task_struct *tsk,
>>>>> + enum cpuacct_stat_index idx, cputime_t val)
>>>>> +{
>>>>> + struct cpuacct *ca;
>>>>> +
>>>>> + if (unlikely(!cpuacct_subsys.active))
>>>>> + return;
>>>>> +
>>>>> + ca = task_ca(tsk);
>>>>> +
>>>>> + do {
>>>>> + percpu_counter_add(&ca->cpustat[idx], val);
>>>>> + ca = ca->parent;
>>>>> + } while (ca);
>>>>> +}
>>>>> +
>>>> IIUC, to make sure accessing "ca" to be safe, we need some condition.
>>>> (task_lock() or some other.....
>>> task_lock() protects tsk->cgroups->subsys[]. So can we hold task_lock()
>>> to protect this walk ? But we do this cpuacct hierarchy walk for the
>>> current task here. So can a current task's ca or ca's parents disappear
>>> from under us ?
>>>
>> task_ca() should be protected by task_lock() or rcu_read_lock(), otherwise
>> there is a very small race:
>>
>> ca = task_ca(tsk)
>> move @tsk to another cgroup
>> rmdir old_cgrp (thus ca is freed)
>> ca->cpustat <--- accessing freed memory
>>
>> As KAMEZAWA-san said all updates are called under preempt-disabled, and
>> classic and tree rcu's rcu_read_lock does preempt disable only, so above
>> code is ok, except for rcupreempt.
>
> So I will protect task_ca() and ca hierarchy walk with explicit
> rcu_read_lock() to be fully safe.
>

either:

rcu_read_lock();
ca = task_ca(tsk);
do {
	percpu_counter_add(&ca->cpustat[idx], val);
	ca = ca->parent;
} while (ca);
rcu_read_unlock();

or:

rcu_read_lock();
ca = task_ca(tsk);
css_get(&ca->css);
rcu_read_unlock();
do {
	percpu_counter_add(&ca->cpustat[idx], val);
	ca = ca->parent;
} while (ca);
css_put(&ca->css);

which is more efficient?

> By the same logic, hierarchy walk in cpuacct_charge() is also
> not safe with rcupreempt. It is under preempt disabled section due
> to rq->lock. Does cpuacct_charge() also need a fix then ?
>

I guess so..

2009-03-17 06:23:32

by Bharata B Rao

Subject: Re: [RFC PATCH -tip] cpuacct: per-cgroup utime/stime statistics - v2

On Mon, Mar 16, 2009 at 04:58:45PM +0800, Li Zefan wrote:
> Bharata B Rao wrote:
> > On Mon, Mar 16, 2009 at 03:13:38PM +0800, Li Zefan wrote:
> >>>>> +/*
> >>>>> + * Account the system/user time to the task's accounting group.
> >>>>> + */
> >>>>> +static void cpuacct_update_stats(struct task_struct *tsk,
> >>>>> + enum cpuacct_stat_index idx, cputime_t val)
> >>>>> +{
> >>>>> + struct cpuacct *ca;
> >>>>> +
> >>>>> + if (unlikely(!cpuacct_subsys.active))
> >>>>> + return;
> >>>>> +
> >>>>> + ca = task_ca(tsk);
> >>>>> +
> >>>>> + do {
> >>>>> + percpu_counter_add(&ca->cpustat[idx], val);
> >>>>> + ca = ca->parent;
> >>>>> + } while (ca);
> >>>>> +}
> >>>>> +
> >>>> IIUC, to make sure accessing "ca" to be safe, we need some condition.
> >>>> (task_lock() or some other.....
> >>> task_lock() protects tsk->cgroups->subsys[]. So can we hold task_lock()
> >>> to protect this walk ? But we do this cpuacct hierarchy walk for the
> >>> current task here. So can a current task's ca or ca's parents disappear
> >>> from under us ?
> >>>
> >> task_ca() should be protected by task_lock() or rcu_read_lock(), otherwise
> >> there is a very small race:
> >>
> >> ca = task_ca(tsk)
> >> move @tsk to another cgroup
> >> rmdir old_cgrp (thus ca is freed)
> >> ca->cpustat <--- accessing freed memory
> >>
> >> As KAMEZAWA-san said all updates are called under preempt-disabled, and
> >> classic and tree rcu's rcu_read_lock does preempt disable only, so above
> >> code is ok, except for rcupreempt.
> >
> > So I will protect task_ca() and ca hierarchy walk with explicit
> > rcu_read_lock() to be fully safe.
> >
>
> either:
>
> rcu_read_lock();
> ca = task_ca(tsk);
> do {
> percpu_counter_add(&ca->cpustat[idx], val);
> ca = ca->parent;
> } while (ca);
> rcu_read_unlock();
>
> or:
>
> rcu_read_lock();
> ca = task_ca(tsk);
> css_get(&ca->css);
> rcu_read_unlock();
> do {
> percpu_counter_add(&ca->cpustat[idx], val);
> ca = ca->parent;
> } while (ca);
> css_put(&ca->css);
>
> which is more efficient?

Went with the first one in my next version of the cpuacct stats patch,
as I felt it is better than the second option, which needs two atomic
operations.
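That is, cpuacct_update_stats() in the next version would look roughly like
this (a sketch: the v2 body from the patch above, with the first option's
rcu_read_lock()/rcu_read_unlock() around the hierarchy walk):

static void cpuacct_update_stats(struct task_struct *tsk,
		enum cpuacct_stat_index idx, cputime_t val)
{
	struct cpuacct *ca;

	if (unlikely(!cpuacct_subsys.active))
		return;

	rcu_read_lock();
	ca = task_ca(tsk);

	do {
		percpu_counter_add(&ca->cpustat[idx], val);
		ca = ca->parent;
	} while (ca);
	rcu_read_unlock();
}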

>
> > By the same logic, hierarchy walk in cpuacct_charge() is also
> > not safe with rcupreempt. It is under preempt disabled section due
> > to rq->lock. Does cpuacct_charge() also need a fix then ?
> >
>
> I guess so..

Sent a separate fix for this.
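(For reference, a sketch of what such a fix could look like; this is not the
actual posted patch, and the body of cpuacct_charge() is paraphrased here, so
it may differ in detail from the -tip tree. The idea is simply to wrap the
existing hierarchy walk in an explicit RCU read-side critical section.)

static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
	struct cpuacct *ca;
	int cpu;

	if (unlikely(!cpuacct_subsys.active))
		return;

	cpu = task_cpu(tsk);

	rcu_read_lock();
	for (ca = task_ca(tsk); ca; ca = ca->parent) {
		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
		*cpuusage += cputime;
	}
	rcu_read_unlock();
}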

Thanks for your review and suggestions.

Regards,
Bharata.