2010-04-07 01:22:21

by Mike Chan

[permalink] [raw]
Subject: [PATCH] sched: cpuacct: Track cpuusage for CPU frequencies

New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.

cpuacct.cpufreq reports the CPU time (nanoseconds) spent at each CPU frequency

Maximum number of frequencies supported is 32. As future architectures are
added that support more than 32 frequency levels, CPUFREQ_TABLE_MAX in sched.c
needs to be updated.

Signed-off-by: Mike Chan <[email protected]>
---
Documentation/cgroups/cpuacct.txt | 3 +
kernel/sched.c | 112 +++++++++++++++++++++++++++++++++++++
2 files changed, 115 insertions(+), 0 deletions(-)

diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt
index 8b93094..61c2bce 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -40,6 +40,9 @@ system: Time spent by tasks of the cgroup in kernel mode.

user and system are in USER_HZ unit.

+cpuacct.cpufreq file gives the CPU time (in nanoseconds) spent at each CPU
+frequency.
+
cpuacct controller uses percpu_counter interface to collect user and
system times. This has two side effects:

diff --git a/kernel/sched.c b/kernel/sched.c
index 528a105..96a214d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
+#include <linux/cpufreq.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -8817,6 +8818,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
* ([email protected]).
*/

+#ifdef CONFIG_CPU_FREQ_STAT
+#define CPUFREQ_TABLE_MAX 32
+/* Table that represents cpu frequencies available */
+struct cpufreq_table { u64 freq[CPUFREQ_TABLE_MAX]; };
+#endif
+
/* track cpu usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
@@ -8824,6 +8831,9 @@ struct cpuacct {
u64 __percpu *cpuusage;
struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
+#ifdef CONFIG_CPU_FREQ_STAT
+ struct cpufreq_table *cpufreq_table;
+#endif
};

struct cgroup_subsys cpuacct_subsys;
@@ -8856,6 +8866,10 @@ static struct cgroup_subsys_state *cpuacct_create(
if (!ca->cpuusage)
goto out_free_ca;

+#ifdef CONFIG_CPU_FREQ_STAT
+ ca->cpufreq_table = alloc_percpu(struct cpufreq_table);
+#endif
+
for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
if (percpu_counter_init(&ca->cpustat[i], 0))
goto out_free_counters;
@@ -8888,6 +8902,87 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
kfree(ca);
}

+#ifdef CONFIG_CPU_FREQ_STAT
+static int cpufreq_index;
+static int cpuacct_cpufreq_notify(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ int ret;
+ struct cpufreq_policy *policy;
+ struct cpufreq_freqs *freq = data;
+ struct cpufreq_frequency_table *table;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ /* Update cpufreq_index with current speed */
+ policy = cpufreq_cpu_get(freq->cpu);
+ table = cpufreq_frequency_get_table(freq->cpu);
+ /*
+ * Get the index of the frequency in the freq_table for this cpu.
+ * The choic of which relation to use
+ * CPUFREQ_RELATION_L (lowest frequency at or above target) or
+ * CPUFREQ_RELATION_H (highest frequency below or at target)
+ * is arbitrary. Reason being, the current speed is guaranteed to
+ * exist within the table so we will always match exactly.
+ */
+ ret = cpufreq_frequency_table_target(policy, table, policy->cur,
+ CPUFREQ_RELATION_L, &cpufreq_index);
+ cpufreq_cpu_put(policy);
+ return ret;
+}
+
+static struct notifier_block cpufreq_notifier = {
+ .notifier_call = cpuacct_cpufreq_notify,
+};
+
+static __init int cpuacct_init(void)
+{
+ return cpufreq_register_notifier(&cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+}
+
+module_init(cpuacct_init);
+
+static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ int i;
+ unsigned int cpu;
+ char buf[32];
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpufreq_frequency_table *table =
+ cpufreq_frequency_get_table(smp_processor_id());
+
+ for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
+ u64 total = 0;
+
+ if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
+ continue;
+
+ /*
+ * Include all present cpus, even ones that are currently
+ * offline. Otherwise statistics could be incorrect if a cpu
+ * consistently cycles between on and offline for the lifetime
+ * of the system.
+ */
+ for_each_present_cpu(cpu) {
+ struct cpufreq_table *cpufreq_table;
+
+ cpufreq_table = per_cpu_ptr(ca->cpufreq_table, cpu);
+ table = cpufreq_frequency_get_table(cpu);
+
+ total += cpufreq_table->freq[i];
+ }
+
+ snprintf(buf, sizeof(buf), "%d", table[i].frequency);
+ cb->fill(cb, buf, total);
+ }
+
+ return 0;
+}
+#endif
+
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
@@ -9003,6 +9098,12 @@ static struct cftype files[] = {
.name = "stat",
.read_map = cpuacct_stats_show,
},
+#ifdef CONFIG_CPU_FREQ_STAT
+ {
+ .name = "cpufreq",
+ .read_map = cpuacct_cpufreq_show,
+ },
+#endif
};

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9031,6 +9132,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)

for (; ca; ca = ca->parent) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+#ifdef CONFIG_CPU_FREQ_STAT
+ struct cpufreq_table *cpufreq_table =
+ per_cpu_ptr(ca->cpufreq_table, cpu);
+
+ if (cpufreq_index > CPUFREQ_TABLE_MAX)
+ printk_once(KERN_WARNING "cpuacct_charge: "
+ "cpufreq_index: %d exceeds max table "
+ "size\n", cpufreq_index);
+ else
+ cpufreq_table->freq[cpufreq_index] += cputime;
+#endif
*cpuusage += cputime;
}

--
1.7.0.1


2010-04-09 09:46:34

by Thomas Renninger

[permalink] [raw]
Subject: Re: [PATCH] sched: cpuacct: Track cpuusage for CPU frequencies

On Wednesday 07 April 2010 03:21:59 Mike Chan wrote:
> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>
> cpuacct.cpufreq reports the CPU time (nanoseconds) spent at each CPU frequency
>
> Maximum number of frequencies supported is 32. As future architectures are
> added that support more than 32 frequency levels, CPUFREQ_TABLE_MAX in sched.c
> needs to be updated.
Why is accounting of each frequency needed?
pcc-cpufreq driver can do every frequency in a range and supports hundreds of
different frequencies, thus it does not depend on CPU_FREQ_TABLE.
Would the average frequency be enough to track/account?
This would avoid the static interface of listing each available freq.
It would also count "boosted" frequency case which is avail on most recent
X86 cpus.

> Signed-off-by: Mike Chan <[email protected]>
> ---
> Documentation/cgroups/cpuacct.txt | 3 +
> kernel/sched.c | 112 +++++++++++++++++++++++++++++++++++++
> 2 files changed, 115 insertions(+), 0 deletions(-)
...
> static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> @@ -9031,6 +9132,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
>
> for (; ca; ca = ca->parent) {
> u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
> +#ifdef CONFIG_CPU_FREQ_STAT
> + struct cpufreq_table *cpufreq_table =
> + per_cpu_ptr(ca->cpufreq_table, cpu);
> +
> + if (cpufreq_index > CPUFREQ_TABLE_MAX)
> + printk_once(KERN_WARNING "cpuacct_charge: "
> + "cpufreq_index: %d exceeds max table "
> + "size\n", cpufreq_index);
> + else
> + cpufreq_table->freq[cpufreq_index] += cputime;
> +#endif
Can the frequency change somewhere in the middle between cpuacct_charge is
called?
What guarantees that the task run at cpufreq_table->freq[cpufreq_index]
all the time?

Thomas

> *cpuusage += cputime;
> }
>
> --

2010-04-09 20:50:38

by Mike Chan

[permalink] [raw]
Subject: Re: [PATCH] sched: cpuacct: Track cpuusage for CPU frequencies

2010/4/9 Thomas Renninger <[email protected]>:
> On Wednesday 07 April 2010 03:21:59 Mike Chan wrote:
>> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>>
>> cpuacct.cpufreq reports the CPU time (nanoseconds) spent at each CPU frequency
>>
>> Maximum number of frequencies supported is 32. As future architectures are
>> added that support more than 32 frequency levels, CPUFREQ_TABLE_MAX in sched.c
>> needs to be updated.
> Why is accounting of each frequency needed?

The intent is to track time spent at each cpu frequency to measure
power consumption. Userspace can figure out the mapping between
frequency and power consumption. This is also a useful indication of
what kind of hw performance userspace apps need (does Chrome really
need 1ghz?).

Paul Menage had suggested an integral earlier in my [RFC] patch. I
wasn't completely against the idea but it had a few shortcomings that
I couldn't think of decent solutions for. You would have to either
pre-define power consumption for the cpu frequences per-arch or board
file. Or have a way to calculate.

> pcc-cpufreq driver can do every frequency in a range and supports hundreds of
> different frequencies, thus it does not depend on CPU_FREQ_TABLE.
> Would the average frequency be enough to track/account?

Humm, this is a tricky case we haven't yet run into for ARM. Average
frequency might not be too useful because power is not linear with
speed. We could possibly have buckets for speeds (hi/lo).

> This would avoid the static interface of listing each available freq.
> It would also count "boosted" frequency case which is avail on most recent
> X86 cpus.
>
>> Signed-off-by: Mike Chan <[email protected]>
>> ---
>> ?Documentation/cgroups/cpuacct.txt | ? ?3 +
>> ?kernel/sched.c ? ? ? ? ? ? ? ? ? ?| ?112 +++++++++++++++++++++++++++++++++++++
>> ?2 files changed, 115 insertions(+), 0 deletions(-)
> ...
>> ?static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>> @@ -9031,6 +9132,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
>>
>> ? ? ? for (; ca; ca = ca->parent) {
>> ? ? ? ? ? ? ? u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? ? ? ? ? struct cpufreq_table *cpufreq_table =
>> + ? ? ? ? ? ? ? ? ? ? per_cpu_ptr(ca->cpufreq_table, cpu);
>> +
>> + ? ? ? ? ? ? if (cpufreq_index > CPUFREQ_TABLE_MAX)
>> + ? ? ? ? ? ? ? ? ? ? printk_once(KERN_WARNING "cpuacct_charge: "
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "cpufreq_index: %d exceeds max table "
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "size\n", cpufreq_index);
>> + ? ? ? ? ? ? else
>> + ? ? ? ? ? ? ? ? ? ? cpufreq_table->freq[cpufreq_index] += cputime;
>> +#endif
> Can the frequency change somewhere in the middle between cpuacct_charge is
> called?
> What guarantees that the task run at cpufreq_table->freq[cpufreq_index]
> all the time?
>

Ah, good catch, it doesn't. What we can do is register a callback for
a cpu frequency transition notifier. I can fix this up in a v2.

-- Mike

> ? ? Thomas
>
>> ? ? ? ? ? ? ? *cpuusage += cputime;
>> ? ? ? }
>>
>> --
>

2010-04-12 20:03:45

by Thomas Renninger

[permalink] [raw]
Subject: Re: [PATCH] sched: cpuacct: Track cpuusage for CPU frequencies

On Friday 09 April 2010 10:50:33 pm Mike Chan wrote:
> 2010/4/9 Thomas Renninger <[email protected]>:
> > On Wednesday 07 April 2010 03:21:59 Mike Chan wrote:
> >> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
> >>
> >> cpuacct.cpufreq reports the CPU time (nanoseconds) spent at each CPU
> >> frequency
> >>
> >> Maximum number of frequencies supported is 32. As future architectures
> >> are added that support more than 32 frequency levels, CPUFREQ_TABLE_MAX
> >> in sched.c needs to be updated.
> >
> > Why is accounting of each frequency needed?
>
> The intent is to track time spent at each cpu frequency to measure
> power consumption. Userspace can figure out the mapping between
> frequency and power consumption. This is also a useful indication of
> what kind of hw performance userspace apps need (does Chrome really
> need 1ghz?).
>
> Paul Menage had suggested an integral earlier in my [RFC] patch. I
> wasn't completely against the idea but it had a few shortcomings that
> I couldn't think of decent solutions for. You would have to either
> pre-define power consumption for the cpu frequences per-arch or board
> file. Or have a way to calculate.
Sounds as if this is for specific CPUs/boards only then.
X86 boosting and PCC driver are hard, possibly impossible to track (in respect
to real power consumption).

> > pcc-cpufreq driver can do every frequency in a range and supports
> > hundreds of different frequencies, thus it does not depend on
> > CPU_FREQ_TABLE. Would the average frequency be enough to track/account?
> Humm, this is a tricky case we haven't yet run into for ARM. Average
> frequency might not be too useful because power is not linear with
> speed. We could possibly have buckets for speeds (hi/lo).
Your whole concept sounds as if it requires limited amount of frequencies.
Don't mind for the special case I mentioned.

Thomas

2010-04-12 22:01:14

by Mike Chan

[permalink] [raw]
Subject: Re: [PATCH] sched: cpuacct: Track cpuusage for CPU frequencies

On Mon, Apr 12, 2010 at 1:03 PM, Thomas Renninger <[email protected]> wrote:
> On Friday 09 April 2010 10:50:33 pm Mike Chan wrote:
>> 2010/4/9 Thomas Renninger <[email protected]>:
>> > On Wednesday 07 April 2010 03:21:59 Mike Chan wrote:
>> >> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>> >>
>> >> cpuacct.cpufreq reports the CPU time (nanoseconds) spent at each CPU
>> >> frequency
>> >>
>> >> Maximum number of frequencies supported is 32. As future architectures
>> >> are added that support more than 32 frequency levels, CPUFREQ_TABLE_MAX
>> >> in sched.c needs to be updated.
>> >
>> > Why is accounting of each frequency needed?
>>
>> The intent is to track time spent at each cpu frequency to measure
>> power consumption. Userspace can figure out the mapping between
>> frequency and power consumption. This is also a useful indication of
>> what kind of hw performance userspace apps need (does Chrome really
>> need 1ghz?).
>>
>> Paul Menage had suggested an integral earlier in my [RFC] patch. I
>> wasn't completely against the idea but it had a few shortcomings that
>> I couldn't think of decent solutions for. You would have to either
>> pre-define power consumption for the cpu frequences per-arch or board
>> file. Or have a way to calculate.
> Sounds as if this is for specific CPUs/boards only then.
> X86 boosting and PCC driver are hard, possibly impossible to track (in respect
> to real power consumption).
>

Good point, although the class of CPUs that would benefit would be
ones with fixed frequencies, very common in the ARM world. For X86 and
PPC, if they are not using CONFIG_CPU_FREQ_STAT these statistics will
not be enabled. Perhaps what I should really be checking for is
CONFIG_CPU_FREQ_TABLE.

>> > pcc-cpufreq driver can do every frequency in a range and supports
>> > hundreds of different frequencies, thus it does not depend on
>> > CPU_FREQ_TABLE. Would the average frequency be enough to track/account?
>> Humm, this is a tricky case we haven't yet run into for ARM. Average
>> frequency might not be too useful because power is not linear with
>> speed. We could possibly have buckets for speeds (hi/lo).
> Your whole concept sounds as if it requires limited amount of frequencies.
> Don't mind for the special case I mentioned.
>

True, this was mostly inspired by cpufreq stats/time_in_state file,
which shows how much time the CPU has spent globally at each
frequency.
However applying this to cpu acct groups allows us to track how
userspace applications are consuming processing power.

So here are some thoughts after everyone's feedback:

1) Keep tracking in kernel/sched.c: For CPUs that do not use fixed
frequencies (PPC, x86), this is disabled, as its too difficult (at
least for now) to track. Perhaps later someone with more intelligence
than myself and more knowledge of these architectures can figure out a
way to track.

2) Add cpuacct notifiers: Introduce cpuacct notifiers that board or
mach-cpu files could register. We have the benefit here that at the
arch level we know if / how many fixed frequencies we can run at. This
is probably going to result in a little bit of code duplication across
several architectures, omap, msm and tegra at least for Android.

3) Not useful for upstream. In which case this will go into our
android/common branch.

I'm slightly favoring #2. This also gives us the benefit of
(optionally) exporting some power integral value Paul Menage suggested
earlier, if such power numbers are available from the board-file.

-- Mike

> ? ? ?Thomas
>