2010-04-05 19:33:44

by Mike Chan

[permalink] [raw]
Subject: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.

cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
in nano-seconds

We do not know the cpufreq table size at compile time.
So a new config option CONFIG_CPUACCT_CPUFREQ_TABLE_MAX is intruduced,
to determine the cpufreq table per-cpu in the cpuacct struct.

Signed-off-by: Mike Chan <[email protected]>
---
init/Kconfig | 5 +++
kernel/sched.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 104 insertions(+), 0 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index eb77e8c..e1e86df 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -534,6 +534,11 @@ config CGROUP_CPUACCT
Provides a simple Resource Controller for monitoring the
total CPU consumed by the tasks in a cgroup.

+config CPUACCT_CPUFREQ_TABLE_MAX
+ int "Max CPUFREQ table size"
+ depends on CGROUP_CPUACCT && CPU_FREQ_TABLE
+ default 32
+
config RESOURCE_COUNTERS
bool "Resource counters"
help
diff --git a/kernel/sched.c b/kernel/sched.c
index 528a105..a0b56b5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
+#include <linux/cpufreq.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -8817,6 +8818,11 @@ struct cgroup_subsys cpu_cgroup_subsys = {
* ([email protected]).
*/

+#ifdef CONFIG_CPU_FREQ_STAT
+/* The alloc_percpu macro uses typeof so we must define a type here. */
+typedef struct { u64 usage[CONFIG_CPUACCT_CPUFREQ_TABLE_MAX]; } cpufreq_usage_t;
+#endif
+
/* track cpu usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
@@ -8824,6 +8830,10 @@ struct cpuacct {
u64 __percpu *cpuusage;
struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
+#ifdef CONFIG_CPU_FREQ_STAT
+ /* cpufreq_usage is a pointer to an array of u64-types on every cpu */
+ cpufreq_usage_t *cpufreq_usage;
+#endif
};

struct cgroup_subsys cpuacct_subsys;
@@ -8856,6 +8866,10 @@ static struct cgroup_subsys_state *cpuacct_create(
if (!ca->cpuusage)
goto out_free_ca;

+#ifdef CONFIG_CPU_FREQ_STAT
+ ca->cpufreq_usage = alloc_percpu(cpufreq_usage_t);
+#endif
+
for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
if (percpu_counter_init(&ca->cpustat[i], 0))
goto out_free_counters;
@@ -8888,6 +8902,74 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
kfree(ca);
}

+#ifdef CONFIG_CPU_FREQ_STAT
+static int cpufreq_index;
+static int cpuacct_cpufreq_notify(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ int ret;
+ struct cpufreq_policy *policy;
+ struct cpufreq_freqs *freq = data;
+ struct cpufreq_frequency_table *table;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ /* Update cpufreq_index with current speed */
+ table = cpufreq_frequency_get_table(freq->cpu);
+ policy = cpufreq_cpu_get(freq->cpu);
+ ret = cpufreq_frequency_table_target(policy, table,
+ cpufreq_quick_get(freq->cpu),
+ CPUFREQ_RELATION_L, &cpufreq_index);
+ cpufreq_cpu_put(policy);
+ return ret;
+}
+
+static struct notifier_block cpufreq_notifier = {
+ .notifier_call = cpuacct_cpufreq_notify,
+};
+
+static __init int cpuacct_init(void)
+{
+ return cpufreq_register_notifier(&cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+}
+
+module_init(cpuacct_init);
+
+static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ int i;
+ unsigned int cpu;
+ char buf[32];
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpufreq_frequency_table *table =
+ cpufreq_frequency_get_table(smp_processor_id());
+
+ for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
+ u64 total = 0;
+
+ if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
+ continue;
+
+ for_each_present_cpu(cpu) {
+ cpufreq_usage_t *cpufreq_usage;
+
+ cpufreq_usage = per_cpu_ptr(ca->cpufreq_usage, cpu);
+ table = cpufreq_frequency_get_table(cpu);
+
+ total += cpufreq_usage->usage[i];
+ }
+
+ snprintf(buf, sizeof(buf), "%d", table[i].frequency);
+ cb->fill(cb, buf, total);
+ }
+
+ return 0;
+}
+#endif
+
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
@@ -9003,6 +9085,12 @@ static struct cftype files[] = {
.name = "stat",
.read_map = cpuacct_stats_show,
},
+#ifdef CONFIG_CPU_FREQ_STAT
+ {
+ .name = "cpufreq",
+ .read_map = cpuacct_cpufreq_show,
+ },
+#endif
};

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9031,6 +9119,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)

for (; ca; ca = ca->parent) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+#ifdef CONFIG_CPU_FREQ_STAT
+ cpufreq_usage_t *cpufreq_usage =
+ per_cpu_ptr(ca->cpufreq_usage, cpu);
+
+ if (cpufreq_index > CONFIG_CPUACCT_CPUFREQ_TABLE_MAX)
+ printk_once(KERN_WARNING "cpuacct_charge: "
+ "cpufreq_index: %d exceeds max table "
+ "size\n", cpufreq_index);
+ else
+ cpufreq_usage->usage[cpufreq_index] += cputime;
+#endif
*cpuusage += cputime;
}

--
1.7.0.1


2010-04-05 19:53:08

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

On Mon, Apr 5, 2010 at 12:33 PM, Mike Chan <[email protected]> wrote:
> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>
> cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
> in nano-seconds

Can you clarify the wording of this (and describe it in the relevant
Documentation/... file)? It's not clear.

>From the code, it appears that the file reports a breakdown of how
much CPU time the cgroup has been consuming at each different CPU
frequency level. If so, then you probably want to reword the
description to avoid "per-cpu", since that makes it sounds as though
it's reporting something, well, "per CPU".

Also, what's the motivation here? If it's for power monitoring
purposes, might it be simpler to just report a single number, that's
the integral of the CPU usage by frequency index (i.e. calculated from
the same information that this patch is already gathering in
cpuacct_charge()) rather than dumping a whole table on userspace?

Paul

>
> We do not know the cpufreq table size at compile time.
> So a new config option CONFIG_CPUACCT_CPUFREQ_TABLE_MAX is intruduced,
> to determine the cpufreq table per-cpu in the cpuacct struct.
>
> Signed-off-by: Mike Chan <[email protected]>
> ---
> ?init/Kconfig ? | ? ?5 +++
> ?kernel/sched.c | ? 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> ?2 files changed, 104 insertions(+), 0 deletions(-)
>
> diff --git a/init/Kconfig b/init/Kconfig
> index eb77e8c..e1e86df 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -534,6 +534,11 @@ config CGROUP_CPUACCT
> ? ? ? ? ?Provides a simple Resource Controller for monitoring the
> ? ? ? ? ?total CPU consumed by the tasks in a cgroup.
>
> +config CPUACCT_CPUFREQ_TABLE_MAX
> + ? ? ? int "Max CPUFREQ table size"
> + ? ? ? depends on CGROUP_CPUACCT && CPU_FREQ_TABLE
> + ? ? ? default 32
> +
> ?config RESOURCE_COUNTERS
> ? ? ? ?bool "Resource counters"
> ? ? ? ?help
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 528a105..a0b56b5 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -71,6 +71,7 @@
> ?#include <linux/debugfs.h>
> ?#include <linux/ctype.h>
> ?#include <linux/ftrace.h>
> +#include <linux/cpufreq.h>
>
> ?#include <asm/tlb.h>
> ?#include <asm/irq_regs.h>
> @@ -8817,6 +8818,11 @@ struct cgroup_subsys cpu_cgroup_subsys = {
> ?* ([email protected]).
> ?*/
>
> +#ifdef CONFIG_CPU_FREQ_STAT
> +/* The alloc_percpu macro uses typeof so we must define a type here. */
> +typedef struct { u64 usage[CONFIG_CPUACCT_CPUFREQ_TABLE_MAX]; } cpufreq_usage_t;
> +#endif
> +
> ?/* track cpu usage of a group of tasks and its child groups */
> ?struct cpuacct {
> ? ? ? ?struct cgroup_subsys_state css;
> @@ -8824,6 +8830,10 @@ struct cpuacct {
> ? ? ? ?u64 __percpu *cpuusage;
> ? ? ? ?struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
> ? ? ? ?struct cpuacct *parent;
> +#ifdef CONFIG_CPU_FREQ_STAT
> + ? ? ? /* cpufreq_usage is a pointer to an array of u64-types on every cpu */
> + ? ? ? cpufreq_usage_t *cpufreq_usage;
> +#endif
> ?};
>
> ?struct cgroup_subsys cpuacct_subsys;
> @@ -8856,6 +8866,10 @@ static struct cgroup_subsys_state *cpuacct_create(
> ? ? ? ?if (!ca->cpuusage)
> ? ? ? ? ? ? ? ?goto out_free_ca;
>
> +#ifdef CONFIG_CPU_FREQ_STAT
> + ? ? ? ca->cpufreq_usage = alloc_percpu(cpufreq_usage_t);
> +#endif
> +
> ? ? ? ?for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> ? ? ? ? ? ? ? ?if (percpu_counter_init(&ca->cpustat[i], 0))
> ? ? ? ? ? ? ? ? ? ? ? ?goto out_free_counters;
> @@ -8888,6 +8902,74 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
> ? ? ? ?kfree(ca);
> ?}
>
> +#ifdef CONFIG_CPU_FREQ_STAT
> +static int cpufreq_index;
> +static int cpuacct_cpufreq_notify(struct notifier_block *nb, unsigned long val,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? void *data)
> +{
> + ? ? ? int ret;
> + ? ? ? struct cpufreq_policy *policy;
> + ? ? ? struct cpufreq_freqs *freq = data;
> + ? ? ? struct cpufreq_frequency_table *table;
> +
> + ? ? ? if (val != CPUFREQ_POSTCHANGE)
> + ? ? ? ? ? ? ? return 0;
> +
> + ? ? ? /* Update cpufreq_index with current speed */
> + ? ? ? table = cpufreq_frequency_get_table(freq->cpu);
> + ? ? ? policy = cpufreq_cpu_get(freq->cpu);
> + ? ? ? ret = cpufreq_frequency_table_target(policy, table,
> + ? ? ? ? ? ? ? ? ? ? ? cpufreq_quick_get(freq->cpu),
> + ? ? ? ? ? ? ? ? ? ? ? CPUFREQ_RELATION_L, &cpufreq_index);
> + ? ? ? cpufreq_cpu_put(policy);
> + ? ? ? return ret;
> +}
> +
> +static struct notifier_block cpufreq_notifier = {
> + ? ? ? .notifier_call = cpuacct_cpufreq_notify,
> +};
> +
> +static __init int cpuacct_init(void)
> +{
> + ? ? ? return cpufreq_register_notifier(&cpufreq_notifier,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? CPUFREQ_TRANSITION_NOTIFIER);
> +}
> +
> +module_init(cpuacct_init);
> +
> +static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft,
> + ? ? ? ? ? ? ? struct cgroup_map_cb *cb)
> +{
> + ? ? ? int i;
> + ? ? ? unsigned int cpu;
> + ? ? ? char buf[32];
> + ? ? ? struct cpuacct *ca = cgroup_ca(cgrp);
> + ? ? ? struct cpufreq_frequency_table *table =
> + ? ? ? ? ? ? ? cpufreq_frequency_get_table(smp_processor_id());
> +
> + ? ? ? for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
> + ? ? ? ? ? ? ? u64 total = 0;
> +
> + ? ? ? ? ? ? ? if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
> + ? ? ? ? ? ? ? ? ? ? ? continue;
> +
> + ? ? ? ? ? ? ? for_each_present_cpu(cpu) {
> + ? ? ? ? ? ? ? ? ? ? ? cpufreq_usage_t *cpufreq_usage;
> +
> + ? ? ? ? ? ? ? ? ? ? ? cpufreq_usage = per_cpu_ptr(ca->cpufreq_usage, cpu);
> + ? ? ? ? ? ? ? ? ? ? ? table = cpufreq_frequency_get_table(cpu);
> +
> + ? ? ? ? ? ? ? ? ? ? ? total += cpufreq_usage->usage[i];
> + ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? snprintf(buf, sizeof(buf), "%d", table[i].frequency);
> + ? ? ? ? ? ? ? cb->fill(cb, buf, total);
> + ? ? ? }
> +
> + ? ? ? return 0;
> +}
> +#endif
> +
> ?static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
> ?{
> ? ? ? ?u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
> @@ -9003,6 +9085,12 @@ static struct cftype files[] = {
> ? ? ? ? ? ? ? ?.name = "stat",
> ? ? ? ? ? ? ? ?.read_map = cpuacct_stats_show,
> ? ? ? ?},
> +#ifdef CONFIG_CPU_FREQ_STAT
> + ? ? ? {
> + ? ? ? ? ? ? ? .name = "cpufreq",
> + ? ? ? ? ? ? ? .read_map = cpuacct_cpufreq_show,
> + ? ? ? },
> +#endif
> ?};
>
> ?static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> @@ -9031,6 +9119,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
>
> ? ? ? ?for (; ca; ca = ca->parent) {
> ? ? ? ? ? ? ? ?u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
> +#ifdef CONFIG_CPU_FREQ_STAT
> + ? ? ? ? ? ? ? cpufreq_usage_t *cpufreq_usage =
> + ? ? ? ? ? ? ? ? ? ? ? per_cpu_ptr(ca->cpufreq_usage, cpu);
> +
> + ? ? ? ? ? ? ? if (cpufreq_index > CONFIG_CPUACCT_CPUFREQ_TABLE_MAX)
> + ? ? ? ? ? ? ? ? ? ? ? printk_once(KERN_WARNING "cpuacct_charge: "
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "cpufreq_index: %d exceeds max table "
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "size\n", cpufreq_index);
> + ? ? ? ? ? ? ? else
> + ? ? ? ? ? ? ? ? ? ? ? cpufreq_usage->usage[cpufreq_index] += cputime;
> +#endif
> ? ? ? ? ? ? ? ?*cpuusage += cputime;
> ? ? ? ?}
>
> --
> 1.7.0.1
>
>

2010-04-05 21:03:00

by Mike Chan

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

On Mon, Apr 5, 2010 at 12:52 PM, Paul Menage <[email protected]> wrote:
> On Mon, Apr 5, 2010 at 12:33 PM, Mike Chan <[email protected]> wrote:
>> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>>
>> cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
>> in nano-seconds
>
> Can you clarify the wording of this (and describe it in the relevant
> Documentation/... file)? It's not clear.
>
> From the code, it appears that the file reports a breakdown of how
> much CPU time the cgroup has been consuming at each different CPU
> frequency level. If so, then you probably want to reword the
> description to avoid "per-cpu", since that makes it sounds as though
> it's reporting something, well, "per CPU".
>

Thanks for the input, I'll clarify in the commit and documentation.

> Also, what's the motivation here? If it's for power monitoring
> purposes, might it be simpler to just report a single number, that's
> the integral of the CPU usage by frequency index (i.e. calculated from
> the same information that this patch is already gathering in
> cpuacct_charge()) rather than dumping a whole table on userspace?
>

The motivation is for power monitoring. Exporting an integral is an
interesting idea, but this gets tricky since we can't just take the
integral of the cpu speed, since each speed has different power
measurements, and these are not linear for each cpu speed. We could
export power values in the board files for various cpu-frequencies

This can also be useful for userspace developers / system admins so
they can optimized their cpu utilization and overall cpu speed usage
can give insight for cpu max / min speed capping.

I liked the integral idea, but I think we lose some useful information
that the original patch intended to show.

-- Mike

> Paul
>
>>
>> We do not know the cpufreq table size at compile time.
>> So a new config option CONFIG_CPUACCT_CPUFREQ_TABLE_MAX is intruduced,
>> to determine the cpufreq table per-cpu in the cpuacct struct.
>>
>> Signed-off-by: Mike Chan <[email protected]>
>> ---
>> ?init/Kconfig ? | ? ?5 +++
>> ?kernel/sched.c | ? 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> ?2 files changed, 104 insertions(+), 0 deletions(-)
>>
>> diff --git a/init/Kconfig b/init/Kconfig
>> index eb77e8c..e1e86df 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -534,6 +534,11 @@ config CGROUP_CPUACCT
>> ? ? ? ? ?Provides a simple Resource Controller for monitoring the
>> ? ? ? ? ?total CPU consumed by the tasks in a cgroup.
>>
>> +config CPUACCT_CPUFREQ_TABLE_MAX
>> + ? ? ? int "Max CPUFREQ table size"
>> + ? ? ? depends on CGROUP_CPUACCT && CPU_FREQ_TABLE
>> + ? ? ? default 32
>> +
>> ?config RESOURCE_COUNTERS
>> ? ? ? ?bool "Resource counters"
>> ? ? ? ?help
>> diff --git a/kernel/sched.c b/kernel/sched.c
>> index 528a105..a0b56b5 100644
>> --- a/kernel/sched.c
>> +++ b/kernel/sched.c
>> @@ -71,6 +71,7 @@
>> ?#include <linux/debugfs.h>
>> ?#include <linux/ctype.h>
>> ?#include <linux/ftrace.h>
>> +#include <linux/cpufreq.h>
>>
>> ?#include <asm/tlb.h>
>> ?#include <asm/irq_regs.h>
>> @@ -8817,6 +8818,11 @@ struct cgroup_subsys cpu_cgroup_subsys = {
>> ?* ([email protected]).
>> ?*/
>>
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> +/* The alloc_percpu macro uses typeof so we must define a type here. */
>> +typedef struct { u64 usage[CONFIG_CPUACCT_CPUFREQ_TABLE_MAX]; } cpufreq_usage_t;
>> +#endif
>> +
>> ?/* track cpu usage of a group of tasks and its child groups */
>> ?struct cpuacct {
>> ? ? ? ?struct cgroup_subsys_state css;
>> @@ -8824,6 +8830,10 @@ struct cpuacct {
>> ? ? ? ?u64 __percpu *cpuusage;
>> ? ? ? ?struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
>> ? ? ? ?struct cpuacct *parent;
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? ? /* cpufreq_usage is a pointer to an array of u64-types on every cpu */
>> + ? ? ? cpufreq_usage_t *cpufreq_usage;
>> +#endif
>> ?};
>>
>> ?struct cgroup_subsys cpuacct_subsys;
>> @@ -8856,6 +8866,10 @@ static struct cgroup_subsys_state *cpuacct_create(
>> ? ? ? ?if (!ca->cpuusage)
>> ? ? ? ? ? ? ? ?goto out_free_ca;
>>
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? ? ca->cpufreq_usage = alloc_percpu(cpufreq_usage_t);
>> +#endif
>> +
>> ? ? ? ?for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
>> ? ? ? ? ? ? ? ?if (percpu_counter_init(&ca->cpustat[i], 0))
>> ? ? ? ? ? ? ? ? ? ? ? ?goto out_free_counters;
>> @@ -8888,6 +8902,74 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
>> ? ? ? ?kfree(ca);
>> ?}
>>
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> +static int cpufreq_index;
>> +static int cpuacct_cpufreq_notify(struct notifier_block *nb, unsigned long val,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? void *data)
>> +{
>> + ? ? ? int ret;
>> + ? ? ? struct cpufreq_policy *policy;
>> + ? ? ? struct cpufreq_freqs *freq = data;
>> + ? ? ? struct cpufreq_frequency_table *table;
>> +
>> + ? ? ? if (val != CPUFREQ_POSTCHANGE)
>> + ? ? ? ? ? ? ? return 0;
>> +
>> + ? ? ? /* Update cpufreq_index with current speed */
>> + ? ? ? table = cpufreq_frequency_get_table(freq->cpu);
>> + ? ? ? policy = cpufreq_cpu_get(freq->cpu);
>> + ? ? ? ret = cpufreq_frequency_table_target(policy, table,
>> + ? ? ? ? ? ? ? ? ? ? ? cpufreq_quick_get(freq->cpu),
>> + ? ? ? ? ? ? ? ? ? ? ? CPUFREQ_RELATION_L, &cpufreq_index);
>> + ? ? ? cpufreq_cpu_put(policy);
>> + ? ? ? return ret;
>> +}
>> +
>> +static struct notifier_block cpufreq_notifier = {
>> + ? ? ? .notifier_call = cpuacct_cpufreq_notify,
>> +};
>> +
>> +static __init int cpuacct_init(void)
>> +{
>> + ? ? ? return cpufreq_register_notifier(&cpufreq_notifier,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? CPUFREQ_TRANSITION_NOTIFIER);
>> +}
>> +
>> +module_init(cpuacct_init);
>> +
>> +static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft,
>> + ? ? ? ? ? ? ? struct cgroup_map_cb *cb)
>> +{
>> + ? ? ? int i;
>> + ? ? ? unsigned int cpu;
>> + ? ? ? char buf[32];
>> + ? ? ? struct cpuacct *ca = cgroup_ca(cgrp);
>> + ? ? ? struct cpufreq_frequency_table *table =
>> + ? ? ? ? ? ? ? cpufreq_frequency_get_table(smp_processor_id());
>> +
>> + ? ? ? for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
>> + ? ? ? ? ? ? ? u64 total = 0;
>> +
>> + ? ? ? ? ? ? ? if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
>> + ? ? ? ? ? ? ? ? ? ? ? continue;
>> +
>> + ? ? ? ? ? ? ? for_each_present_cpu(cpu) {
>> + ? ? ? ? ? ? ? ? ? ? ? cpufreq_usage_t *cpufreq_usage;
>> +
>> + ? ? ? ? ? ? ? ? ? ? ? cpufreq_usage = per_cpu_ptr(ca->cpufreq_usage, cpu);
>> + ? ? ? ? ? ? ? ? ? ? ? table = cpufreq_frequency_get_table(cpu);
>> +
>> + ? ? ? ? ? ? ? ? ? ? ? total += cpufreq_usage->usage[i];
>> + ? ? ? ? ? ? ? }
>> +
>> + ? ? ? ? ? ? ? snprintf(buf, sizeof(buf), "%d", table[i].frequency);
>> + ? ? ? ? ? ? ? cb->fill(cb, buf, total);
>> + ? ? ? }
>> +
>> + ? ? ? return 0;
>> +}
>> +#endif
>> +
>> ?static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
>> ?{
>> ? ? ? ?u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
>> @@ -9003,6 +9085,12 @@ static struct cftype files[] = {
>> ? ? ? ? ? ? ? ?.name = "stat",
>> ? ? ? ? ? ? ? ?.read_map = cpuacct_stats_show,
>> ? ? ? ?},
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? ? {
>> + ? ? ? ? ? ? ? .name = "cpufreq",
>> + ? ? ? ? ? ? ? .read_map = cpuacct_cpufreq_show,
>> + ? ? ? },
>> +#endif
>> ?};
>>
>> ?static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>> @@ -9031,6 +9119,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
>>
>> ? ? ? ?for (; ca; ca = ca->parent) {
>> ? ? ? ? ? ? ? ?u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? ? ? ? ? ? cpufreq_usage_t *cpufreq_usage =
>> + ? ? ? ? ? ? ? ? ? ? ? per_cpu_ptr(ca->cpufreq_usage, cpu);
>> +
>> + ? ? ? ? ? ? ? if (cpufreq_index > CONFIG_CPUACCT_CPUFREQ_TABLE_MAX)
>> + ? ? ? ? ? ? ? ? ? ? ? printk_once(KERN_WARNING "cpuacct_charge: "
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "cpufreq_index: %d exceeds max table "
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "size\n", cpufreq_index);
>> + ? ? ? ? ? ? ? else
>> + ? ? ? ? ? ? ? ? ? ? ? cpufreq_usage->usage[cpufreq_index] += cputime;
>> +#endif
>> ? ? ? ? ? ? ? ?*cpuusage += cputime;
>> ? ? ? ?}
>>
>> --
>> 1.7.0.1
>>
>>
>

2010-04-06 02:13:39

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

* Mike Chan <[email protected]> [2010-04-05 12:33:24]:

> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>
> cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
> in nano-seconds
>
> We do not know the cpufreq table size at compile time.
> So a new config option CONFIG_CPUACCT_CPUFREQ_TABLE_MAX is intruduced,
> to determine the cpufreq table per-cpu in the cpuacct struct.
>
> Signed-off-by: Mike Chan <[email protected]>
> ---
> init/Kconfig | 5 +++
> kernel/sched.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 104 insertions(+), 0 deletions(-)
>
> diff --git a/init/Kconfig b/init/Kconfig
> index eb77e8c..e1e86df 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -534,6 +534,11 @@ config CGROUP_CPUACCT
> Provides a simple Resource Controller for monitoring the
> total CPU consumed by the tasks in a cgroup.
>
> +config CPUACCT_CPUFREQ_TABLE_MAX
> + int "Max CPUFREQ table size"
> + depends on CGROUP_CPUACCT && CPU_FREQ_TABLE
> + default 32
> +
> config RESOURCE_COUNTERS
> bool "Resource counters"
> help
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 528a105..a0b56b5 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -71,6 +71,7 @@
> #include <linux/debugfs.h>
> #include <linux/ctype.h>
> #include <linux/ftrace.h>
> +#include <linux/cpufreq.h>
>
> #include <asm/tlb.h>
> #include <asm/irq_regs.h>
> @@ -8817,6 +8818,11 @@ struct cgroup_subsys cpu_cgroup_subsys = {
> * ([email protected]).
> */
>
> +#ifdef CONFIG_CPU_FREQ_STAT
> +/* The alloc_percpu macro uses typeof so we must define a type here. */
> +typedef struct { u64 usage[CONFIG_CPUACCT_CPUFREQ_TABLE_MAX]; } cpufreq_usage_t;
> +#endif
> +
> /* track cpu usage of a group of tasks and its child groups */
> struct cpuacct {
> struct cgroup_subsys_state css;
> @@ -8824,6 +8830,10 @@ struct cpuacct {
> u64 __percpu *cpuusage;
> struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
> struct cpuacct *parent;
> +#ifdef CONFIG_CPU_FREQ_STAT
> + /* cpufreq_usage is a pointer to an array of u64-types on every cpu */
> + cpufreq_usage_t *cpufreq_usage;
> +#endif
> };
>
> struct cgroup_subsys cpuacct_subsys;
> @@ -8856,6 +8866,10 @@ static struct cgroup_subsys_state *cpuacct_create(
> if (!ca->cpuusage)
> goto out_free_ca;
>
> +#ifdef CONFIG_CPU_FREQ_STAT
> + ca->cpufreq_usage = alloc_percpu(cpufreq_usage_t);
> +#endif
> +
> for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
> if (percpu_counter_init(&ca->cpustat[i], 0))
> goto out_free_counters;
> @@ -8888,6 +8902,74 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
> kfree(ca);
> }
>
> +#ifdef CONFIG_CPU_FREQ_STAT
> +static int cpufreq_index;
> +static int cpuacct_cpufreq_notify(struct notifier_block *nb, unsigned long val,
> + void *data)
> +{
> + int ret;
> + struct cpufreq_policy *policy;
> + struct cpufreq_freqs *freq = data;
> + struct cpufreq_frequency_table *table;
> +
> + if (val != CPUFREQ_POSTCHANGE)
> + return 0;
> +
> + /* Update cpufreq_index with current speed */
> + table = cpufreq_frequency_get_table(freq->cpu);
> + policy = cpufreq_cpu_get(freq->cpu);
> + ret = cpufreq_frequency_table_target(policy, table,
> + cpufreq_quick_get(freq->cpu),

Since you've already done a cpufreq_cpu_get, can't you directly
dereference policy->cur here?

> + CPUFREQ_RELATION_L, &cpufreq_index);

The code is unclear

> + cpufreq_cpu_put(policy);
> + return ret;
> +}
> +
> +static struct notifier_block cpufreq_notifier = {
> + .notifier_call = cpuacct_cpufreq_notify,
> +};
> +
> +static __init int cpuacct_init(void)
> +{
> + return cpufreq_register_notifier(&cpufreq_notifier,
> + CPUFREQ_TRANSITION_NOTIFIER);
> +}
> +
> +module_init(cpuacct_init);
> +
> +static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft,
> + struct cgroup_map_cb *cb)
> +{
> + int i;
> + unsigned int cpu;
> + char buf[32];
> + struct cpuacct *ca = cgroup_ca(cgrp);
> + struct cpufreq_frequency_table *table =
> + cpufreq_frequency_get_table(smp_processor_id());
> +
> + for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
> + u64 total = 0;
> +
> + if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
> + continue;
> +
> + for_each_present_cpu(cpu) {

Why do we care about all present cpus? A comment would be nice, is
this an ABI thing? Do we care about data of offline cpus, etc.

> + cpufreq_usage_t *cpufreq_usage;
> +
> + cpufreq_usage = per_cpu_ptr(ca->cpufreq_usage, cpu);
> + table = cpufreq_frequency_get_table(cpu);
> +
> + total += cpufreq_usage->usage[i];
> + }
> +
> + snprintf(buf, sizeof(buf), "%d", table[i].frequency);
> + cb->fill(cb, buf, total);
> + }
> +
> + return 0;
> +}
> +#endif
> +
> static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
> {
> u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
> @@ -9003,6 +9085,12 @@ static struct cftype files[] = {
> .name = "stat",
> .read_map = cpuacct_stats_show,
> },
> +#ifdef CONFIG_CPU_FREQ_STAT
> + {
> + .name = "cpufreq",
> + .read_map = cpuacct_cpufreq_show,
> + },
> +#endif
> };
>
> static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> @@ -9031,6 +9119,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
>
> for (; ca; ca = ca->parent) {
> u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
> +#ifdef CONFIG_CPU_FREQ_STAT
> + cpufreq_usage_t *cpufreq_usage =
> + per_cpu_ptr(ca->cpufreq_usage, cpu);
> +
> + if (cpufreq_index > CONFIG_CPUACCT_CPUFREQ_TABLE_MAX)
> + printk_once(KERN_WARNING "cpuacct_charge: "
> + "cpufreq_index: %d exceeds max table "
> + "size\n", cpufreq_index);

WARN_ON_ONCE() would be more readable, IMHO

> + else
> + cpufreq_usage->usage[cpufreq_index] += cputime;
> +#endif
> *cpuusage += cputime;
> }
>
> --
> 1.7.0.1
>

--
Three Cheers,
Balbir

2010-04-06 02:16:12

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

* [email protected] <[email protected]> [2010-04-05 12:52:57]:

> On Mon, Apr 5, 2010 at 12:33 PM, Mike Chan <[email protected]> wrote:
> > New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
> >
> > cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
> > in nano-seconds
>
> Can you clarify the wording of this (and describe it in the relevant
> Documentation/... file)? It's not clear.
>
> From the code, it appears that the file reports a breakdown of how
> much CPU time the cgroup has been consuming at each different CPU
> frequency level. If so, then you probably want to reword the
> description to avoid "per-cpu", since that makes it sounds as though
> it's reporting something, well, "per CPU".
>
> Also, what's the motivation here? If it's for power monitoring
> purposes, might it be simpler to just report a single number, that's
> the integral of the CPU usage by frequency index (i.e. calculated from
> the same information that this patch is already gathering in
> cpuacct_charge()) rather than dumping a whole table on userspace?

As utilization increases, won't the integral quickly overflow? BTW,
Mike have you looked at the scaled accounting infrastructure we have
in taskstats?

--
Three Cheers,
Balbir

2010-04-06 02:40:55

by Mike Chan

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

On Mon, Apr 5, 2010 at 7:15 PM, Balbir Singh <[email protected]> wrote:
> * [email protected] <[email protected]> [2010-04-05 12:52:57]:
>
>> On Mon, Apr 5, 2010 at 12:33 PM, Mike Chan <[email protected]> wrote:
>> > New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>> >
>> > cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
>> > in nano-seconds
>>
>> Can you clarify the wording of this (and describe it in the relevant
>> Documentation/... file)? It's not clear.
>>
>> From the code, it appears that the file reports a breakdown of how
>> much CPU time the cgroup has been consuming at each different CPU
>> frequency level. If so, then you probably want to reword the
>> description to avoid "per-cpu", since that makes it sounds as though
>> it's reporting something, well, "per CPU".
>>
>> Also, what's the motivation here? If it's for power monitoring
>> purposes, might it be simpler to just report a single number, that's
>> the integral of the CPU usage by frequency index (i.e. calculated from
>> the same information that this patch is already gathering in
>> cpuacct_charge()) rather than dumping a whole table on userspace?
>
> As utilization increases, won't the integral quickly overflow? BTW,
> Mike have you looked at the scaled accounting infrastructure we have
> in taskstats?
>

I just looked (thanks for pointed that out, it was new to me, thx!).
The problem here is that the accounting scales time based of cpu
speed. As it currently stands it will not represent power tracking
accurately as the power consumption of running at 1ghz / 2 != 500mhz.
We would have to register a weight factor for each speed and those
would have to be registered from the board files (at least in the ARM
world), since voltage levels can vary across projects.

-- Mike

> --
> ? ? ? ?Three Cheers,
> ? ? ? ?Balbir
>

2010-04-06 04:14:15

by Mike Chan

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

On Mon, Apr 5, 2010 at 7:13 PM, Balbir Singh <[email protected]> wrote:
> * Mike Chan <[email protected]> [2010-04-05 12:33:24]:
>
>> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>>
>> cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
>> in nano-seconds
>>
>> We do not know the cpufreq table size at compile time.
>> So a new config option CONFIG_CPUACCT_CPUFREQ_TABLE_MAX is intruduced,
>> to determine the cpufreq table per-cpu in the cpuacct struct.
>>
>> Signed-off-by: Mike Chan <[email protected]>
>> ---
>> ?init/Kconfig ? | ? ?5 +++
>> ?kernel/sched.c | ? 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> ?2 files changed, 104 insertions(+), 0 deletions(-)
>>
>> diff --git a/init/Kconfig b/init/Kconfig
>> index eb77e8c..e1e86df 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -534,6 +534,11 @@ config CGROUP_CPUACCT
>> ? ? ? ? Provides a simple Resource Controller for monitoring the
>> ? ? ? ? total CPU consumed by the tasks in a cgroup.
>>
>> +config CPUACCT_CPUFREQ_TABLE_MAX
>> + ? ? int "Max CPUFREQ table size"
>> + ? ? depends on CGROUP_CPUACCT && CPU_FREQ_TABLE
>> + ? ? default 32
>> +
>> ?config RESOURCE_COUNTERS
>> ? ? ? bool "Resource counters"
>> ? ? ? help
>> diff --git a/kernel/sched.c b/kernel/sched.c
>> index 528a105..a0b56b5 100644
>> --- a/kernel/sched.c
>> +++ b/kernel/sched.c
>> @@ -71,6 +71,7 @@
>> ?#include <linux/debugfs.h>
>> ?#include <linux/ctype.h>
>> ?#include <linux/ftrace.h>
>> +#include <linux/cpufreq.h>
>>
>> ?#include <asm/tlb.h>
>> ?#include <asm/irq_regs.h>
>> @@ -8817,6 +8818,11 @@ struct cgroup_subsys cpu_cgroup_subsys = {
>> ? * ([email protected]).
>> ? */
>>
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> +/* The alloc_percpu macro uses typeof so we must define a type here. */
>> +typedef struct { u64 usage[CONFIG_CPUACCT_CPUFREQ_TABLE_MAX]; } cpufreq_usage_t;
>> +#endif
>> +
>> ?/* track cpu usage of a group of tasks and its child groups */
>> ?struct cpuacct {
>> ? ? ? struct cgroup_subsys_state css;
>> @@ -8824,6 +8830,10 @@ struct cpuacct {
>> ? ? ? u64 __percpu *cpuusage;
>> ? ? ? struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
>> ? ? ? struct cpuacct *parent;
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? /* cpufreq_usage is a pointer to an array of u64-types on every cpu */
>> + ? ? cpufreq_usage_t *cpufreq_usage;
>> +#endif
>> ?};
>>
>> ?struct cgroup_subsys cpuacct_subsys;
>> @@ -8856,6 +8866,10 @@ static struct cgroup_subsys_state *cpuacct_create(
>> ? ? ? if (!ca->cpuusage)
>> ? ? ? ? ? ? ? goto out_free_ca;
>>
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? ca->cpufreq_usage = alloc_percpu(cpufreq_usage_t);
>> +#endif
>> +
>> ? ? ? for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
>> ? ? ? ? ? ? ? if (percpu_counter_init(&ca->cpustat[i], 0))
>> ? ? ? ? ? ? ? ? ? ? ? goto out_free_counters;
>> @@ -8888,6 +8902,74 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
>> ? ? ? kfree(ca);
>> ?}
>>
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> +static int cpufreq_index;
>> +static int cpuacct_cpufreq_notify(struct notifier_block *nb, unsigned long val,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? void *data)
>> +{
>> + ? ? int ret;
>> + ? ? struct cpufreq_policy *policy;
>> + ? ? struct cpufreq_freqs *freq = data;
>> + ? ? struct cpufreq_frequency_table *table;
>> +
>> + ? ? if (val != CPUFREQ_POSTCHANGE)
>> + ? ? ? ? ? ? return 0;
>> +
>> + ? ? /* Update cpufreq_index with current speed */
>> + ? ? table = cpufreq_frequency_get_table(freq->cpu);
>> + ? ? policy = cpufreq_cpu_get(freq->cpu);
>> + ? ? ret = cpufreq_frequency_table_target(policy, table,
>> + ? ? ? ? ? ? ? ? ? ? cpufreq_quick_get(freq->cpu),
>
> Since you've already done a cpufreq_cpu_get, can't you directly
> dereference policy->cur here?

Yes, we could. I thought I would use the cpufreq public api's from the
header just incase there were any core-internal changes in the future.
Although its probably safe to use policy->cur.

>
>> + ? ? ? ? ? ? ? ? ? ? CPUFREQ_RELATION_L, &cpufreq_index);
>
> The code is unclear
>

I can put a comment here about this. Basically when selecting a
frequency from the freq_table using the cpufreq, incase the exact
frequency you are looking for does not exist in the table you can
specify "highest speed at or below requested speed" or "lowest speed
at or below requested speed", either of these params work here since
the speed we are choosing will always exist in the freq_table.

>> + ? ? cpufreq_cpu_put(policy);
>> + ? ? return ret;
>> +}
>> +
>> +static struct notifier_block cpufreq_notifier = {
>> + ? ? .notifier_call = cpuacct_cpufreq_notify,
>> +};
>> +
>> +static __init int cpuacct_init(void)
>> +{
>> + ? ? return cpufreq_register_notifier(&cpufreq_notifier,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? CPUFREQ_TRANSITION_NOTIFIER);
>> +}
>> +
>> +module_init(cpuacct_init);
>> +
>> +static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft,
>> + ? ? ? ? ? ? struct cgroup_map_cb *cb)
>> +{
>> + ? ? int i;
>> + ? ? unsigned int cpu;
>> + ? ? char buf[32];
>> + ? ? struct cpuacct *ca = cgroup_ca(cgrp);
>> + ? ? struct cpufreq_frequency_table *table =
>> + ? ? ? ? ? ? cpufreq_frequency_get_table(smp_processor_id());
>> +
>> + ? ? for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
>> + ? ? ? ? ? ? u64 total = 0;
>> +
>> + ? ? ? ? ? ? if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
>> + ? ? ? ? ? ? ? ? ? ? continue;
>> +
>> + ? ? ? ? ? ? for_each_present_cpu(cpu) {
>
> Why do we care about all present cpus? A comment would be nice, is
> this an ABI thing? Do we care about data of offline cpus, etc.

If we have a system with cpu hotplug and cpu's come online and offline
for the lifetime of the system, we want to include their statistics
when you dump the file.

If you only care about currently online, you can have a case where
Chrome has been running on CPU1, and you hotplug CPU1, immediately
after you read cpuacct.cpufreq. I can add a comment here.

>
>> + ? ? ? ? ? ? ? ? ? ? cpufreq_usage_t *cpufreq_usage;
>> +
>> + ? ? ? ? ? ? ? ? ? ? cpufreq_usage = per_cpu_ptr(ca->cpufreq_usage, cpu);
>> + ? ? ? ? ? ? ? ? ? ? table = cpufreq_frequency_get_table(cpu);
>> +
>> + ? ? ? ? ? ? ? ? ? ? total += cpufreq_usage->usage[i];
>> + ? ? ? ? ? ? }
>> +
>> + ? ? ? ? ? ? snprintf(buf, sizeof(buf), "%d", table[i].frequency);
>> + ? ? ? ? ? ? cb->fill(cb, buf, total);
>> + ? ? }
>> +
>> + ? ? return 0;
>> +}
>> +#endif
>> +
>> ?static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
>> ?{
>> ? ? ? u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
>> @@ -9003,6 +9085,12 @@ static struct cftype files[] = {
>> ? ? ? ? ? ? ? .name = "stat",
>> ? ? ? ? ? ? ? .read_map = cpuacct_stats_show,
>> ? ? ? },
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? {
>> + ? ? ? ? ? ? .name = "cpufreq",
>> + ? ? ? ? ? ? .read_map = cpuacct_cpufreq_show,
>> + ? ? },
>> +#endif
>> ?};
>>
>> ?static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>> @@ -9031,6 +9119,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
>>
>> ? ? ? for (; ca; ca = ca->parent) {
>> ? ? ? ? ? ? ? u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> + ? ? ? ? ? ? cpufreq_usage_t *cpufreq_usage =
>> + ? ? ? ? ? ? ? ? ? ? per_cpu_ptr(ca->cpufreq_usage, cpu);
>> +
>> + ? ? ? ? ? ? if (cpufreq_index > CONFIG_CPUACCT_CPUFREQ_TABLE_MAX)
>> + ? ? ? ? ? ? ? ? ? ? printk_once(KERN_WARNING "cpuacct_charge: "
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "cpufreq_index: %d exceeds max table "
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "size\n", cpufreq_index);
>
> WARN_ON_ONCE() would be more readable, IMHO

Although I do admit WARN is much easier to spot, WARN gives a
stacktrace, which isn't too useful in this case and wastes dmesg
buffer.

>
>> + ? ? ? ? ? ? else
>> + ? ? ? ? ? ? ? ? ? ? cpufreq_usage->usage[cpufreq_index] += cputime;
>> +#endif
>> ? ? ? ? ? ? ? *cpuusage += cputime;
>> ? ? ? }
>>
>> --
>> 1.7.0.1
>>
>
> --
> ? ? ? ?Three Cheers,
> ? ? ? ?Balbir
>

2010-04-06 17:56:00

by Daniel Walker

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

On Mon, 2010-04-05 at 12:33 -0700, Mike Chan wrote:
> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>
> cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
> in nano-seconds
>
> We do not know the cpufreq table size at compile time.
> So a new config option CONFIG_CPUACCT_CPUFREQ_TABLE_MAX is intruduced,
> to determine the cpufreq table per-cpu in the cpuacct struct.
>
> Signed-off-by: Mike Chan <[email protected]>
> ---
> init/Kconfig | 5 +++
> kernel/sched.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 104 insertions(+), 0 deletions(-)
>
> diff --git a/init/Kconfig b/init/Kconfig
> index eb77e8c..e1e86df 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -534,6 +534,11 @@ config CGROUP_CPUACCT
> Provides a simple Resource Controller for monitoring the
> total CPU consumed by the tasks in a cgroup.
>
> +config CPUACCT_CPUFREQ_TABLE_MAX
> + int "Max CPUFREQ table size"
> + depends on CGROUP_CPUACCT && CPU_FREQ_TABLE
> + default 32
> +

I'd say make it just a regular define unless you can think of a reason
why a non-developer would want to touch this value.

> config RESOURCE_COUNTERS
> bool "Resource counters"
> help
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 528a105..a0b56b5 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -71,6 +71,7 @@
> #include <linux/debugfs.h>
> #include <linux/ctype.h>
> #include <linux/ftrace.h>
> +#include <linux/cpufreq.h>
>
> #include <asm/tlb.h>
> #include <asm/irq_regs.h>
> @@ -8817,6 +8818,11 @@ struct cgroup_subsys cpu_cgroup_subsys = {
> * ([email protected]).
> */
>
> +#ifdef CONFIG_CPU_FREQ_STAT
> +/* The alloc_percpu macro uses typeof so we must define a type here. */
> +typedef struct { u64 usage[CONFIG_CPUACCT_CPUFREQ_TABLE_MAX]; } cpufreq_usage_t;
> +#endif

you should be able to send it a regular struct . I found this usage,

lport->dev_stats = alloc_percpu(struct fcoe_dev_stats);

as an example.

Daniel

2010-04-06 19:48:48

by Mike Chan

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

On Tue, Apr 6, 2010 at 10:48 AM, Daniel Walker <[email protected]> wrote:
> On Mon, 2010-04-05 at 12:33 -0700, Mike Chan wrote:
>> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
>>
>> cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
>> in nano-seconds
>>
>> We do not know the cpufreq table size at compile time.
>> So a new config option CONFIG_CPUACCT_CPUFREQ_TABLE_MAX is intruduced,
>> to determine the cpufreq table per-cpu in the cpuacct struct.
>>
>> Signed-off-by: Mike Chan <[email protected]>
>> ---
>> ?init/Kconfig ? | ? ?5 +++
>> ?kernel/sched.c | ? 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> ?2 files changed, 104 insertions(+), 0 deletions(-)
>>
>> diff --git a/init/Kconfig b/init/Kconfig
>> index eb77e8c..e1e86df 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -534,6 +534,11 @@ config CGROUP_CPUACCT
>> ? ? ? ? Provides a simple Resource Controller for monitoring the
>> ? ? ? ? total CPU consumed by the tasks in a cgroup.
>>
>> +config CPUACCT_CPUFREQ_TABLE_MAX
>> + ? ? int "Max CPUFREQ table size"
>> + ? ? depends on CGROUP_CPUACCT && CPU_FREQ_TABLE
>> + ? ? default 32
>> +
>
> I'd say make it just a regular define unless you can think of a reason
> why a non-developer would want to touch this value.

I originally thought about doing this, but my concern here is for
future (or existing) cpu's that have more than 32 speed stepping. This
will have to be updated as new cpu's are supported in mainline (if
they exceed the max). Which might be acceptable or not.

Ideally it would be nice to be able to pull the table size from the
board-file, or determine the size at run-time instead of compile time.

-- Mike

>
>> ?config RESOURCE_COUNTERS
>> ? ? ? bool "Resource counters"
>> ? ? ? help
>> diff --git a/kernel/sched.c b/kernel/sched.c
>> index 528a105..a0b56b5 100644
>> --- a/kernel/sched.c
>> +++ b/kernel/sched.c
>> @@ -71,6 +71,7 @@
>> ?#include <linux/debugfs.h>
>> ?#include <linux/ctype.h>
>> ?#include <linux/ftrace.h>
>> +#include <linux/cpufreq.h>
>>
>> ?#include <asm/tlb.h>
>> ?#include <asm/irq_regs.h>
>> @@ -8817,6 +8818,11 @@ struct cgroup_subsys cpu_cgroup_subsys = {
>> ? * ([email protected]).
>> ? */
>>
>> +#ifdef CONFIG_CPU_FREQ_STAT
>> +/* The alloc_percpu macro uses typeof so we must define a type here. */
>> +typedef struct { u64 usage[CONFIG_CPUACCT_CPUFREQ_TABLE_MAX]; } cpufreq_usage_t;
>> +#endif
>
> you should be able to send it a regular struct . I found this usage,
>
> lport->dev_stats = alloc_percpu(struct fcoe_dev_stats);
>
> as an example.
>

True, I shouldn't need the typedef. Actually I think I can get away
with getting rid of this statement all together and just define an
anonymous struct in the alloc_percpu function.

ca->cpufreq_usage = alloc_percpu(struct { u64
usage[CONFIG_CPUACCT_CPUFREQ_TABLE_MAX]; });

> Daniel
>
>

2010-04-06 19:58:36

by Daniel Walker

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: cpuacct: Track cpuusage per cpu frequency

On Tue, 2010-04-06 at 12:43 -0700, Mike Chan wrote:
> On Tue, Apr 6, 2010 at 10:48 AM, Daniel Walker <[email protected]> wrote:
> > On Mon, 2010-04-05 at 12:33 -0700, Mike Chan wrote:
> >> New file: cpuacct.cpufreq when CONFIG_CPU_FREQ_STATS is enabled.
> >>
> >> cpuacct.cpufreq accounts for cpu time per-cpu frequency, time is exported
> >> in nano-seconds
> >>
> >> We do not know the cpufreq table size at compile time.
> >> So a new config option CONFIG_CPUACCT_CPUFREQ_TABLE_MAX is intruduced,
> >> to determine the cpufreq table per-cpu in the cpuacct struct.
> >>
> >> Signed-off-by: Mike Chan <[email protected]>
> >> ---
> >> init/Kconfig | 5 +++
> >> kernel/sched.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >> 2 files changed, 104 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/init/Kconfig b/init/Kconfig
> >> index eb77e8c..e1e86df 100644
> >> --- a/init/Kconfig
> >> +++ b/init/Kconfig
> >> @@ -534,6 +534,11 @@ config CGROUP_CPUACCT
> >> Provides a simple Resource Controller for monitoring the
> >> total CPU consumed by the tasks in a cgroup.
> >>
> >> +config CPUACCT_CPUFREQ_TABLE_MAX
> >> + int "Max CPUFREQ table size"
> >> + depends on CGROUP_CPUACCT && CPU_FREQ_TABLE
> >> + default 32
> >> +
> >
> > I'd say make it just a regular define unless you can think of a reason
> > why a non-developer would want to touch this value.
>
> I originally thought about doing this, but my concern here is for
> future (or existing) cpu's that have more than 32 speed stepping. This
> will have to be updated as new cpu's are supported in mainline (if
> they exceed the max). Which might be acceptable or not.
>
> Ideally it would be nice to be able to pull the table size from the
> board-file, or determine the size at run-time instead of compile time.

You should try to discover any current cpu's that have more than 32, if
they exist. (32 seems like a lot)

Then it should be ok to just allow others to patch up the value as
needed, when new cpu's get added with more. There's nothing wrong with
that practice AFAIK ..

Daniel