2015-05-14 23:56:11

by Ruchi Kandoi

[permalink] [raw]
Subject: [PATCH 0/2] Adds cpu power accounting per-pid basis.

These patches add a mechanism which will accurately caculate the CPU power
used by all the processes in the system. In order to account for the power
used by all the processes a data field "cpu_power" has been added in the
task_struct. This field adds power for both the system as well as user
time. cpu_power contains the total amount of charge(in uAmsec units) used
by the process. This model takes into account the frequency at which the
process was running(i.e higher power for processes running at higher
frequencies). It requires the cpufreq_stats module to be initialized with
the current numbers for each of the CPU core at each frequency. This will
be initialized during init time.

Ruchi Kandoi (2):
cpufreq_stats: Adds sysfs file
/sys/devices/system/cpu/cpufreq/current_in_state
sched: cpufreq: Adds a field cpu_power in the task_struct

drivers/cpufreq/cpufreq_stats.c | 186 +++++++++++++++++++++++++++++++++++++++-
include/linux/cpufreq.h | 8 ++
include/linux/sched.h | 2 +
kernel/fork.c | 1 +
kernel/sched/cputime.c | 7 ++
5 files changed, 202 insertions(+), 2 deletions(-)

--
2.2.0.rc0.207.ga3a616c


2015-05-14 23:56:17

by Ruchi Kandoi

[permalink] [raw]
Subject: [PATCH 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state

Adds the sysfs file for userspace to initialize the active current
values for all the cores at each of the frequencies.

The format for storing the values is as follows:
echo "CPU<cpu#>:<freq1>=<current in uA> <freq2>=<current>,CPU<cpu#>:
..." > /sys/devices/system/cpu/cpufreq/current_in_state

Signed-off-by: Ruchi Kandoi <[email protected]>
---
drivers/cpufreq/cpufreq_stats.c | 163 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 161 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 5e370a3..6f0b562 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -30,6 +30,14 @@ struct cpufreq_stats {
#endif
};

+struct cpufreq_power_stats {
+ unsigned int state_num;
+ unsigned int *curr;
+ unsigned int *freq_table;
+};
+
+static DEFINE_PER_CPU(struct cpufreq_power_stats *, cpufreq_power_stats);
+
static int cpufreq_stats_update(struct cpufreq_stats *stats)
{
unsigned long long cur_time = get_jiffies_64();
@@ -61,6 +69,87 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
return len;
}

+static void store_current_value(struct cpufreq_power_stats *powerstats,
+ int freq, int curr)
+{
+ int i;
+
+ /* freq_table doesn't contain any CPU_FREQ_INVALID */
+ for (i = 0; i < powerstats->state_num; i++) {
+ if (powerstats->freq_table[i] == freq) {
+ powerstats->curr[i] = curr;
+ break;
+ }
+ }
+}
+
+static ssize_t store_current_in_state(struct cpufreq_policy *policy,
+ const char *buf, size_t len)
+{
+ char *cp, *cp2, *start, *buffer;
+ unsigned int cpu_num, ret, curr, freq;
+ struct cpufreq_power_stats *powerstats;
+
+ if (!buf || len < 0)
+ return len;
+
+ buffer = kzalloc(len + 1, GFP_KERNEL);
+ if (!buffer)
+ return len;
+
+ strncpy(buffer, buf, len);
+ buffer[len] = '\0';
+ cp = buffer;
+ spin_lock(&cpufreq_stats_lock);
+ while ((start = strsep(&cp, ","))) {
+ ret = sscanf(start, "CPU%u:", &cpu_num);
+ if (ret != 1 || cpu_num > (num_possible_cpus() - 1)) {
+ ret = -EINVAL;
+ goto error;
+ }
+ powerstats = per_cpu(cpufreq_power_stats, cpu_num);
+ if (!powerstats)
+ continue;
+
+ /* sscanf makes sure that strchr doesn't return a NULL */
+ cp2 = strchr(start, ':') + 1;
+ while ((start = strsep(&cp2, " "))) {
+ if (sscanf(start, "%u=%u", &freq, &curr) != 2) {
+ ret = -EINVAL;
+ goto error;
+ }
+ store_current_value(powerstats, freq, curr);
+ }
+ }
+ ret = len;
+error:
+ spin_unlock(&cpufreq_stats_lock);
+ kfree(buffer);
+ return ret;
+}
+
+static ssize_t show_current_in_state(struct cpufreq_policy *policy, char *buf)
+{
+ ssize_t len = 0;
+ unsigned int i, cpu;
+ struct cpufreq_power_stats *powerstats;
+
+ spin_lock(&cpufreq_stats_lock);
+ for_each_possible_cpu(cpu) {
+ powerstats = per_cpu(cpufreq_power_stats, cpu);
+ if (!powerstats)
+ continue;
+ len += scnprintf(buf + len, PAGE_SIZE - len, "CPU%d:", cpu);
+ for (i = 0; i < powerstats->state_num; i++)
+ len += scnprintf(buf + len, PAGE_SIZE - len,
+ "%d=%d ", powerstats->freq_table[i],
+ powerstats->curr[i]);
+ len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+ }
+ spin_unlock(&cpufreq_stats_lock);
+ return len;
+}
+
#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
{
@@ -107,6 +196,7 @@ cpufreq_freq_attr_ro(trans_table);

cpufreq_freq_attr_ro(total_trans);
cpufreq_freq_attr_ro(time_in_state);
+cpufreq_freq_attr_rw(current_in_state);

static struct attribute *default_attrs[] = {
&total_trans.attr,
@@ -159,6 +249,67 @@ static void cpufreq_stats_free_table(unsigned int cpu)
cpufreq_cpu_put(policy);
}

+static void cpufreq_powerstats_free(void)
+{
+ int cpu;
+ struct cpufreq_power_stats *powerstats;
+
+ sysfs_remove_file(cpufreq_global_kobject, &current_in_state.attr);
+
+ for_each_possible_cpu(cpu) {
+ powerstats = per_cpu(cpufreq_power_stats, cpu);
+ if (!powerstats)
+ continue;
+ kfree(powerstats->curr);
+ kfree(powerstats);
+ per_cpu(cpufreq_power_stats, cpu) = NULL;
+ }
+}
+
+static void cpufreq_powerstats_create(unsigned int cpu)
+{
+ unsigned int alloc_size, i = 0, j = 0, count = 0;
+ struct cpufreq_power_stats *powerstats;
+ struct cpufreq_frequency_table *pos, *table;
+
+ /* We need cpufreq table for creating power stats table */
+ table = cpufreq_frequency_get_table(cpu);
+ if (unlikely(!table))
+ return;
+
+ powerstats = kzalloc(sizeof(struct cpufreq_power_stats),
+ GFP_KERNEL);
+ if (!powerstats)
+ return;
+
+ /* Find total allocation size */
+ cpufreq_for_each_valid_entry(pos, table)
+ count++;
+
+ /* Allocate memory for freq table per cpu as well as clockticks per
+ * freq*/
+ alloc_size = count * sizeof(unsigned int) +
+ count * sizeof(unsigned int);
+ powerstats->curr = kzalloc(alloc_size, GFP_KERNEL);
+ if (!powerstats->curr) {
+ kfree(powerstats);
+ return;
+ }
+ powerstats->freq_table = powerstats->curr + count;
+
+ spin_lock(&cpufreq_stats_lock);
+ for (i = 0; table[i].frequency != CPUFREQ_TABLE_END && j < count; i++) {
+ unsigned int freq = table[i].frequency;
+
+ if (freq == CPUFREQ_ENTRY_INVALID)
+ continue;
+ powerstats->freq_table[j++] = freq;
+ }
+ powerstats->state_num = j;
+ per_cpu(cpufreq_power_stats, cpu) = powerstats;
+ spin_unlock(&cpufreq_stats_lock);
+}
+
static int __cpufreq_stats_create_table(struct cpufreq_policy *policy)
{
unsigned int i = 0, count = 0, ret = -ENOMEM;
@@ -249,9 +400,11 @@ static int cpufreq_stat_notifier_policy(struct notifier_block *nb,
int ret = 0;
struct cpufreq_policy *policy = data;

- if (val == CPUFREQ_CREATE_POLICY)
+ if (val == CPUFREQ_CREATE_POLICY) {
ret = __cpufreq_stats_create_table(policy);
- else if (val == CPUFREQ_REMOVE_POLICY)
+ if (!per_cpu(cpufreq_power_stats, policy->cpu))
+ cpufreq_powerstats_create(policy->cpu);
+ } else if (val == CPUFREQ_REMOVE_POLICY)
__cpufreq_stats_free_table(policy);

return ret;
@@ -335,8 +488,13 @@ static int __init cpufreq_stats_init(void)
return ret;
}

+ ret = sysfs_create_file(cpufreq_global_kobject, &current_in_state.attr);
+ if (ret)
+ pr_warn("Cannot create sysfs file for cpufreq current stats\n");
+
return 0;
}
+
static void __exit cpufreq_stats_exit(void)
{
unsigned int cpu;
@@ -347,6 +505,7 @@ static void __exit cpufreq_stats_exit(void)
CPUFREQ_TRANSITION_NOTIFIER);
for_each_online_cpu(cpu)
cpufreq_stats_free_table(cpu);
+ cpufreq_powerstats_free();
}

MODULE_AUTHOR("Zou Nan hai <[email protected]>");
--
2.2.0.rc0.207.ga3a616c

2015-05-14 23:56:35

by Ruchi Kandoi

[permalink] [raw]
Subject: [PATCH 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct

cpu_power has been added to keep track of amount of power each task is
consuming. cpu_power is updated whenever stime and utime are updated for
a task. power is computed by taking into account the frequency at which
the current core was running and the current for cpu actively
running at hat frequency.

Signed-off-by: Ruchi Kandoi <[email protected]>
---
drivers/cpufreq/cpufreq_stats.c | 23 +++++++++++++++++++++++
include/linux/cpufreq.h | 8 ++++++++
include/linux/sched.h | 2 ++
kernel/fork.c | 1 +
kernel/sched/cputime.c | 7 +++++++
5 files changed, 41 insertions(+)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 6f0b562..4a0bd9a 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cputime.h>
+#include <linux/sched.h>

static spinlock_t cpufreq_stats_lock;

@@ -83,6 +84,28 @@ static void store_current_value(struct cpufreq_power_stats *powerstats,
}
}

+void acct_update_power(struct task_struct *task, cputime_t cputime)
+{
+ struct cpufreq_power_stats *powerstats;
+ struct cpufreq_stats *stats;
+ struct cpufreq_policy *policy;
+ unsigned int cpu_num, curr;
+
+ if (!task)
+ return;
+ cpu_num = task_cpu(task);
+ powerstats = per_cpu(cpufreq_power_stats, cpu_num);
+ policy = cpufreq_cpu_get(cpu_num);
+ if (!powerstats || !policy || !(policy->stats))
+ return;
+
+ stats = policy->stats;
+ curr = powerstats->curr[stats->last_index];
+ task->cpu_power += curr * cputime_to_usecs(cputime);
+ cpufreq_cpu_put(cpu_num);
+}
+EXPORT_SYMBOL_GPL(acct_update_power);
+
static ssize_t store_current_in_state(struct cpufreq_policy *policy,
const char *buf, size_t len)
{
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 2ee4888..86826c8 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/sysfs.h>
+#include <asm/cputime.h>

/*********************************************************************
* CPUFREQ INTERFACE *
@@ -601,4 +602,11 @@ unsigned int cpufreq_generic_get(unsigned int cpu);
int cpufreq_generic_init(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table,
unsigned int transition_latency);
+
+/*********************************************************************
+ * CPUFREQ STATS *
+ *********************************************************************/
+
+void acct_update_power(struct task_struct *p, cputime_t cputime);
+
#endif /* _LINUX_CPUFREQ_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e61..1f2400a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1429,6 +1429,7 @@ struct task_struct {
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

cputime_t utime, stime, utimescaled, stimescaled;
+ unsigned long long cpu_power;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
struct cputime prev_cputime;
@@ -1441,6 +1442,7 @@ struct task_struct {
VTIME_USER,
VTIME_SYS,
} vtime_snap_whence;
+
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
u64 start_time; /* monotonic time in nsec */
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaa..2ca0e9e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1341,6 +1341,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,

p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
+ p->cpu_power = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1e..53a79d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,6 +4,7 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
+#include <linux/cpufreq.h>
#include "sched.h"


@@ -149,6 +150,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime,

/* Account for user time used */
acct_account_cputime(p);
+
+ /* Account power usage for user time */
+ acct_update_power(p, cputime);
}

/*
@@ -199,6 +203,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,

/* Account for system time used */
acct_account_cputime(p);
+
+ /* Account power usage for system time */
+ acct_update_power(p, cputime);
}

/*
--
2.2.0.rc0.207.ga3a616c

2015-05-15 16:07:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct

On Thu, May 14, 2015 at 04:55:48PM -0700, Ruchi Kandoi wrote:
> cpu_power has been added to keep track of amount of power each task is
> consuming. cpu_power is updated whenever stime and utime are updated for
> a task. power is computed by taking into account the frequency at which
> the current core was running and the current for cpu actively
> running at hat frequency.
>

Both you patches completely lack any reason for me to even start
considering this.

_WHY_ and _what_ are you doing?

2015-05-18 21:18:52

by Ruchi Kandoi

[permalink] [raw]
Subject: Re: [PATCH 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct

On Fri, May 15, 2015 at 9:07 AM, Peter Zijlstra <[email protected]> wrote:
>
> On Thu, May 14, 2015 at 04:55:48PM -0700, Ruchi Kandoi wrote:
> > cpu_power has been added to keep track of amount of power each task is
> > consuming. cpu_power is updated whenever stime and utime are updated for
> > a task. power is computed by taking into account the frequency at which
> > the current core was running and the current for cpu actively
> > running at hat frequency.
> >
>
> Both you patches completely lack any reason for me to even start
> considering this.
>
> _WHY_ and _what_ are you doing?

We need a mechanism in which we can get information about how much cpu
power each of the process(which is then aggregated fro each
uid/application) is consuming. In the current architecture, it is
based on the amount of the time the process ran. This brings in
inaccuracy because running x seconds at low frequency will have
different power consumption as compared to running at the higher
frequency. With these changes we have the information about the power
which is not only dependent on the time it was running but also takes
into account the frequency it was running at as well as the CPU # it
was running at. Because the cost of running at different CPUs at the
same frequency is different. This gives a better overview of the
current power state of the system wrt cpu power. Please let me know if
more information is required for the same.

Thanks,
Ruchi Kandoi