2013-07-30 08:17:57

by Konstantin Krivyakin

[permalink] [raw]
Subject: [PATCH RFC 0/3] Per-process power consumption measurement facility

This patchset adds per-process power consumption measurement facility.
Power consumption is very important on mobile platforms. This code
allows to measure consumed power in Watts*Hours. The consumed power
for process is updated on scheduler switch and depends on current
CPU voltage and frequency.

The formula for computation is: P = C * V^2 * f, where C is a constant
that reflects capacity of the system, V is the current voltage and
f is the current frequency.
(Taken from: http://en.wikipedia.org/wiki/CPU_power_dissipation).

In this patchset was added implementation for Exynos platform
to demonstrate how it works.

To minimize scheduler impact for each CPU P-state the value of (V^2 *f)
was precomputed at the time of platform initialization.

And to reduce performance impact furthermore, the C constant is multiplied
in userspace.

Konstantin Krivyakin (3):
Add interface to receive current cpu power
Add power consumption counter in task_struct.
Update current cpu power when cpu freq change for exynos.

drivers/cpufreq/cpufreq.c | 17 +++++++++++++++++
drivers/cpufreq/exynos-cpufreq.c | 2 ++
drivers/cpufreq/exynos-cpufreq.h | 1 +
drivers/cpufreq/exynos4x12-cpufreq.c | 19 ++++++++++++++++++-
include/linux/cpufreq.h | 6 ++++++
include/linux/sched.h | 2 ++
include/uapi/linux/taskstats.h | 2 ++
kernel/fork.c | 1 +
kernel/sched/core.c | 8 ++++++++
kernel/sched/cputime.c | 11 +++++++++++
kernel/tsacct.c | 3 +++
11 files changed, 71 insertions(+), 1 deletion(-)

--
1.7.9.5


2013-07-30 08:18:00

by Konstantin Krivyakin

[permalink] [raw]
Subject: [PATCH RFC 2/3] Add power consumption counter in task_struct.

Signed-off-by: Konstantin Krivyakin <[email protected]>
---
include/linux/sched.h | 2 ++
include/uapi/linux/taskstats.h | 2 ++
kernel/fork.c | 1 +
kernel/sched/core.c | 8 ++++++++
kernel/sched/cputime.c | 11 +++++++++++
kernel/tsacct.c | 3 +++
6 files changed, 27 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cdd5407..f074718 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1157,6 +1157,8 @@ struct task_struct {
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

cputime_t utime, stime, utimescaled, stimescaled;
+ u64 utime_power_cons;
+ u64 stime_power_cons;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
struct cputime prev_cputime;
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 2466e55..02ac708 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -116,6 +116,8 @@ struct taskstats {
/* Elapsed time [usec] */
__u64 ac_utime; /* User CPU time [usec] */
__u64 ac_stime; /* SYstem CPU time [usec] */
+ __u64 ac_utime_power_cons; /* User CPU time power consumption */
+ __u64 ac_stime_power_cons; /* System CPU time power consumption */
__u64 ac_minflt; /* Minor Page Fault Count */
__u64 ac_majflt; /* Major Page Fault Count */
/* Basic Accounting Fields end */
diff --git a/kernel/fork.c b/kernel/fork.c
index 6e6a1c1..a021d5b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1240,6 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,

p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
+ p->utime_power_cons = p->stime_power_cons = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9b1f2e5..cac73d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
+#include <linux/cpufreq.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -297,6 +298,13 @@ __read_mostly int scheduler_running;
int sysctl_sched_rt_runtime = 950000;


+static u64 cpu_power_cons(cputime_t cputime)
+{
+ struct thread_info *ti = current_thread_info();
+
+ return cpu_power_get(ti->cpu) * cputime;
+}
+

/*
* __task_rq_lock - lock the rq @p resides on.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e0..512727d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,6 +4,7 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
+#include <linux/cpufreq.h>
#include "sched.h"


@@ -126,6 +127,13 @@ static inline void task_group_account_field(struct task_struct *p, int index,
cpuacct_account_field(p, index, tmp);
}

+static u64 cpu_power_cons(cputime_t cputime)
+{
+ struct thread_info *ti = current_thread_info();
+
+ return cpu_power_get(ti->cpu) * cputime;
+}
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
@@ -138,6 +146,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
int index;

/* Add user time to process. */
+ p->utime_power_cons += cpu_power_cons(cputime);
p->utime += cputime;
p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
@@ -163,6 +172,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
u64 *cpustat = kcpustat_this_cpu->cpustat;

/* Add guest time to process. */
+ p->utime_power_cons += cpu_power_cons(cputime);
p->utime += cputime;
p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
@@ -190,6 +200,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled, int index)
{
/* Add system time to process. */
+ p->stime_power_cons += cpu_power_cons(cputime);
p->stime += cputime;
p->stimescaled += cputime_scaled;
account_group_system_time(p, cputime);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index a1dd9a1..cea4a9c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -75,6 +75,9 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_utimescaled = cputime_to_usecs(utimescaled);
stats->ac_stimescaled = cputime_to_usecs(stimescaled);

+ stats->ac_utime_power_cons = tsk->utime_power_cons;
+ stats->ac_stime_power_cons = tsk->stime_power_cons;
+
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;

--
1.7.9.5

2013-07-30 08:19:13

by Konstantin Krivyakin

[permalink] [raw]
Subject: [PATCH RFC 3/3] Update current cpu power when cpu freq change for exynos.

Signed-off-by: Konstantin Krivyakin <[email protected]>
---
drivers/cpufreq/exynos-cpufreq.c | 2 ++
drivers/cpufreq/exynos-cpufreq.h | 1 +
drivers/cpufreq/exynos4x12-cpufreq.c | 19 ++++++++++++++++++-
3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/exynos-cpufreq.c b/drivers/cpufreq/exynos-cpufreq.c
index 0d32f02..02f17bc 100644
--- a/drivers/cpufreq/exynos-cpufreq.c
+++ b/drivers/cpufreq/exynos-cpufreq.c
@@ -178,6 +178,8 @@ static int exynos_target(struct cpufreq_policy *policy,
}

new_freq = freq_table[index].frequency;
+ if (exynos_info->power_table)
+ policy->current_power = exynos_info->power_table[index];

ret = exynos_cpufreq_scale(new_freq);

diff --git a/drivers/cpufreq/exynos-cpufreq.h b/drivers/cpufreq/exynos-cpufreq.h
index 92b852e..64f964f 100644
--- a/drivers/cpufreq/exynos-cpufreq.h
+++ b/drivers/cpufreq/exynos-cpufreq.h
@@ -38,6 +38,7 @@ struct exynos_dvfs_info {
unsigned int pll_safe_idx;
struct clk *cpu_clk;
unsigned int *volt_table;
+ u64 *power_table;
struct cpufreq_frequency_table *freq_table;
void (*set_freq)(unsigned int, unsigned int);
bool (*need_apll_change)(unsigned int, unsigned int);
diff --git a/drivers/cpufreq/exynos4x12-cpufreq.c b/drivers/cpufreq/exynos4x12-cpufreq.c
index 08b7477..8905f9b 100644
--- a/drivers/cpufreq/exynos4x12-cpufreq.c
+++ b/drivers/cpufreq/exynos4x12-cpufreq.c
@@ -219,6 +219,7 @@ static void exynos4x12_set_frequency(unsigned int old_index,
int exynos4x12_cpufreq_init(struct exynos_dvfs_info *info)
{
unsigned long rate;
+ int freq_count;

cpu_clk = clk_get(NULL, "armclk");
if (IS_ERR(cpu_clk))
@@ -252,8 +253,24 @@ int exynos4x12_cpufreq_init(struct exynos_dvfs_info *info)
info->set_freq = exynos4x12_set_frequency;
info->need_apll_change = exynos4x12_pms_change;

- return 0;
+ freq_count = sizeof(exynos4x12_freq_table) /
+ sizeof(struct cpufreq_frequency_table);
+ info->power_table = kzalloc(sizeof(u64) * freq_count, GFP_KERNEL);
+ if (!info->power_table)
+ goto err_power_table;
+
+ for (i = 0; i <= freq_count; ++i) {
+ u64 freq = info->freq_table[i].frequency;
+ u64 volt = info->volt_table[i];

+ do_div(freq, 1000);
+ do_div(volt, 1000);
+ info->power_table[i] = freq * volt * volt;
+ }
+
+ return 0;
+err_power_table:
+ clk_put(mout_appl);
err_mout_apll:
clk_put(mout_mpll);
err_mout_mpll:
--
1.7.9.5

2013-07-30 08:20:13

by Konstantin Krivyakin

[permalink] [raw]
Subject: [PATCH RFC 1/3] Add interface to receive current cpu power

Signed-off-by: Konstantin Krivyakin <[email protected]>
---
drivers/cpufreq/cpufreq.c | 17 +++++++++++++++++
include/linux/cpufreq.h | 6 ++++++
2 files changed, 23 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6a015ad..4180e89 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1538,6 +1538,23 @@ int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list)
}
EXPORT_SYMBOL(cpufreq_unregister_notifier);

+/**
+ * cpu_power_get - get current CPU power
+ * @cpu: CPU number
+ */
+u64 cpu_power_get(int cpu)
+{
+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+ u64 ret_power = 0;
+
+ if (policy) {
+ ret_power = policy->current_power;
+ cpufreq_cpu_put(policy);
+ }
+
+ return ret_power;
+}
+EXPORT_SYMBOL(cpu_power_get);

/*********************************************************************
* GOVERNORS *
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 4d7390b..67323af 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -107,6 +107,7 @@ struct cpufreq_policy {
unsigned int max; /* in kHz */
unsigned int cur; /* in kHz, only needed if cpufreq
* governors are used */
+ u64 current_power;
unsigned int policy; /* see above */
struct cpufreq_governor *governor; /* see below */
void *governor_data;
@@ -365,6 +366,7 @@ static inline unsigned int cpufreq_get(unsigned int cpu)
#ifdef CONFIG_CPU_FREQ
unsigned int cpufreq_quick_get(unsigned int cpu);
unsigned int cpufreq_quick_get_max(unsigned int cpu);
+u64 cpu_power_get(int cpu);
#else
static inline unsigned int cpufreq_quick_get(unsigned int cpu)
{
@@ -374,6 +376,10 @@ static inline unsigned int cpufreq_quick_get_max(unsigned int cpu)
{
return 0;
}
+static inline u64 cpu_power_get(unsigned int cpu)
+{
+ return 0;
+}
#endif

/*********************************************************************
--
1.7.9.5

2013-07-30 08:53:09

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH RFC 0/3] Per-process power consumption measurement facility

On Tue, Jul 30, 2013 at 12:17:36PM +0400, Konstantin Krivyakin wrote:
> This patchset adds per-process power consumption measurement facility.
> Power consumption is very important on mobile platforms. This code
> allows to measure consumed power in Watts*Hours. The consumed power
> for process is updated on scheduler switch and depends on current
> CPU voltage and frequency.
>
> The formula for computation is: P = C * V^2 * f, where C is a constant
> that reflects capacity of the system, V is the current voltage and
> f is the current frequency.
> (Taken from: http://en.wikipedia.org/wiki/CPU_power_dissipation).
>
> In this patchset was added implementation for Exynos platform
> to demonstrate how it works.
>
> To minimize scheduler impact for each CPU P-state the value of (V^2 *f)
> was precomputed at the time of platform initialization.

It seems to me the 3 multiplies that takes could be done when cpufreq
actually changes the P-state.

> And to reduce performance impact furthermore, the C constant is multiplied
> in userspace.

That seems particularly silly; how is userspace to know C and why
isn't it a much better idea to do this in the code generating the number
for userspace to consume.

Also, I intensely dislike this thing because:

- it adds more user interface
- it adds more accounting muck
- it completely lacks any useful changelogs
- it completely fails to even begin addressing the issues we already
have with cpufreq

There's been a lot of talk about power aware scheduling in the recent
past, there's also been a lot of problems listed we must overcome/solve.
This patch set completely fails to tie into any of that.

You also completely fail to explain the user case and thus related why
you can't use any of the other facilities like perf or ftrace to measure
this.

2013-07-30 12:10:43

by Igor Zhbanov

[permalink] [raw]
Subject: Re: [PATCH RFC 0/3] Per-process power consumption measurement facility

Dear Peter,

Peter Zijlstra wrote:

> On Tue, Jul 30, 2013 at 12:17:36PM +0400, Konstantin Krivyakin wrote:
>> This patchset adds per-process power consumption measurement facility.
>> Power consumption is very important on mobile platforms. This code
>> allows to measure consumed power in Watts*Hours. The consumed power
>> for process is updated on scheduler switch and depends on current
>> CPU voltage and frequency.
>>
>> The formula for computation is: P = C * V^2 * f, where C is a constant
>> that reflects capacity of the system, V is the current voltage and
>> f is the current frequency.
>> (Taken from: http://en.wikipedia.org/wiki/CPU_power_dissipation).
>>
>> In this patchset was added implementation for Exynos platform
>> to demonstrate how it works.
>>
>> To minimize scheduler impact for each CPU P-state the value of (V^2 *f)
>> was precomputed at the time of platform initialization.
> It seems to me the 3 multiplies that takes could be done when cpufreq
> actually changes the P-state.
>> And to reduce performance impact furthermore, the C constant is multiplied
>> in userspace.
> That seems particularly silly; how is userspace to know C and why
> isn't it a much better idea to do this in the code generating the number
> for userspace to consume.
>
> Also, I intensely dislike this thing because:
>
> - it adds more user interface
> - it adds more accounting muck
> - it completely lacks any useful changelogs
> - it completely fails to even begin addressing the issues we already
> have with cpufreq
>
> There's been a lot of talk about power aware scheduling in the recent
> past, there's also been a lot of problems listed we must overcome/solve.
> This patch set completely fails to tie into any of that.
>
> You also completely fail to explain the user case and thus related why
> you can't use any of the other facilities like perf or ftrace to measure
> this.

Our goal is to create a new developer facility for measuring the power consumption
of different components of a mobile system, such as the CPU, GPU, modem, storage, etc.

This instrument should make it possible to correlate consumed power with application activity.
It should show which application or system component consumes a lot of power and why.
Developers could then optimize their applications to reduce power consumption.

Power consumption optimization is critical for mobile platforms, and this
instrument should help developers reduce the power consumed by their applications.

This patch is the first piece of a power consumption framework. It is not related to
power-aware scheduling. We want the power consumed by the CPU, GPU and other
devices to be accounted to the user processes that performed the corresponding activity.
As far as I know, existing tools either cannot provide the needed information or do so
inefficiently.

For example, it is possible to track scheduler switches and CPU frequency changes
via trace events. But that could be a large number of events to handle (consider
CONFIG_HZ=1000 and 4 cores) just to determine that during a given timeslice a particular
application consumed a certain amount of power. The PowerTOP utility works that way.
But we want to do it in a more efficient way. That's why we decided to modify the
process accounting code.

It seems reasonable to us to calculate consumed power right where the power state
is changed -- in the kernel. For each power domain the kernel knows the exact time when
the state was changed, how long the state lasted, and all characteristics of the state,
i.e. frequency and voltage.

In our "final" solution we envision power consumption counters for each power
domain (or device), organized in a tree-like structure in which state updates
propagate from the leaves to the roots. So for each operation (network IO, block IO, CPU and
GPU usage) the corresponding user task will be charged for the real energy that
was spent by the system.

If you can suggest a useful technique or an existing approach, it would be very
helpful for implementing our task correctly and enhancing current kernel functionality.

Thank you.

P.S. Since I'm not subscribed to LKML, please CC in reply.

--
Best regards,
Igor Zhbanov,
Sub-Project Leader,
phone: +7 (495) 797 25 00 ext 3981
e-mail: [email protected]

Mobile group, Moscow R&D center, Samsung Electronics
12 Dvintsev street, building 1
127018, Moscow, Russian Federation

2013-07-30 13:21:44

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH RFC 0/3] Per-process power consumption measurement facility

On Tuesday, July 30, 2013 10:52:56 AM Peter Zijlstra wrote:
> On Tue, Jul 30, 2013 at 12:17:36PM +0400, Konstantin Krivyakin wrote:
> > This patchset adds per-process power consumption measurement facility.
> > Power consumption is very important on mobile platforms. This code
> > allows to measure consumed power in Watts*Hours. The consumed power
> > for process is updated on scheduler switch and depends on current
> > CPU voltage and frequency.
> >
> > The formula for computation is: P = C * V^2 * f, where C is a constant
> > that reflects capacity of the system, V is the current voltage and
> > f is the current frequency.
> > (Taken from: http://en.wikipedia.org/wiki/CPU_power_dissipation).
> >
> > In this patchset was added implementation for Exynos platform
> > to demonstrate how it works.
> >
> > To minimize scheduler impact for each CPU P-state the value of (V^2 *f)
> > was precomputed at the time of platform initialization.
>
> It seems to me the 3 multiplies that takes could be done when cpufreq
> actually changes the P-state.
>
> > And to reduce performance impact furthermore, the C constant is multiplied
> > in userspace.
>
> That seems particularly silly; how is userspace to know C and why
> isn't it a much better idea to do this in the code generating the number
> for userspace to consume.
>
> Also, I intensely dislike this thing because:
>
> - it adds more user interface
> - it adds more accounting muck
> - it completely lacks any useful changelogs
> - it completely fails to even begin addressing the issues we already
> have with cpufreq
>
> There's been a lot of talk about power aware scheduling in the recent
> past, there's also been a lot of problems listed we must overcome/solve.
> This patch set completely fails to tie into any of that.
>
> You also completely fail to explain the user case and thus related why
> you can't use any of the other facilities like perf or ftrace to measure
> this.

Agreed, thanks Peter!


--
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.