2010-11-11 18:03:21

by Thomas Renninger

[permalink] [raw]
Subject: [RESEND] Power trace event cleanup by still providing old interface for some time

These have been discussed intensively on the trace/perf lists recently.
Outcome: cleanup power events in a (several kernel rounds) compatible way.

Ingo: As discussed, it would be great to see these in some tree/branch
soon as I'd like to base further cpu_idle cleanups/fixes on top.
Also Jean is waiting for these with further work/patches.

Thanks,

Thomas


2010-11-11 18:03:20

by Thomas Renninger

[permalink] [raw]
Subject: [PATCH 3/3] PERF(userspace): Adjust perf timechart to the new power events

Recent changes:
- Adjust state/cpuid to u32 as done in the kernel

The transition was rather smooth, only part I had to fiddle
some time was the check whether a tracepoint/event is
supported by the running kernel.

builtin-timechart must only pass -e power:xy events which
are supported by the running kernel.
For this I added the tiny helper function:
int is_valid_tracepoint(const char *event_string)
to parse-events.[hc]
which could be more generic as an interface and support
hardware/software/... events, not only tracepoints, but someone
else could extend that if needed...

Signed-off-by: Thomas Renninger <[email protected]>
CC: Jean Pihet <[email protected]>
CC: Arjan van de Ven <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: [email protected]
---
tools/perf/builtin-timechart.c | 91 ++++++++++++++++++++++++++++++++-------
tools/perf/util/parse-events.c | 41 ++++++++++++++++++
tools/perf/util/parse-events.h | 1 +
3 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 9bcc38f..299e488 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -32,6 +32,10 @@
#include "util/session.h"
#include "util/svghelper.h"

+#define SUPPORT_OLD_POWER_EVENTS 1
+#define PWR_EVENT_EXIT -1
+
+
static char const *input_name = "perf.data";
static char const *output_name = "output.svg";

@@ -298,12 +302,21 @@ struct trace_entry {
int lock_depth;
};

-struct power_entry {
+#if defined(SUPPORT_OLD_POWER_EVENTS)
+static int use_old_power_events;
+struct power_entry_old {
struct trace_entry te;
u64 type;
u64 value;
u64 cpu_id;
};
+#endif
+
+struct power_processor_entry {
+ struct trace_entry te;
+ u32 state;
+ u32 cpu_id;
+};

#define TASK_COMM_LEN 16
struct wakeup_entry {
@@ -489,29 +502,48 @@ static int process_sample_event(event_t *event, struct perf_session *session)
te = (void *)data.raw_data;
if (session->sample_type & PERF_SAMPLE_RAW && data.raw_size > 0) {
char *event_str;
- struct power_entry *pe;
-
- pe = (void *)te;
+#if defined(SUPPORT_OLD_POWER_EVENTS)
+ struct power_entry_old *peo;
+ peo = (void *)te;
+#endif

event_str = perf_header__find_event(te->type);

if (!event_str)
return 0;

- if (strcmp(event_str, "power:power_start") == 0)
- c_state_start(pe->cpu_id, data.time, pe->value);
-
- if (strcmp(event_str, "power:power_end") == 0)
- c_state_end(pe->cpu_id, data.time);
+ if (strcmp(event_str, "power:cpu_idle") == 0) {
+ struct power_processor_entry *ppe = (void *)te;
+ if (ppe->state == (u32)PWR_EVENT_EXIT)
+ c_state_end(ppe->cpu_id, data.time);
+ else
+ c_state_start(ppe->cpu_id, data.time,
+ ppe->state);
+ }

- if (strcmp(event_str, "power:power_frequency") == 0)
- p_state_change(pe->cpu_id, data.time, pe->value);
+ else if (strcmp(event_str, "power:cpu_frequency") == 0) {
+ struct power_processor_entry *ppe = (void *)te;
+ p_state_change(ppe->cpu_id, data.time, ppe->state);
+ }

- if (strcmp(event_str, "sched:sched_wakeup") == 0)
+ else if (strcmp(event_str, "sched:sched_wakeup") == 0)
sched_wakeup(data.cpu, data.time, data.pid, te);

- if (strcmp(event_str, "sched:sched_switch") == 0)
+ else if (strcmp(event_str, "sched:sched_switch") == 0)
sched_switch(data.cpu, data.time, te);
+
+#if defined(SUPPORT_OLD_POWER_EVENTS)
+ if (use_old_power_events) {
+ if (strcmp(event_str, "power:power_start") == 0)
+ c_state_start(peo->cpu_id, data.time, peo->value);
+
+ else if (strcmp(event_str, "power:power_end") == 0)
+ c_state_end(data.cpu, data.time);
+
+ else if (strcmp(event_str, "power:power_frequency") == 0)
+ p_state_change(peo->cpu_id, data.time, peo->value);
+ }
+#endif
}
return 0;
}
@@ -968,7 +1000,8 @@ static const char * const timechart_usage[] = {
NULL
};

-static const char *record_args[] = {
+#if defined(SUPPORT_OLD_POWER_EVENTS)
+static const char *record_old_args[] = {
"record",
"-a",
"-R",
@@ -980,16 +1013,40 @@ static const char *record_args[] = {
"-e", "sched:sched_wakeup",
"-e", "sched:sched_switch",
};
+#endif
+
+static const char *record_new_args[] = {
+ "record",
+ "-a",
+ "-R",
+ "-f",
+ "-c", "1",
+ "-e", "power:cpu_frequency",
+ "-e", "power:cpu_idle",
+ "-e", "sched:sched_wakeup",
+ "-e", "sched:sched_switch",
+};

static int __cmd_record(int argc, const char **argv)
{
unsigned int rec_argc, i, j;
const char **rec_argv;
-
- rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+ const char **record_args = record_new_args;
+ unsigned int record_elems = ARRAY_SIZE(record_new_args);
+
+#if defined(SUPPORT_OLD_POWER_EVENTS)
+ if (!is_valid_tracepoint("power:cpu_idle") &&
+ is_valid_tracepoint("power:power_start")) {
+ use_old_power_events = 1;
+ record_args = record_old_args;
+ record_elems = ARRAY_SIZE(record_old_args);
+ }
+#endif
+
+ rec_argc = record_elems + argc - 1;
rec_argv = calloc(rec_argc + 1, sizeof(char *));

- for (i = 0; i < ARRAY_SIZE(record_args); i++)
+ for (i = 0; i < record_elems; i++)
rec_argv[i] = strdup(record_args[i]);

for (j = 1; j < (unsigned int)argc; j++, i++)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4af5bd5..35e3dea 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -906,6 +906,47 @@ static void print_tracepoint_events(void)
}

/*
+ * Check whether event is in <debugfs_mount_point>/tracing/events
+ */
+
+int is_valid_tracepoint(const char *event_string)
+{
+ DIR *sys_dir, *evt_dir;
+ struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
+ char evt_path[MAXPATHLEN];
+ char dir_path[MAXPATHLEN];
+
+ if (debugfs_valid_mountpoint(debugfs_path))
+ return 0;
+
+ sys_dir = opendir(debugfs_path);
+ if (!sys_dir)
+ return 0;
+
+ for_each_subsystem(sys_dir, sys_dirent, sys_next) {
+
+ snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+ sys_dirent.d_name);
+ evt_dir = opendir(dir_path);
+ if (!evt_dir)
+ continue;
+
+ for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
+ snprintf(evt_path, MAXPATHLEN, "%s:%s",
+ sys_dirent.d_name, evt_dirent.d_name);
+ if (!strcmp(evt_path, event_string)) {
+ closedir(evt_dir);
+ closedir(sys_dir);
+ return 1;
+ }
+ }
+ closedir(evt_dir);
+ }
+ closedir(sys_dir);
+ return 0;
+}
+
+/*
* Print the help text for the event symbols:
*/
void print_events(void)
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index fc4ab3f..7ab4685 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -29,6 +29,7 @@ extern int parse_filter(const struct option *opt, const char *str, int unset);
#define EVENTS_HELP_MAX (128*1024)

extern void print_events(void);
+extern int is_valid_tracepoint(const char *event_string);

extern char debugfs_path[];
extern int valid_debugfs_mount(const char *debugfs);
--
1.6.3

2010-11-11 18:03:38

by Thomas Renninger

[permalink] [raw]
Subject: [PATCH 2/3] PERF(kernel): Cleanup power events

Recent changes:
- Enable EVENT_POWER_TRACING_DEPRECATED by default

New power trace events:
power:cpu_idle
power:cpu_frequency
power:machine_suspend


C-state/idle accounting events:
power:power_start
power:power_end
are replaced with:
power:cpu_idle

and
power:power_frequency
is replaced with:
power:cpu_frequency

power:machine_suspend
is newly introduced.
Jean Pihet has a patch integrated into the generic layer
(kernel/power/suspend.c) which will make use of it.

the type= field got removed from both, it was never
used and the type is differed by the event type itself.

perf timechart
userspace tool gets adjusted in a separate patch.

Signed-off-by: Thomas Renninger <[email protected]>
Acked-by: Arjan van de Ven <[email protected]>
Acked-by: Jean Pihet <[email protected]>
CC: Arjan van de Ven <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/x86/kernel/process.c | 7 +++-
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/process_64.c | 2 +
drivers/cpufreq/cpufreq.c | 1 +
drivers/cpuidle/cpuidle.c | 1 +
drivers/idle/intel_idle.c | 1 +
include/trace/events/power.h | 87 +++++++++++++++++++++++++++++++++++++++++-
kernel/trace/Kconfig | 15 +++++++
kernel/trace/power-traces.c | 3 +
9 files changed, 116 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868..155d975 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -374,6 +374,7 @@ void default_idle(void)
{
if (hlt_use_halt()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -444,6 +445,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
+ trace_cpu_idle((ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -460,6 +462,7 @@ static void mwait_idle(void)
{
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);

@@ -481,10 +484,12 @@ static void mwait_idle(void)
static void poll_idle(void)
{
trace_power_start(POWER_CSTATE, 0, smp_processor_id());
+ trace_cpu_idle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end(0);
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}

/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3..4b9befa 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -113,8 +113,8 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
-
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3d7a3a..4c818a7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -142,6 +142,8 @@ void cpu_idle(void)
start_critical_timings();

trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT,
+ smp_processor_id());

/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index c63a438..1109f68 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
(unsigned long)freqs->cpu);
trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
+ trace_cpu_frequency(freqs->new, freqs->cpu);
srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
CPUFREQ_POSTCHANGE, freqs);
if (likely(policy) && likely(policy->cpu == freqs->cpu))
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a507108..08d5f05 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -107,6 +107,7 @@ static void cpuidle_idle_call(void)
if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev);
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}

/**
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3c95325..ba5134f 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -221,6 +221,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)

stop_critical_timings();
trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
+ trace_cpu_idle((eax >> 4) + 1, cpu);
if (!need_resched()) {

__monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 286784d..ab26d8e 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -7,6 +7,67 @@
#include <linux/ktime.h>
#include <linux/tracepoint.h>

+DECLARE_EVENT_CLASS(cpu,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ __field( u32, cpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ __entry->cpu_id = cpu_id;
+ ),
+
+ TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
+ (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_idle,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id)
+);
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
+
+#define PWR_EVENT_EXIT -1
+
+#endif
+
+DEFINE_EVENT(cpu, cpu_frequency,
+
+ TP_PROTO(unsigned int frequency, unsigned int cpu_id),
+
+ TP_ARGS(frequency, cpu_id)
+);
+
+TRACE_EVENT(machine_suspend,
+
+ TP_PROTO(unsigned int state),
+
+ TP_ARGS(state),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ ),
+
+ TP_printk("state=%lu", (unsigned long)__entry->state)
+);
+
+#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+
#ifndef _TRACE_POWER_ENUM_
#define _TRACE_POWER_ENUM_
enum {
@@ -153,8 +214,32 @@ DEFINE_EVENT(power_domain, power_domain_target,

TP_ARGS(name, state, cpu_id)
);
-
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
#endif /* _TRACE_POWER_H */

+/* Deprecated dummy functions must be protected against multi-declartion */
+#ifndef EVENT_POWER_TRACING_DEPRECATED_PART_H
+#define EVENT_POWER_TRACING_DEPRECATED_PART_H
+
+#ifndef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+
+#ifndef _TRACE_POWER_ENUM_
+#define _TRACE_POWER_ENUM_
+enum {
+ POWER_NONE = 0,
+ POWER_CSTATE = 1,
+ POWER_PSTATE = 2,
+};
+#endif
+
+static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
+static inline void trace_power_end(u64 cpuid) {};
+static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
+#endif /* EVENT_POWER_TRACING_DEPRECATED_PART_H */
+
+
+
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bc..0be2e7f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool

+config EVENT_POWER_TRACING_DEPRECATED
+ depends on EVENT_TRACING
+ bool
+ default y
+ help
+ Provides old power event types:
+ C-state/idle accounting events:
+ power:power_start
+ power:power_end
+ and old cpufreq accounting event:
+ power:power_frequency
+ This is for userspace compatibility
+ and will vanish after 5 kernel iterations,
+ namely 2.6.41.
+
config CONTEXT_SWITCH_TRACER
bool

diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 0e0497d..f55fcf6 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>

+#ifdef EVENT_POWER_TRACING_DEPRECATED
EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+#endif
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);

--
1.6.3

2010-11-11 18:03:49

by Thomas Renninger

[permalink] [raw]
Subject: [PATCH 1/3] PERF: Do not export power_frequency, but power_start event

power_frequency moved to drivers/cpufreq/cpufreq.c which has
to be compiled in, no need to export it.

intel_idle can a be module though...

Signed-off-by: Thomas Renninger <[email protected]>
CC: Jean Pihet <[email protected]>
CC: Arjan van de Ven <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: [email protected]
---
drivers/idle/intel_idle.c | 2 --
kernel/trace/power-traces.c | 2 +-
2 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 41665d2..3c95325 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -220,9 +220,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
kt_before = ktime_get_real();

stop_critical_timings();
-#ifndef MODULE
trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
-#endif
if (!need_resched()) {

__monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a..0e0497d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,5 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>

-EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);

--
1.6.3

2010-11-12 14:20:50

by Jean Pihet

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

Thomas,

Thanks for the patches re-spin!

Here are my comments inlined.

On Thu, Nov 11, 2010 at 7:03 PM, Thomas Renninger <[email protected]> wrote:
> Recent changes:
> ?- Enable EVENT_POWER_TRACING_DEPRECATED by default
>
> New power trace events:
> power:cpu_idle
> power:cpu_frequency
> power:machine_suspend
>
>
> C-state/idle accounting events:
> ?power:power_start
> ?power:power_end
> are replaced with:
> ?power:cpu_idle
>
> and
> ?power:power_frequency
> is replaced with:
> ?power:cpu_frequency
>
> power:machine_suspend
> is newly introduced.
> Jean Pihet has a patch integrated into the generic layer
> (kernel/power/suspend.c) which will make use of it.
>
> the type= field got removed from both, it was never
> used and the type is differed by the event type itself.
>
> perf timechart
> userspace tool gets adjusted in a separate patch.
>
> Signed-off-by: Thomas Renninger <[email protected]>
> Acked-by: Arjan van de Ven <[email protected]>
> Acked-by: Jean Pihet <[email protected]>
> CC: Arjan van de Ven <[email protected]>
> CC: Ingo Molnar <[email protected]>
> CC: [email protected]
> CC: [email protected]
> ---
> ?arch/x86/kernel/process.c ? ?| ? ?7 +++-
> ?arch/x86/kernel/process_32.c | ? ?2 +-
> ?arch/x86/kernel/process_64.c | ? ?2 +
> ?drivers/cpufreq/cpufreq.c ? ?| ? ?1 +
> ?drivers/cpuidle/cpuidle.c ? ?| ? ?1 +
> ?drivers/idle/intel_idle.c ? ?| ? ?1 +
> ?include/trace/events/power.h | ? 87 +++++++++++++++++++++++++++++++++++++++++-
> ?kernel/trace/Kconfig ? ? ? ? | ? 15 +++++++
> ?kernel/trace/power-traces.c ?| ? ?3 +
> ?9 files changed, 116 insertions(+), 3 deletions(-)
>
...
> diff --git a/include/trace/events/power.h b/include/trace/events/power.h
> index 286784d..ab26d8e 100644
> --- a/include/trace/events/power.h
> +++ b/include/trace/events/power.h
> @@ -7,6 +7,67 @@
> ?#include <linux/ktime.h>
> ?#include <linux/tracepoint.h>
>
> +DECLARE_EVENT_CLASS(cpu,
> +
> + ? ? ? TP_PROTO(unsigned int state, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(state, cpu_id),
> +
> + ? ? ? TP_STRUCT__entry(
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?state ? ? ? ? ? )
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?cpu_id ? ? ? ? ?)
> + ? ? ? ),
> +
> + ? ? ? TP_fast_assign(
> + ? ? ? ? ? ? ? __entry->state = state;
> + ? ? ? ? ? ? ? __entry->cpu_id = cpu_id;
> + ? ? ? ),
> +
> + ? ? ? TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
> + ? ? ? ? ? ? ? ? (unsigned long)__entry->cpu_id)
Using %lu for the state field causes PWR_EVENT_EXIT to appear as
4294967295 instead of -1. Can the field be of a signed type?

> +);
> +
> +DEFINE_EVENT(cpu, cpu_idle,
> +
> + ? ? ? TP_PROTO(unsigned int state, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(state, cpu_id)
> +);
> +
> +/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
> +#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
> +#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
> +
> +#define PWR_EVENT_EXIT -1
> +
> +#endif
> +
> +DEFINE_EVENT(cpu, cpu_frequency,
> +
> + ? ? ? TP_PROTO(unsigned int frequency, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(frequency, cpu_id)
> +);
> +
> +TRACE_EVENT(machine_suspend,
> +
> + ? ? ? TP_PROTO(unsigned int state),
> +
> + ? ? ? TP_ARGS(state),
> +
> + ? ? ? TP_STRUCT__entry(
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?state ? ? ? ? ? )
> + ? ? ? ),
> +
> + ? ? ? TP_fast_assign(
> + ? ? ? ? ? ? ? __entry->state = state;
> + ? ? ? ),
> +
> + ? ? ? TP_printk("state=%lu", (unsigned long)__entry->state)
Same remark about the unsigned type for the state field.

> +);
> +
> +#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
> +
> ?#ifndef _TRACE_POWER_ENUM_
> ?#define _TRACE_POWER_ENUM_
> ?enum {
> @@ -153,8 +214,32 @@ DEFINE_EVENT(power_domain, power_domain_target,
>
> ? ? ? ?TP_ARGS(name, state, cpu_id)
> ?);
> -
> +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
The clock and power_domain events have been recently introduced and so
must be part of the new API. Can this #endif be moved right after the
definition of power_end?

> ?#endif /* _TRACE_POWER_H */
Should this be at the very end of the file?

>
> +/* Deprecated dummy functions must be protected against multi-declartion */
> +#ifndef EVENT_POWER_TRACING_DEPRECATED_PART_H
> +#define EVENT_POWER_TRACING_DEPRECATED_PART_H
> +
> +#ifndef CONFIG_EVENT_POWER_TRACING_DEPRECATED
> +
> +#ifndef _TRACE_POWER_ENUM_
> +#define _TRACE_POWER_ENUM_
> +enum {
> + ? ? ? POWER_NONE = 0,
> + ? ? ? POWER_CSTATE = 1,
> + ? ? ? POWER_PSTATE = 2,
> +};
> +#endif
> +
> +static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
> +static inline void trace_power_end(u64 cpuid) {};
> +static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
> +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
> +
> +#endif /* EVENT_POWER_TRACING_DEPRECATED_PART_H */
> +
> +
> +
> ?/* This part must be outside protection */
> ?#include <trace/define_trace.h>
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index e04b8bc..0be2e7f 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -69,6 +69,21 @@ config EVENT_TRACING
> ? ? ? ?select CONTEXT_SWITCH_TRACER
> ? ? ? ?bool
>
> +config EVENT_POWER_TRACING_DEPRECATED
> + ? ? ? depends on EVENT_TRACING
> + ? ? ? bool
A string is needed here. Without it it is impossible to have the option unset.
This does the trick: +bool "Deprecated power event trace API, to be removed"

> + ? ? ? default y
> + ? ? ? help
> + ? ? ? ? Provides old power event types:
> + ? ? ? ? C-state/idle accounting events:
> + ? ? ? ? power:power_start
> + ? ? ? ? power:power_end
> + ? ? ? ? and old cpufreq accounting event:
> + ? ? ? ? power:power_frequency
> + ? ? ? ? This is for userspace compatibility
> + ? ? ? ? and will vanish after 5 kernel iterations,
> + ? ? ? ? namely 2.6.41.
> +
> ?config CONTEXT_SWITCH_TRACER
> ? ? ? ?bool
>
...

Thanks,
Jean

2010-11-12 18:17:19

by Thomas Renninger

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

On Friday 12 November 2010 08:20:47 am Jean Pihet wrote:
> Thomas,
...
> > +
> > + ? ? ? TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
> > + ? ? ? ? ? ? ? ? (unsigned long)__entry->cpu_id)
> Using %lu for the state field causes PWR_EVENT_EXIT to appear as
> 4294967295 instead of -1. Can the field be of a signed type?
This is intended, what exactly is the problem?

...
> > + ? ? ? TP_printk("state=%lu", (unsigned long)__entry->state)
> Same remark about the unsigned type for the state field.
Same.
>
> > +);
> > +
> > +#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
> > +
> > ?#ifndef _TRACE_POWER_ENUM_
> > ?#define _TRACE_POWER_ENUM_
> > ?enum {
> > @@ -153,8 +214,32 @@ DEFINE_EVENT(power_domain, power_domain_target,
> >
> > ? ? ? ?TP_ARGS(name, state, cpu_id)
> > ?);
> > -
> > +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
> The clock and power_domain events have been recently introduced and so
> must be part of the new API. Can this #endif be moved right after the
> definition of power_end?
Oops, I pulled again meanwhile and the patches still patched without fuzz,
but probably with some offset.
I'll look at that and resend this one.

> > ?#endif /* _TRACE_POWER_H */
> Should this be at the very end of the file?
Not sure whether this also came from merge issues, but yes, several
#ifdef conditions need to get corrected.

...

> A string is needed here. Without it it is impossible to have the option
> unset.
> This does the trick: +bool "Deprecated power event trace API, to be removed"
Ok, thanks.

I am currently rebuilding on several archs/flavors and hope to be able
to re-send this one today or on Tue.

Thanks,

Thomas

2010-11-12 21:50:27

by Jean Pihet

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

On Fri, Nov 12, 2010 at 7:17 PM, Thomas Renninger <[email protected]> wrote:
> On Friday 12 November 2010 08:20:47 am Jean Pihet wrote:
>> Thomas,
> ...
>> > +
>> > + ? ? ? TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
>> > + ? ? ? ? ? ? ? ? (unsigned long)__entry->cpu_id)
>> Using %lu for the state field causes PWR_EVENT_EXIT to appear as
>> 4294967295 instead of -1. Can the field be of a signed type?
> This is intended, what exactly is the problem?
There is no problem, I just wanted to warn about it. I am fine with it.

>
> ...
>> > + ? ? ? TP_printk("state=%lu", (unsigned long)__entry->state)
>> Same remark about the unsigned type for the state field.
> Same.
>>
>> > +);
>> > +
>> > +#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
>> > +
>> > ?#ifndef _TRACE_POWER_ENUM_
>> > ?#define _TRACE_POWER_ENUM_
>> > ?enum {
>> > @@ -153,8 +214,32 @@ DEFINE_EVENT(power_domain, power_domain_target,
>> >
>> > ? ? ? ?TP_ARGS(name, state, cpu_id)
>> > ?);
>> > -
>> > +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
>> The clock and power_domain events have been recently introduced and so
>> must be part of the new API. Can this #endif be moved right after the
>> definition of power_end?
> Oops, I pulled again meanwhile and the patches still patched without fuzz,
> but probably with some offset.
> I'll look at that and resend this one.
Ok

>
>> > ?#endif /* _TRACE_POWER_H */
>> Should this be at the very end of the file?
> Not sure whether this also came from merge issues, but yes, several
> #ifdef conditions need to get corrected.
Ok

>
> ...
>
>> A string is needed here. Without it it is impossible to have the option
>> unset.
>> This does the trick: +bool "Deprecated power event trace API, to be removed"
> Ok, thanks.
>
> I am currently rebuilding on several archs/flavors and hope to be able
> to re-send this one today or on Tue.
>
> Thanks,
>
> ? ?Thomas
>
Thanks!

Jean

2010-11-14 13:22:48

by Thomas Renninger

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

PERF(kernel): Cleanup power events

Recent changes:
- Fix pre-processor conditionals which got messed up silently by a recent merge/pull
- Add a comment to EVENT_POWER_TRACING_DEPRECATED .config option

New power trace events:
power:cpu_idle
power:cpu_frequency
power:machine_suspend


C-state/idle accounting events:
power:power_start
power:power_end
are replaced with:
power:cpu_idle

and
power:power_frequency
is replaced with:
power:cpu_frequency

power:machine_suspend
is newly introduced.
Jean Pihet has a patch integrated into the generic layer
(kernel/power/suspend.c) which will make use of it.

the type= field got removed from both, it was never
used and the type is differed by the event type itself.

perf timechart
userspace tool gets adjusted in a separate patch.

Signed-off-by: Thomas Renninger <[email protected]>
Acked-by: Arjan van de Ven <[email protected]>
Acked-by: Jean Pihet <[email protected]>
CC: Arjan van de Ven <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: [email protected]
CC: [email protected]

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868..155d975 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -374,6 +374,7 @@ void default_idle(void)
{
if (hlt_use_halt()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -444,6 +445,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
+ trace_cpu_idle((ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -460,6 +462,7 @@ static void mwait_idle(void)
{
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);

@@ -481,10 +484,12 @@ static void mwait_idle(void)
static void poll_idle(void)
{
trace_power_start(POWER_CSTATE, 0, smp_processor_id());
+ trace_cpu_idle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end(0);
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}

/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3..4b9befa 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -113,8 +113,8 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
-
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3d7a3a..4c818a7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -142,6 +142,8 @@ void cpu_idle(void)
start_critical_timings();

trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT,
+ smp_processor_id());

/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index c63a438..1109f68 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
(unsigned long)freqs->cpu);
trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
+ trace_cpu_frequency(freqs->new, freqs->cpu);
srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
CPUFREQ_POSTCHANGE, freqs);
if (likely(policy) && likely(policy->cpu == freqs->cpu))
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a507108..08d5f05 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -107,6 +107,7 @@ static void cpuidle_idle_call(void)
if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev);
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}

/**
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3c95325..ba5134f 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -221,6 +221,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)

stop_critical_timings();
trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
+ trace_cpu_idle((eax >> 4) + 1, cpu);
if (!need_resched()) {

__monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 286784d..00d9819 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -7,16 +7,67 @@
#include <linux/ktime.h>
#include <linux/tracepoint.h>

-#ifndef _TRACE_POWER_ENUM_
-#define _TRACE_POWER_ENUM_
-enum {
- POWER_NONE = 0,
- POWER_CSTATE = 1, /* C-State */
- POWER_PSTATE = 2, /* Fequency change or DVFS */
- POWER_SSTATE = 3, /* Suspend */
-};
+DECLARE_EVENT_CLASS(cpu,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ __field( u32, cpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ __entry->cpu_id = cpu_id;
+ ),
+
+ TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
+ (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_idle,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id)
+);
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
+
+#define PWR_EVENT_EXIT -1
#endif

+DEFINE_EVENT(cpu, cpu_frequency,
+
+ TP_PROTO(unsigned int frequency, unsigned int cpu_id),
+
+ TP_ARGS(frequency, cpu_id)
+);
+
+TRACE_EVENT(machine_suspend,
+
+ TP_PROTO(unsigned int state),
+
+ TP_ARGS(state),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ ),
+
+ TP_printk("state=%lu", (unsigned long)__entry->state)
+);
+
+/* This code will be removed after deprecation time exceeded (2.6.41) */
+#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+
/*
* The power events are used for cpuidle & suspend (power_start, power_end)
* and for cpufreq (power_frequency)
@@ -75,6 +126,24 @@ TRACE_EVENT(power_end,

);

+/* Deprecated dummy functions must be protected against multi-declartion */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+
+enum {
+ POWER_NONE = 0,
+ POWER_CSTATE = 1,
+ POWER_PSTATE = 2,
+};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+#else
+/* These dummy declaration have to be ripped out when the deprecated
+ events get removed */
+static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
+static inline void trace_power_end(u64 cpuid) {};
+static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
/*
* The clock events are used for clock enable/disable and for
* clock rate change
@@ -153,7 +222,6 @@ DEFINE_EVENT(power_domain, power_domain_target,

TP_ARGS(name, state, cpu_id)
);
-
#endif /* _TRACE_POWER_H */

/* This part must be outside protection */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bc..59b44a1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool

+config EVENT_POWER_TRACING_DEPRECATED
+ depends on EVENT_TRACING
+ bool "Deprecated power event trace API, to be removed"
+ default y
+ help
+ Provides old power event types:
+ C-state/idle accounting events:
+ power:power_start
+ power:power_end
+ and old cpufreq accounting event:
+ power:power_frequency
+ This is for userspace compatibility
+ and will vanish after 5 kernel iterations,
+ namely 2.6.41.
+
config CONTEXT_SWITCH_TRACER
bool

diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 0e0497d..f55fcf6 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>

+#ifdef EVENT_POWER_TRACING_DEPRECATED
EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+#endif
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);

2010-11-14 13:34:15

by Thomas Renninger

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

On Friday 12 November 2010 03:50:21 pm Jean Pihet wrote:
> On Fri, Nov 12, 2010 at 7:17 PM, Thomas Renninger <[email protected]> wrote:
...
> >> > +
> >> > +#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
> >> > +
> >> > ?#ifndef _TRACE_POWER_ENUM_
> >> > ?#define _TRACE_POWER_ENUM_
> >> > ?enum {
> >> > @@ -153,8 +214,32 @@ DEFINE_EVENT(power_domain, power_domain_target,
> >> >
> >> > ? ? ? ?TP_ARGS(name, state, cpu_id)
> >> > ?);
> >> > -
> >> > +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
> >> The clock and power_domain events have been recently introduced and so
> >> must be part of the new API. Can this #endif be moved right after the
> >> definition of power_end?
> > Oops, I pulled again meanwhile and the patches still patched without fuzz,
> > but probably with some offset.
> > I'll look at that and resend this one.
> Ok
Thanks for pointing this out. Because pre-processor conditionals only have
been moved around it looks like my test build after pulling still succeeded,
while the #ifdefs/#endifs were rather messed up.

I adjusted these parts and successfully test-built on quite a lot .config
flavors on i386, x86_64, different ppc, ia64 and s390.

> >> A string is needed here. Without it it is impossible to have the option
> >> unset.
> >> This does the trick: +bool "Deprecated power event trace API, to be
> >> removed"
Adjusted, thanks.

> > I am currently rebuilding on several archs/flavors and hope to be able
> > to re-send this one today or on Tue.
Done.

@Ingo: If this does not go into x86/tip, but perf or whatever tree, it would
be great if you can ping me as soon as this stuff is in.
I want to cleanup the "double cpu_idle events" issues on top and make this
more architecture independent (throw cpu_idle events from cpuidle framework
instead of throwing very x86 specific mwait states, etc.).

Thanks,

Thomas

2010-11-15 15:49:58

by Jean Pihet

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

Acked-by: Jean Pihet <[email protected]>

On Sun, Nov 14, 2010 at 2:22 PM, Thomas Renninger <[email protected]> wrote:
> PERF(kernel): Cleanup power events
>
> Recent changes:
> ?- Fix pre-processor conditionals which got messed up silently by a recent merge/pull
> ?- Add a comment to EVENT_POWER_TRACING_DEPRECATED .config option
>
> New power trace events:
> power:cpu_idle
> power:cpu_frequency
> power:machine_suspend
>
>
> C-state/idle accounting events:
> ?power:power_start
> ?power:power_end
> are replaced with:
> ?power:cpu_idle
>
> and
> ?power:power_frequency
> is replaced with:
> ?power:cpu_frequency
>
> power:machine_suspend
> is newly introduced.
> Jean Pihet has a patch integrated into the generic layer
> (kernel/power/suspend.c) which will make use of it.
>
> the type= field got removed from both, it was never
> used and the type is differed by the event type itself.
>
> perf timechart
> userspace tool gets adjusted in a separate patch.
>
> Signed-off-by: Thomas Renninger <[email protected]>
> Acked-by: Arjan van de Ven <[email protected]>
> Acked-by: Jean Pihet <[email protected]>
> CC: Arjan van de Ven <[email protected]>
> CC: Ingo Molnar <[email protected]>
> CC: [email protected]
> CC: [email protected]
>
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 57d1868..155d975 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -374,6 +374,7 @@ void default_idle(void)
> ?{
> ? ? ? ?if (hlt_use_halt()) {
> ? ? ? ? ? ? ? ?trace_power_start(POWER_CSTATE, 1, smp_processor_id());
> + ? ? ? ? ? ? ? trace_cpu_idle(1, smp_processor_id());
> ? ? ? ? ? ? ? ?current_thread_info()->status &= ~TS_POLLING;
> ? ? ? ? ? ? ? ?/*
> ? ? ? ? ? ? ? ? * TS_POLLING-cleared state must be visible before we
> @@ -444,6 +445,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
> ?void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
> ?{
> ? ? ? ?trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
> + ? ? ? trace_cpu_idle((ax>>4)+1, smp_processor_id());
> ? ? ? ?if (!need_resched()) {
> ? ? ? ? ? ? ? ?if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
> ? ? ? ? ? ? ? ? ? ? ? ?clflush((void *)&current_thread_info()->flags);
> @@ -460,6 +462,7 @@ static void mwait_idle(void)
> ?{
> ? ? ? ?if (!need_resched()) {
> ? ? ? ? ? ? ? ?trace_power_start(POWER_CSTATE, 1, smp_processor_id());
> + ? ? ? ? ? ? ? trace_cpu_idle(1, smp_processor_id());
> ? ? ? ? ? ? ? ?if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
> ? ? ? ? ? ? ? ? ? ? ? ?clflush((void *)&current_thread_info()->flags);
>
> @@ -481,10 +484,12 @@ static void mwait_idle(void)
> ?static void poll_idle(void)
> ?{
> ? ? ? ?trace_power_start(POWER_CSTATE, 0, smp_processor_id());
> + ? ? ? trace_cpu_idle(0, smp_processor_id());
> ? ? ? ?local_irq_enable();
> ? ? ? ?while (!need_resched())
> ? ? ? ? ? ? ? ?cpu_relax();
> - ? ? ? trace_power_end(0);
> + ? ? ? trace_power_end(smp_processor_id());
> + ? ? ? trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
> ?}
>
> ?/*
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index 96586c3..4b9befa 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -113,8 +113,8 @@ void cpu_idle(void)
> ? ? ? ? ? ? ? ? ? ? ? ?stop_critical_timings();
> ? ? ? ? ? ? ? ? ? ? ? ?pm_idle();
> ? ? ? ? ? ? ? ? ? ? ? ?start_critical_timings();
> -
> ? ? ? ? ? ? ? ? ? ? ? ?trace_power_end(smp_processor_id());
> + ? ? ? ? ? ? ? ? ? ? ? trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
> ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?tick_nohz_restart_sched_tick();
> ? ? ? ? ? ? ? ?preempt_enable_no_resched();
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index b3d7a3a..4c818a7 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -142,6 +142,8 @@ void cpu_idle(void)
> ? ? ? ? ? ? ? ? ? ? ? ?start_critical_timings();
>
> ? ? ? ? ? ? ? ? ? ? ? ?trace_power_end(smp_processor_id());
> + ? ? ? ? ? ? ? ? ? ? ? trace_cpu_idle(PWR_EVENT_EXIT,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?smp_processor_id());
>
> ? ? ? ? ? ? ? ? ? ? ? ?/* In many cases the interrupt that ended idle
> ? ? ? ? ? ? ? ? ? ? ? ? ? has already called exit_idle. But some idle
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index c63a438..1109f68 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
> ? ? ? ? ? ? ? ?dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
> ? ? ? ? ? ? ? ? ? ? ? ?(unsigned long)freqs->cpu);
> ? ? ? ? ? ? ? ?trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
> + ? ? ? ? ? ? ? trace_cpu_frequency(freqs->new, freqs->cpu);
> ? ? ? ? ? ? ? ?srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?CPUFREQ_POSTCHANGE, freqs);
> ? ? ? ? ? ? ? ?if (likely(policy) && likely(policy->cpu == freqs->cpu))
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index a507108..08d5f05 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -107,6 +107,7 @@ static void cpuidle_idle_call(void)
> ? ? ? ?if (cpuidle_curr_governor->reflect)
> ? ? ? ? ? ? ? ?cpuidle_curr_governor->reflect(dev);
> ? ? ? ?trace_power_end(smp_processor_id());
> + ? ? ? trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
> ?}
>
> ?/**
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index 3c95325..ba5134f 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -221,6 +221,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
>
> ? ? ? ?stop_critical_timings();
> ? ? ? ?trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
> + ? ? ? trace_cpu_idle((eax >> 4) + 1, cpu);
> ? ? ? ?if (!need_resched()) {
>
> ? ? ? ? ? ? ? ?__monitor((void *)&current_thread_info()->flags, 0, 0);
> diff --git a/include/trace/events/power.h b/include/trace/events/power.h
> index 286784d..00d9819 100644
> --- a/include/trace/events/power.h
> +++ b/include/trace/events/power.h
> @@ -7,16 +7,67 @@
> ?#include <linux/ktime.h>
> ?#include <linux/tracepoint.h>
>
> -#ifndef _TRACE_POWER_ENUM_
> -#define _TRACE_POWER_ENUM_
> -enum {
> - ? ? ? POWER_NONE ? ? ?= 0,
> - ? ? ? POWER_CSTATE ? ?= 1, ? ?/* C-State */
> - ? ? ? POWER_PSTATE ? ?= 2, ? ?/* Fequency change or DVFS */
> - ? ? ? POWER_SSTATE ? ?= 3, ? ?/* Suspend */
> -};
> +DECLARE_EVENT_CLASS(cpu,
> +
> + ? ? ? TP_PROTO(unsigned int state, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(state, cpu_id),
> +
> + ? ? ? TP_STRUCT__entry(
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?state ? ? ? ? ? )
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?cpu_id ? ? ? ? ?)
> + ? ? ? ),
> +
> + ? ? ? TP_fast_assign(
> + ? ? ? ? ? ? ? __entry->state = state;
> + ? ? ? ? ? ? ? __entry->cpu_id = cpu_id;
> + ? ? ? ),
> +
> + ? ? ? TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
> + ? ? ? ? ? ? ? ? (unsigned long)__entry->cpu_id)
> +);
> +
> +DEFINE_EVENT(cpu, cpu_idle,
> +
> + ? ? ? TP_PROTO(unsigned int state, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(state, cpu_id)
> +);
> +
> +/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
> +#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
> +#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
> +
> +#define PWR_EVENT_EXIT -1
> ?#endif
>
> +DEFINE_EVENT(cpu, cpu_frequency,
> +
> + ? ? ? TP_PROTO(unsigned int frequency, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(frequency, cpu_id)
> +);
> +
> +TRACE_EVENT(machine_suspend,
> +
> + ? ? ? TP_PROTO(unsigned int state),
> +
> + ? ? ? TP_ARGS(state),
> +
> + ? ? ? TP_STRUCT__entry(
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?state ? ? ? ? ? )
> + ? ? ? ),
> +
> + ? ? ? TP_fast_assign(
> + ? ? ? ? ? ? ? __entry->state = state;
> + ? ? ? ),
> +
> + ? ? ? TP_printk("state=%lu", (unsigned long)__entry->state)
> +);
> +
> +/* This code will be removed after deprecation time exceeded (2.6.41) */
> +#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
> +
> ?/*
> ?* The power events are used for cpuidle & suspend (power_start, power_end)
> ?* ?and for cpufreq (power_frequency)
> @@ -75,6 +126,24 @@ TRACE_EVENT(power_end,
>
> ?);
>
> +/* Deprecated dummy functions must be protected against multi-declartion */
> +#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
> +#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
> +
> +enum {
> + ? ? ? POWER_NONE = 0,
> + ? ? ? POWER_CSTATE = 1,
> + ? ? ? POWER_PSTATE = 2,
> +};
> +#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
> +#else
> +/* These dummy declaration have to be ripped out when the deprecated
> + ? events get removed */
> +static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
> +static inline void trace_power_end(u64 cpuid) {};
> +static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
> +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
> +
> ?/*
> ?* The clock events are used for clock enable/disable and for
> ?* ?clock rate change
> @@ -153,7 +222,6 @@ DEFINE_EVENT(power_domain, power_domain_target,
>
> ? ? ? ?TP_ARGS(name, state, cpu_id)
> ?);
> -
> ?#endif /* _TRACE_POWER_H */
>
> ?/* This part must be outside protection */
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index e04b8bc..59b44a1 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -69,6 +69,21 @@ config EVENT_TRACING
> ? ? ? ?select CONTEXT_SWITCH_TRACER
> ? ? ? ?bool
>
> +config EVENT_POWER_TRACING_DEPRECATED
> + ? ? ? depends on EVENT_TRACING
> + ? ? ? bool "Deprecated power event trace API, to be removed"
> + ? ? ? default y
> + ? ? ? help
> + ? ? ? ? Provides old power event types:
> + ? ? ? ? C-state/idle accounting events:
> + ? ? ? ? power:power_start
> + ? ? ? ? power:power_end
> + ? ? ? ? and old cpufreq accounting event:
> + ? ? ? ? power:power_frequency
> + ? ? ? ? This is for userspace compatibility
> + ? ? ? ? and will vanish after 5 kernel iterations,
> + ? ? ? ? namely 2.6.41.
> +
> ?config CONTEXT_SWITCH_TRACER
> ? ? ? ?bool
>
> diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
> index 0e0497d..f55fcf6 100644
> --- a/kernel/trace/power-traces.c
> +++ b/kernel/trace/power-traces.c
> @@ -13,5 +13,8 @@
> ?#define CREATE_TRACE_POINTS
> ?#include <trace/events/power.h>
>
> +#ifdef EVENT_POWER_TRACING_DEPRECATED
> ?EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
> +#endif
> +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
>
>

2010-11-15 15:50:12

by Jean Pihet

[permalink] [raw]
Subject: Re: [PATCH 1/3] PERF: Do not export power_frequency, but power_start event

Acked-by: Jean Pihet <[email protected]>

On Thu, Nov 11, 2010 at 7:03 PM, Thomas Renninger <[email protected]> wrote:
> power_frequency moved to drivers/cpufreq/cpufreq.c which has
> to be compiled in, no need to export it.
>
> intel_idle can a be module though...
>
> Signed-off-by: Thomas Renninger <[email protected]>
> CC: Jean Pihet <[email protected]>
> CC: Arjan van de Ven <[email protected]>
> CC: Ingo Molnar <[email protected]>
> CC: [email protected]
> ---
> ?drivers/idle/intel_idle.c ? | ? ?2 --
> ?kernel/trace/power-traces.c | ? ?2 +-
> ?2 files changed, 1 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index 41665d2..3c95325 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -220,9 +220,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
> ? ? ? ?kt_before = ktime_get_real();
>
> ? ? ? ?stop_critical_timings();
> -#ifndef MODULE
> ? ? ? ?trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
> -#endif
> ? ? ? ?if (!need_resched()) {
>
> ? ? ? ? ? ? ? ?__monitor((void *)&current_thread_info()->flags, 0, 0);
> diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
> index a22582a..0e0497d 100644
> --- a/kernel/trace/power-traces.c
> +++ b/kernel/trace/power-traces.c
> @@ -13,5 +13,5 @@
> ?#define CREATE_TRACE_POINTS
> ?#include <trace/events/power.h>
>
> -EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
>
> --
> 1.6.3
>
>

2010-11-18 08:01:42

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events


* Thomas Renninger <[email protected]> wrote:

> On Friday 12 November 2010 03:50:21 pm Jean Pihet wrote:
> > On Fri, Nov 12, 2010 at 7:17 PM, Thomas Renninger <[email protected]> wrote:
> ...
> > >> > +
> > >> > +#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
> > >> > +
> > >> > ?#ifndef _TRACE_POWER_ENUM_
> > >> > ?#define _TRACE_POWER_ENUM_
> > >> > ?enum {
> > >> > @@ -153,8 +214,32 @@ DEFINE_EVENT(power_domain, power_domain_target,
> > >> >
> > >> > ? ? ? ?TP_ARGS(name, state, cpu_id)
> > >> > ?);
> > >> > -
> > >> > +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
> > >> The clock and power_domain events have been recently introduced and so
> > >> must be part of the new API. Can this #endif be moved right after the
> > >> definition of power_end?
> > > Oops, I pulled again meanwhile and the patches still patched without fuzz,
> > > but probably with some offset.
> > > I'll look at that and resend this one.
> > Ok
> Thanks for pointing this out. Because pre-processor conditionals only have
> been moved around it looks like my test build after pulling still succeeded,
> while the #ifdefs/#endifs were rather messed up.
>
> I adjusted these parts and successfully test-built on quite a lot .config
> flavors on i386, x86_64, different ppc, ia64 and s390.
>
> > >> A string is needed here. Without it it is impossible to have the option
> > >> unset.
> > >> This does the trick: +bool "Deprecated power event trace API, to be
> > >> removed"
> Adjusted, thanks.
>
> > > I am currently rebuilding on several archs/flavors and hope to be able
> > > to re-send this one today or on Tue.
> Done.
>
> @Ingo: If this does not go into x86/tip, but perf or whatever tree, it would
> be great if you can ping me as soon as this stuff is in.

Mind sending the latest version which has been adjusted/fixed and all acks added?

Thanks,

Ingo

2010-11-18 09:27:12

by Thomas Renninger

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

On Thursday 18 November 2010 09:01:32 Ingo Molnar wrote:
...
> > @Ingo: If this does not go into x86/tip, but perf or whatever tree, it would
> > be great if you can ping me as soon as this stuff is in.
>
> Mind sending the latest version which has been adjusted/fixed and all acks added?
Done.
This time with lkml excluded as there were only cleanups/fixes
due to a messed merge.

Thanks,

Thomas

2010-11-18 09:35:13

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 3/3] PERF(userspace): Adjust perf timechart to the new power events


* Thomas Renninger <[email protected]> wrote:

> Recent changes:
> - Adjust state/cpuid to u32 as done in the kernel
>
> The transition was rather smooth, only part I had to fiddle
> some time was the check whether a tracepoint/event is
> supported by the running kernel.

Hm, there's no Ack here from Arjan.

Timechart still works fine with the other two patches, right?

Ingo

2010-11-18 09:37:12

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events


* Thomas Renninger <[email protected]> wrote:

> On Thursday 18 November 2010 09:01:32 Ingo Molnar wrote:
> ...
> > > @Ingo: If this does not go into x86/tip, but perf or whatever tree, it would
> > > be great if you can ping me as soon as this stuff is in.
> >
> > Mind sending the latest version which has been adjusted/fixed and all acks added?
> Done.
> This time with lkml excluded as there were only cleanups/fixes
> due to a messed merge.

Please do not exclude lkml from such iterations of patches in the future - every
modification to patches is relevant - often pure resends get resent to lkml as well.

Thanks,

Ingo

2010-11-18 09:44:10

by Jean Pihet

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

On Thu, Nov 18, 2010 at 10:36 AM, Ingo Molnar <[email protected]> wrote:
>
> * Thomas Renninger <[email protected]> wrote:
>
>> On Thursday 18 November 2010 09:01:32 Ingo Molnar wrote:
>> ...
>> > > @Ingo: If this does not go into x86/tip, but perf or whatever tree, it would
>> > > be great if you can ping me as soon as this stuff is in.
>> >
>> > Mind sending the latest version which has been adjusted/fixed and all acks added?
>> Done.
>> This time with lkml excluded as there were only cleanups/fixes
>> due to a messed merge.
>
> Please do not exclude lkml from such iterations of patches in the future - every
> modification to patches is relevant - often pure resends get resent to lkml as well.

Ok for me!

Acked-by: Jean Pihet <[email protected]>

Note the ti.com email address to be used for Sign-offs and Acks.

Thanks,
Jean

>
> Thanks,
>
> ? ? ? ?Ingo
>

2010-11-18 10:53:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events


I am also getting build failures:

drivers/cpufreq/cpufreq.c:357: error: 'POWER_PSTATE' undeclared (first use in this function)
drivers/cpufreq/cpufreq.c:357: error: (Each undeclared identifier is reported only once
drivers/cpufreq/cpufreq.c:357: error: for each function it appears in.)
arch/x86/kernel/process.c:375: error: 'POWER_CSTATE' undeclared (first use in this function)
arch/x86/kernel/process.c:375: error: (Each undeclared identifier is reported only once
arch/x86/kernel/process.c:375: error: for each function it appears in.)
arch/x86/kernel/process.c:446: error: 'POWER_CSTATE' undeclared (first use in this function)
arch/x86/kernel/process.c:463: error: 'POWER_CSTATE' undeclared (first use in this function)
arch/x86/kernel/process.c:485: error: 'POWER_CSTATE' undeclared (first use in this function)
include/trace/events/power.h:142: error: redefinition of 'trace_power_start'

Config attached.

Note: please reuse the two commits from below for further work, i did some small
cleanups to the commit text and to the patches.

Thanks,

Ingo

---------------->
>From 87a2cfbda3f53c3bf00c424ce18d97b03b0c3aa0 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <[email protected]>
Date: Thu, 18 Nov 2010 10:25:13 +0100
Subject: [PATCH] perf: Clean up power events

Add these new power trace events:

power:cpu_idle
power:cpu_frequency
power:machine_suspend

The old C-state/idle accounting events:
power:power_start
power:power_end

Have now a replacement (but we are still keeping the old
tracepoints for compatibility):

power:cpu_idle

and
power:power_frequency

is replaced with:
power:cpu_frequency

power:machine_suspend is newly introduced.

Jean Pihet has a patch integrated into the generic layer
(kernel/power/suspend.c) which will make use of it.

the type= field got removed from both, it was never
used and the type is differed by the event type itself.

perf timechart userspace tool gets adjusted in a separate patch.

Signed-off-by: Thomas Renninger <[email protected]>
Acked-by: Arjan van de Ven <[email protected]>
Acked-by: Jean Pihet <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: [email protected]
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/process.c | 7 +++-
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/process_64.c | 2 +
drivers/cpufreq/cpufreq.c | 1 +
drivers/cpuidle/cpuidle.c | 1 +
drivers/idle/intel_idle.c | 1 +
include/trace/events/power.h | 86 +++++++++++++++++++++++++++++++++++++----
kernel/trace/Kconfig | 15 +++++++
kernel/trace/power-traces.c | 3 +
9 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868..155d975 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -374,6 +374,7 @@ void default_idle(void)
{
if (hlt_use_halt()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -444,6 +445,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
+ trace_cpu_idle((ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -460,6 +462,7 @@ static void mwait_idle(void)
{
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);

@@ -481,10 +484,12 @@ static void mwait_idle(void)
static void poll_idle(void)
{
trace_power_start(POWER_CSTATE, 0, smp_processor_id());
+ trace_cpu_idle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end(0);
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}

/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3..4b9befa 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -113,8 +113,8 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
-
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3d7a3a..4c818a7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -142,6 +142,8 @@ void cpu_idle(void)
start_critical_timings();

trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT,
+ smp_processor_id());

/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index c63a438..1109f68 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
(unsigned long)freqs->cpu);
trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
+ trace_cpu_frequency(freqs->new, freqs->cpu);
srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
CPUFREQ_POSTCHANGE, freqs);
if (likely(policy) && likely(policy->cpu == freqs->cpu))
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a507108..08d5f05 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -107,6 +107,7 @@ static void cpuidle_idle_call(void)
if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev);
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}

/**
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3c95325..ba5134f 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -221,6 +221,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)

stop_critical_timings();
trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
+ trace_cpu_idle((eax >> 4) + 1, cpu);
if (!need_resched()) {

__monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 286784d..00d9819 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -7,16 +7,67 @@
#include <linux/ktime.h>
#include <linux/tracepoint.h>

-#ifndef _TRACE_POWER_ENUM_
-#define _TRACE_POWER_ENUM_
-enum {
- POWER_NONE = 0,
- POWER_CSTATE = 1, /* C-State */
- POWER_PSTATE = 2, /* Fequency change or DVFS */
- POWER_SSTATE = 3, /* Suspend */
-};
+DECLARE_EVENT_CLASS(cpu,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ __field( u32, cpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ __entry->cpu_id = cpu_id;
+ ),
+
+ TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
+ (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_idle,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id)
+);
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
+
+#define PWR_EVENT_EXIT -1
#endif

+DEFINE_EVENT(cpu, cpu_frequency,
+
+ TP_PROTO(unsigned int frequency, unsigned int cpu_id),
+
+ TP_ARGS(frequency, cpu_id)
+);
+
+TRACE_EVENT(machine_suspend,
+
+ TP_PROTO(unsigned int state),
+
+ TP_ARGS(state),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ ),
+
+ TP_printk("state=%lu", (unsigned long)__entry->state)
+);
+
+/* This code will be removed after deprecation time exceeded (2.6.41) */
+#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+
/*
* The power events are used for cpuidle & suspend (power_start, power_end)
* and for cpufreq (power_frequency)
@@ -75,6 +126,24 @@ TRACE_EVENT(power_end,

);

+/* Deprecated dummy functions must be protected against multi-declartion */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+
+enum {
+ POWER_NONE = 0,
+ POWER_CSTATE = 1,
+ POWER_PSTATE = 2,
+};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+#else
+/* These dummy declaration have to be ripped out when the deprecated
+ events get removed */
+static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
+static inline void trace_power_end(u64 cpuid) {};
+static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
/*
* The clock events are used for clock enable/disable and for
* clock rate change
@@ -153,7 +222,6 @@ DEFINE_EVENT(power_domain, power_domain_target,

TP_ARGS(name, state, cpu_id)
);
-
#endif /* _TRACE_POWER_H */

/* This part must be outside protection */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bc..59b44a1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool

+config EVENT_POWER_TRACING_DEPRECATED
+ depends on EVENT_TRACING
+ bool "Deprecated power event trace API, to be removed"
+ default y
+ help
+ Provides old power event types:
+ C-state/idle accounting events:
+ power:power_start
+ power:power_end
+ and old cpufreq accounting event:
+ power:power_frequency
+ This is for userspace compatibility
+ and will vanish after 5 kernel iterations,
+ namely 2.6.41.
+
config CONTEXT_SWITCH_TRACER
bool

diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 0e0497d..f55fcf6 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>

+#ifdef EVENT_POWER_TRACING_DEPRECATED
EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+#endif
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);


>From b989c51b6f1989a834eecd9a64a7bd52ed230ea0 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <[email protected]>
Date: Thu, 18 Nov 2010 10:25:12 +0100
Subject: [PATCH] perf: Do not export power_frequency, but power_start event

power_frequency moved to drivers/cpufreq/cpufreq.c which has
to be compiled in, no need to export it.

intel_idle can a be module though...

Signed-off-by: Thomas Renninger <[email protected]>
Acked-by: Jean Pihet <[email protected]>
CC: Arjan van de Ven <[email protected]>
Cc: [email protected]
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
drivers/idle/intel_idle.c | 2 --
kernel/trace/power-traces.c | 2 +-
2 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 41665d2..3c95325 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -220,9 +220,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
kt_before = ktime_get_real();

stop_critical_timings();
-#ifndef MODULE
trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
-#endif
if (!need_resched()) {

__monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a..0e0497d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,5 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>

-EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);


Attachments:
(No filename) (11.81 kB)
config (38.76 kB)
Download all attachments

2010-11-18 16:34:17

by Jean Pihet

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

On Thu, Nov 18, 2010 at 11:52 AM, Ingo Molnar <[email protected]> wrote:
>
> I am also getting build failures:
>
> drivers/cpufreq/cpufreq.c:357: error: 'POWER_PSTATE' undeclared (first use in this function)
> drivers/cpufreq/cpufreq.c:357: error: (Each undeclared identifier is reported only once
> drivers/cpufreq/cpufreq.c:357: error: for each function it appears in.)
> arch/x86/kernel/process.c:375: error: 'POWER_CSTATE' undeclared (first use in this function)
> arch/x86/kernel/process.c:375: error: (Each undeclared identifier is reported only once
> arch/x86/kernel/process.c:375: error: for each function it appears in.)
> arch/x86/kernel/process.c:446: error: 'POWER_CSTATE' undeclared (first use in this function)
> arch/x86/kernel/process.c:463: error: 'POWER_CSTATE' undeclared (first use in this function)
> arch/x86/kernel/process.c:485: error: 'POWER_CSTATE' undeclared (first use in this function)
> include/trace/events/power.h:142: error: redefinition of 'trace_power_start'
>
> Config attached.

The problem is because power.h gets included mutliple times, and so
the POWER_ enum and the empty deprecated functions need to be
protected from that.

Here is a patch below that fixes it, compile tested with and without
CONFIG_EVENT_POWER_TRACING_DEPRECATED set.

Ingo, Thomas, please let me know if you want me tp refresh the patches
with that fix.

diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 00d9819..89db5a1 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -136,12 +136,24 @@ enum {
POWER_PSTATE = 2,
};
#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
-#else
+
+#else /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+enum {
+ POWER_NONE = 0,
+ POWER_CSTATE = 1,
+ POWER_PSTATE = 2,
+};
+
/* These dummy declaration have to be ripped out when the deprecated
events get removed */
static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
static inline void trace_power_end(u64 cpuid) {};
static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+
#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */

/*

Thanks,
Jean

>
> Note: please reuse the two commits from below for further work, i did some small
> cleanups to the commit text and to the patches.
>
> Thanks,
>
> ? ? ? ?Ingo
>
> ---------------->
> From 87a2cfbda3f53c3bf00c424ce18d97b03b0c3aa0 Mon Sep 17 00:00:00 2001
> From: Thomas Renninger <[email protected]>
> Date: Thu, 18 Nov 2010 10:25:13 +0100
> Subject: [PATCH] perf: Clean up power events
>
> Add these new power trace events:
>
> ?power:cpu_idle
> ?power:cpu_frequency
> ?power:machine_suspend
>
> The old C-state/idle accounting events:
> ?power:power_start
> ?power:power_end
>
> Have now a replacement (but we are still keeping the old
> tracepoints for compatibility):
>
> ?power:cpu_idle
>
> and
> ?power:power_frequency
>
> is replaced with:
> ?power:cpu_frequency
>
> power:machine_suspend is newly introduced.
>
> Jean Pihet has a patch integrated into the generic layer
> (kernel/power/suspend.c) which will make use of it.
>
> the type= field got removed from both, it was never
> used and the type is differed by the event type itself.
>
> perf timechart userspace tool gets adjusted in a separate patch.
>
> Signed-off-by: Thomas Renninger <[email protected]>
> Acked-by: Arjan van de Ven <[email protected]>
> Acked-by: Jean Pihet <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: [email protected]
> LKML-Reference: <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> ?arch/x86/kernel/process.c ? ?| ? ?7 +++-
> ?arch/x86/kernel/process_32.c | ? ?2 +-
> ?arch/x86/kernel/process_64.c | ? ?2 +
> ?drivers/cpufreq/cpufreq.c ? ?| ? ?1 +
> ?drivers/cpuidle/cpuidle.c ? ?| ? ?1 +
> ?drivers/idle/intel_idle.c ? ?| ? ?1 +
> ?include/trace/events/power.h | ? 86 +++++++++++++++++++++++++++++++++++++----
> ?kernel/trace/Kconfig ? ? ? ? | ? 15 +++++++
> ?kernel/trace/power-traces.c ?| ? ?3 +
> ?9 files changed, 107 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 57d1868..155d975 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -374,6 +374,7 @@ void default_idle(void)
> ?{
> ? ? ? ?if (hlt_use_halt()) {
> ? ? ? ? ? ? ? ?trace_power_start(POWER_CSTATE, 1, smp_processor_id());
> + ? ? ? ? ? ? ? trace_cpu_idle(1, smp_processor_id());
> ? ? ? ? ? ? ? ?current_thread_info()->status &= ~TS_POLLING;
> ? ? ? ? ? ? ? ?/*
> ? ? ? ? ? ? ? ? * TS_POLLING-cleared state must be visible before we
> @@ -444,6 +445,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
> ?void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
> ?{
> ? ? ? ?trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
> + ? ? ? trace_cpu_idle((ax>>4)+1, smp_processor_id());
> ? ? ? ?if (!need_resched()) {
> ? ? ? ? ? ? ? ?if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
> ? ? ? ? ? ? ? ? ? ? ? ?clflush((void *)&current_thread_info()->flags);
> @@ -460,6 +462,7 @@ static void mwait_idle(void)
> ?{
> ? ? ? ?if (!need_resched()) {
> ? ? ? ? ? ? ? ?trace_power_start(POWER_CSTATE, 1, smp_processor_id());
> + ? ? ? ? ? ? ? trace_cpu_idle(1, smp_processor_id());
> ? ? ? ? ? ? ? ?if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
> ? ? ? ? ? ? ? ? ? ? ? ?clflush((void *)&current_thread_info()->flags);
>
> @@ -481,10 +484,12 @@ static void mwait_idle(void)
> ?static void poll_idle(void)
> ?{
> ? ? ? ?trace_power_start(POWER_CSTATE, 0, smp_processor_id());
> + ? ? ? trace_cpu_idle(0, smp_processor_id());
> ? ? ? ?local_irq_enable();
> ? ? ? ?while (!need_resched())
> ? ? ? ? ? ? ? ?cpu_relax();
> - ? ? ? trace_power_end(0);
> + ? ? ? trace_power_end(smp_processor_id());
> + ? ? ? trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
> ?}
>
> ?/*
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index 96586c3..4b9befa 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -113,8 +113,8 @@ void cpu_idle(void)
> ? ? ? ? ? ? ? ? ? ? ? ?stop_critical_timings();
> ? ? ? ? ? ? ? ? ? ? ? ?pm_idle();
> ? ? ? ? ? ? ? ? ? ? ? ?start_critical_timings();
> -
> ? ? ? ? ? ? ? ? ? ? ? ?trace_power_end(smp_processor_id());
> + ? ? ? ? ? ? ? ? ? ? ? trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
> ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?tick_nohz_restart_sched_tick();
> ? ? ? ? ? ? ? ?preempt_enable_no_resched();
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index b3d7a3a..4c818a7 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -142,6 +142,8 @@ void cpu_idle(void)
> ? ? ? ? ? ? ? ? ? ? ? ?start_critical_timings();
>
> ? ? ? ? ? ? ? ? ? ? ? ?trace_power_end(smp_processor_id());
> + ? ? ? ? ? ? ? ? ? ? ? trace_cpu_idle(PWR_EVENT_EXIT,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?smp_processor_id());
>
> ? ? ? ? ? ? ? ? ? ? ? ?/* In many cases the interrupt that ended idle
> ? ? ? ? ? ? ? ? ? ? ? ? ? has already called exit_idle. But some idle
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index c63a438..1109f68 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
> ? ? ? ? ? ? ? ?dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
> ? ? ? ? ? ? ? ? ? ? ? ?(unsigned long)freqs->cpu);
> ? ? ? ? ? ? ? ?trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
> + ? ? ? ? ? ? ? trace_cpu_frequency(freqs->new, freqs->cpu);
> ? ? ? ? ? ? ? ?srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?CPUFREQ_POSTCHANGE, freqs);
> ? ? ? ? ? ? ? ?if (likely(policy) && likely(policy->cpu == freqs->cpu))
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index a507108..08d5f05 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -107,6 +107,7 @@ static void cpuidle_idle_call(void)
> ? ? ? ?if (cpuidle_curr_governor->reflect)
> ? ? ? ? ? ? ? ?cpuidle_curr_governor->reflect(dev);
> ? ? ? ?trace_power_end(smp_processor_id());
> + ? ? ? trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
> ?}
>
> ?/**
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index 3c95325..ba5134f 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -221,6 +221,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
>
> ? ? ? ?stop_critical_timings();
> ? ? ? ?trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
> + ? ? ? trace_cpu_idle((eax >> 4) + 1, cpu);
> ? ? ? ?if (!need_resched()) {
>
> ? ? ? ? ? ? ? ?__monitor((void *)&current_thread_info()->flags, 0, 0);
> diff --git a/include/trace/events/power.h b/include/trace/events/power.h
> index 286784d..00d9819 100644
> --- a/include/trace/events/power.h
> +++ b/include/trace/events/power.h
> @@ -7,16 +7,67 @@
> ?#include <linux/ktime.h>
> ?#include <linux/tracepoint.h>
>
> -#ifndef _TRACE_POWER_ENUM_
> -#define _TRACE_POWER_ENUM_
> -enum {
> - ? ? ? POWER_NONE ? ? ?= 0,
> - ? ? ? POWER_CSTATE ? ?= 1, ? ?/* C-State */
> - ? ? ? POWER_PSTATE ? ?= 2, ? ?/* Fequency change or DVFS */
> - ? ? ? POWER_SSTATE ? ?= 3, ? ?/* Suspend */
> -};
> +DECLARE_EVENT_CLASS(cpu,
> +
> + ? ? ? TP_PROTO(unsigned int state, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(state, cpu_id),
> +
> + ? ? ? TP_STRUCT__entry(
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?state ? ? ? ? ? )
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?cpu_id ? ? ? ? ?)
> + ? ? ? ),
> +
> + ? ? ? TP_fast_assign(
> + ? ? ? ? ? ? ? __entry->state = state;
> + ? ? ? ? ? ? ? __entry->cpu_id = cpu_id;
> + ? ? ? ),
> +
> + ? ? ? TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
> + ? ? ? ? ? ? ? ? (unsigned long)__entry->cpu_id)
> +);
> +
> +DEFINE_EVENT(cpu, cpu_idle,
> +
> + ? ? ? TP_PROTO(unsigned int state, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(state, cpu_id)
> +);
> +
> +/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
> +#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
> +#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
> +
> +#define PWR_EVENT_EXIT -1
> ?#endif
>
> +DEFINE_EVENT(cpu, cpu_frequency,
> +
> + ? ? ? TP_PROTO(unsigned int frequency, unsigned int cpu_id),
> +
> + ? ? ? TP_ARGS(frequency, cpu_id)
> +);
> +
> +TRACE_EVENT(machine_suspend,
> +
> + ? ? ? TP_PROTO(unsigned int state),
> +
> + ? ? ? TP_ARGS(state),
> +
> + ? ? ? TP_STRUCT__entry(
> + ? ? ? ? ? ? ? __field( ? ? ? ?u32, ? ? ? ? ? ?state ? ? ? ? ? )
> + ? ? ? ),
> +
> + ? ? ? TP_fast_assign(
> + ? ? ? ? ? ? ? __entry->state = state;
> + ? ? ? ),
> +
> + ? ? ? TP_printk("state=%lu", (unsigned long)__entry->state)
> +);
> +
> +/* This code will be removed after deprecation time exceeded (2.6.41) */
> +#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
> +
> ?/*
> ?* The power events are used for cpuidle & suspend (power_start, power_end)
> ?* ?and for cpufreq (power_frequency)
> @@ -75,6 +126,24 @@ TRACE_EVENT(power_end,
>
> ?);
>
> +/* Deprecated dummy functions must be protected against multi-declartion */
> +#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
> +#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
> +
> +enum {
> + ? ? ? POWER_NONE = 0,
> + ? ? ? POWER_CSTATE = 1,
> + ? ? ? POWER_PSTATE = 2,
> +};
> +#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
> +#else
> +/* These dummy declaration have to be ripped out when the deprecated
> + ? events get removed */
> +static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
> +static inline void trace_power_end(u64 cpuid) {};
> +static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
> +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
> +
> ?/*
> ?* The clock events are used for clock enable/disable and for
> ?* ?clock rate change
> @@ -153,7 +222,6 @@ DEFINE_EVENT(power_domain, power_domain_target,
>
> ? ? ? ?TP_ARGS(name, state, cpu_id)
> ?);
> -
> ?#endif /* _TRACE_POWER_H */
>
> ?/* This part must be outside protection */
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index e04b8bc..59b44a1 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -69,6 +69,21 @@ config EVENT_TRACING
> ? ? ? ?select CONTEXT_SWITCH_TRACER
> ? ? ? ?bool
>
> +config EVENT_POWER_TRACING_DEPRECATED
> + ? ? ? depends on EVENT_TRACING
> + ? ? ? bool "Deprecated power event trace API, to be removed"
> + ? ? ? default y
> + ? ? ? help
> + ? ? ? ? Provides old power event types:
> + ? ? ? ? C-state/idle accounting events:
> + ? ? ? ? power:power_start
> + ? ? ? ? power:power_end
> + ? ? ? ? and old cpufreq accounting event:
> + ? ? ? ? power:power_frequency
> + ? ? ? ? This is for userspace compatibility
> + ? ? ? ? and will vanish after 5 kernel iterations,
> + ? ? ? ? namely 2.6.41.
> +
> ?config CONTEXT_SWITCH_TRACER
> ? ? ? ?bool
>
> diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
> index 0e0497d..f55fcf6 100644
> --- a/kernel/trace/power-traces.c
> +++ b/kernel/trace/power-traces.c
> @@ -13,5 +13,8 @@
> ?#define CREATE_TRACE_POINTS
> ?#include <trace/events/power.h>
>
> +#ifdef EVENT_POWER_TRACING_DEPRECATED
> ?EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
> +#endif
> +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
>
>
> From b989c51b6f1989a834eecd9a64a7bd52ed230ea0 Mon Sep 17 00:00:00 2001
> From: Thomas Renninger <[email protected]>
> Date: Thu, 18 Nov 2010 10:25:12 +0100
> Subject: [PATCH] perf: Do not export power_frequency, but power_start event
>
> power_frequency moved to drivers/cpufreq/cpufreq.c which has
> to be compiled in, no need to export it.
>
> intel_idle can a be module though...
>
> Signed-off-by: Thomas Renninger <[email protected]>
> Acked-by: Jean Pihet <[email protected]>
> CC: Arjan van de Ven <[email protected]>
> Cc: [email protected]
> LKML-Reference: <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> ?drivers/idle/intel_idle.c ? | ? ?2 --
> ?kernel/trace/power-traces.c | ? ?2 +-
> ?2 files changed, 1 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index 41665d2..3c95325 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -220,9 +220,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
> ? ? ? ?kt_before = ktime_get_real();
>
> ? ? ? ?stop_critical_timings();
> -#ifndef MODULE
> ? ? ? ?trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
> -#endif
> ? ? ? ?if (!need_resched()) {
>
> ? ? ? ? ? ? ? ?__monitor((void *)&current_thread_info()->flags, 0, 0);
> diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
> index a22582a..0e0497d 100644
> --- a/kernel/trace/power-traces.c
> +++ b/kernel/trace/power-traces.c
> @@ -13,5 +13,5 @@
> ?#define CREATE_TRACE_POINTS
> ?#include <trace/events/power.h>
>
> -EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
>
>

2010-11-19 00:14:30

by Thomas Renninger

[permalink] [raw]
Subject: Re: [PATCH 2/3] PERF(kernel): Cleanup power events

On Thursday 18 November 2010 05:34:15 pm Jean Pihet wrote:
> On Thu, Nov 18, 2010 at 11:52 AM, Ingo Molnar <[email protected]> wrote:
> >
...
> The problem is because power.h gets included mutliple times, and so
> the POWER_ enum and the empty deprecated functions need to be
> protected from that.
>
> Here is a patch below that fixes it, compile tested with and without
> CONFIG_EVENT_POWER_TRACING_DEPRECATED set.
Yep, that should be the correct fix.
While I tested both options before, after the pre-processor mess ups, it
looks like I did test a lot of different archs/flavors through our
build service, but all .configs were set by default to
CONFIG_EVENT_POWER_TRACING_DEPRECATED=y
Stupid, sorry about that.

> Ingo, Thomas, please let me know if you want me tp refresh the patches
> with that fix.
I'll add it to the end, based on the one Ingo sent.
Ingo: As you started fiddling with it, is that enough or do you prefer
another whole patch series resend?

...

> > From b989c51b6f1989a834eecd9a64a7bd52ed230ea0 Mon Sep 17 00:00:00 2001
> > From: Thomas Renninger <[email protected]>
> > Date: Thu, 18 Nov 2010 10:25:12 +0100
> > Subject: [PATCH] perf: Do not export power_frequency, but power_start
> > event
This is an independent fix, you can just push it.

Thanks Jean/Ingo, hope that was the last remaining issue...

Thomas

--------
perf: Clean up power events

Add these new power trace events:

power:cpu_idle
power:cpu_frequency
power:machine_suspend

The old C-state/idle accounting events:
power:power_start
power:power_end

Have now a replacement (but we are still keeping the old
tracepoints for compatibility):

power:cpu_idle

and
power:power_frequency

is replaced with:
power:cpu_frequency

power:machine_suspend is newly introduced.

Jean Pihet has a patch integrated into the generic layer
(kernel/power/suspend.c) which will make use of it.

the type= field got removed from both, it was never
used and the type is differed by the event type itself.

perf timechart userspace tool gets adjusted in a separate patch.

Signed-off-by: Thomas Renninger <[email protected]>
Acked-by: Arjan van de Ven <[email protected]>
Acked-by: Jean Pihet <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: [email protected]
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868..155d975 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -374,6 +374,7 @@ void default_idle(void)
{
if (hlt_use_halt()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -444,6 +445,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
+ trace_cpu_idle((ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -460,6 +462,7 @@ static void mwait_idle(void)
{
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);

@@ -481,10 +484,12 @@ static void mwait_idle(void)
static void poll_idle(void)
{
trace_power_start(POWER_CSTATE, 0, smp_processor_id());
+ trace_cpu_idle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end(0);
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}

/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3..4b9befa 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -113,8 +113,8 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
-
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3d7a3a..4c818a7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -142,6 +142,8 @@ void cpu_idle(void)
start_critical_timings();

trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT,
+ smp_processor_id());

/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index c63a438..1109f68 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
(unsigned long)freqs->cpu);
trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
+ trace_cpu_frequency(freqs->new, freqs->cpu);
srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
CPUFREQ_POSTCHANGE, freqs);
if (likely(policy) && likely(policy->cpu == freqs->cpu))
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a507108..08d5f05 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -107,6 +107,7 @@ static void cpuidle_idle_call(void)
if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev);
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}

/**
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3c95325..ba5134f 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -221,6 +221,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)

stop_critical_timings();
trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
+ trace_cpu_idle((eax >> 4) + 1, cpu);
if (!need_resched()) {

__monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 286784d..46596ad 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -7,16 +7,67 @@
#include <linux/ktime.h>
#include <linux/tracepoint.h>

-#ifndef _TRACE_POWER_ENUM_
-#define _TRACE_POWER_ENUM_
-enum {
- POWER_NONE = 0,
- POWER_CSTATE = 1, /* C-State */
- POWER_PSTATE = 2, /* Fequency change or DVFS */
- POWER_SSTATE = 3, /* Suspend */
-};
+DECLARE_EVENT_CLASS(cpu,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ __field( u32, cpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ __entry->cpu_id = cpu_id;
+ ),
+
+ TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
+ (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_idle,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id)
+);
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
+
+#define PWR_EVENT_EXIT -1
#endif

+DEFINE_EVENT(cpu, cpu_frequency,
+
+ TP_PROTO(unsigned int frequency, unsigned int cpu_id),
+
+ TP_ARGS(frequency, cpu_id)
+);
+
+TRACE_EVENT(machine_suspend,
+
+ TP_PROTO(unsigned int state),
+
+ TP_ARGS(state),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ ),
+
+ TP_printk("state=%lu", (unsigned long)__entry->state)
+);
+
+/* This code will be removed after deprecation time exceeded (2.6.41) */
+#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+
/*
* The power events are used for cpuidle & suspend (power_start, power_end)
* and for cpufreq (power_frequency)
@@ -75,6 +126,35 @@ TRACE_EVENT(power_end,

);

+/* Deprecated dummy functions must be protected against multi-declartion */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+
+enum {
+ POWER_NONE = 0,
+ POWER_CSTATE = 1,
+ POWER_PSTATE = 2,
+};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+
+#else /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+enum {
+ POWER_NONE = 0,
+ POWER_CSTATE = 1,
+ POWER_PSTATE = 2,
+};
+
+/* These dummy declaration have to be ripped out when the deprecated
+ events get removed */
+static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
+static inline void trace_power_end(u64 cpuid) {};
+static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
/*
* The clock events are used for clock enable/disable and for
* clock rate change
@@ -153,7 +233,6 @@ DEFINE_EVENT(power_domain, power_domain_target,

TP_ARGS(name, state, cpu_id)
);
-
#endif /* _TRACE_POWER_H */

/* This part must be outside protection */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ea37e2f..14674dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool

+config EVENT_POWER_TRACING_DEPRECATED
+ depends on EVENT_TRACING
+ bool "Deprecated power event trace API, to be removed"
+ default y
+ help
+ Provides old power event types:
+ C-state/idle accounting events:
+ power:power_start
+ power:power_end
+ and old cpufreq accounting event:
+ power:power_frequency
+ This is for userspace compatibility
+ and will vanish after 5 kernel iterations,
+ namely 2.6.41.
+
config CONTEXT_SWITCH_TRACER
bool

diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 0e0497d..f55fcf6 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>

+#ifdef EVENT_POWER_TRACING_DEPRECATED
EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+#endif
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);