David Ahern (2):
perf tools: Add reference timestamp to perf header
perf tools: Add option to show time history of event samples
tools/perf/builtin-record.c | 8 +++
tools/perf/builtin-report.c | 131 ++++++++++++++++++++++++++++++++++++++++++-
tools/perf/util/header.c | 51 +++++++++++++++++
tools/perf/util/header.h | 3 +
tools/perf/util/session.c | 26 +++++++++
5 files changed, 217 insertions(+), 2 deletions(-)
--
1.7.2.3
Add a reference timestamp to the perf header - snapshotting kernel
time to gettimeofday. This allows 'perf report' to convert kernel
timestamps to time-of-day which is convenient for comparing to other
(non perf related) log files.
The timestamp is added using a feature bit for compatibility with
older binaries and data files.
Signed-off-by: David Ahern <[email protected]>
---
tools/perf/util/header.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
tools/perf/util/header.h | 3 ++
tools/perf/util/session.c | 26 +++++++++++++++++++++++
3 files changed, 80 insertions(+), 0 deletions(-)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 76e949a..2fe893b 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -191,6 +191,40 @@ static int write_padded(int fd, const void *bf, size_t count,
return err;
}
+static int perf_header__read_ref_time(struct perf_header *header,
+ int fd, u64 offset, u64 size)
+{
+ size_t sz_nsec = sizeof(header->nsec_ref);
+ size_t sz_tv = sizeof(header->tv_ref);
+ int err = -1;
+
+ if (((size - offset) < (sz_nsec + sz_tv)) ||
+ (read(fd, &header->nsec_ref, sz_nsec) != (ssize_t) sz_nsec) ||
+ (read(fd, &header->tv_ref, sz_tv) != (ssize_t) sz_tv))
+ goto out;
+
+ err = 0;
+
+out:
+ return err;
+}
+
+static int perf_header__write_ref_time(struct perf_header *header, int fd)
+{
+ size_t sz_nsec = sizeof(header->nsec_ref);
+ size_t sz_tv = sizeof(header->tv_ref);
+ int err = -1;
+
+ if ((write(fd, &header->nsec_ref, sz_nsec) != (ssize_t) sz_nsec) ||
+ (write(fd, &header->tv_ref, sz_tv) != (ssize_t) sz_tv))
+ goto out;
+
+ err = 0;
+
+out:
+ return err;
+}
+
#define dsos__for_each_with_build_id(pos, head) \
list_for_each_entry(pos, head, node) \
if (!pos->has_build_id) \
@@ -483,6 +517,19 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
perf_session__cache_build_ids(session);
}
+ if (perf_header__has_feat(self, HEADER_REFERENCE_TIME)) {
+ struct perf_file_section *tref_sec;
+
+ tref_sec = &feat_sec[idx++];
+ tref_sec->offset = lseek(fd, 0, SEEK_CUR);
+ err = perf_header__write_ref_time(self, fd);
+ if (err < 0) {
+ pr_debug("failed to write reference time\n");
+ goto out_free;
+ }
+ tref_sec->size = lseek(fd, 0, SEEK_CUR) - tref_sec->offset;
+ }
+
lseek(fd, sec_start, SEEK_SET);
err = do_write(fd, feat_sec, sec_size);
if (err < 0)
@@ -810,6 +857,10 @@ static int perf_file_section__process(struct perf_file_section *self,
if (perf_header__read_build_ids(ph, fd, self->offset, self->size))
pr_debug("Failed to read buildids, continuing...\n");
break;
+ case HEADER_REFERENCE_TIME:
+ if (perf_header__read_ref_time(ph, fd, self->offset, self->size))
+ pr_debug("Failed to read reference time, continuing...\n");
+ break;
default:
pr_debug("unknown feature %d, continuing...\n", feat);
}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 6335965..0721c78 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -19,6 +19,7 @@ struct perf_header_attr {
enum {
HEADER_TRACE_INFO = 1,
HEADER_BUILD_ID,
+ HEADER_REFERENCE_TIME,
HEADER_LAST_FEATURE,
};
@@ -59,6 +60,8 @@ struct perf_header {
u64 data_size;
u64 event_offset;
u64 event_size;
+ u64 nsec_ref;
+ struct timeval tv_ref;
DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
};
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 3074d38..720838e 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -110,6 +110,29 @@ void perf_session__update_sample_type(struct perf_session *self)
perf_session__id_header_size(self);
}
+static int perf_session__create_ref_time(struct perf_session *session)
+{
+ struct timespec tp;
+
+ /* race here between successive calls, but should be close enough */
+ if (gettimeofday(&session->header.tv_ref, NULL) != 0) {
+ pr_err("gettimeofday failed. Cannot set reference time.\n");
+ return -1;
+ }
+
+ if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0) {
+ pr_err("clock_gettime failed. Cannot set reference time.\n");
+ return -1;
+ }
+
+ session->header.nsec_ref = (u64) tp.tv_sec * NSEC_PER_SEC
+ + (u64) tp.tv_nsec;
+
+ perf_header__set_feat(&session->header, HEADER_REFERENCE_TIME);
+
+ return 0;
+}
+
int perf_session__create_kernel_maps(struct perf_session *self)
{
int ret = machine__create_kernel_maps(&self->host_machine);
@@ -167,6 +190,9 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
*/
if (perf_session__create_kernel_maps(self) < 0)
goto out_delete;
+
+ if (perf_session__create_ref_time(self) < 0)
+ goto out_delete;
}
perf_session__update_sample_type(self);
--
1.7.2.3
Add a timehist option to perf record to save cpu and kernel timestamp
with each sample.
Add timehist option to perf report to display a pretty-print format
of the timestamp, cpu, and event data each event rather than
generating a histogram. The timehist option leverages the reference
timestamp from the perf header to correlate kernel timestamps to
time-of-day which allows correlating with external (non-perf)
log files.
As an example, consider recording context switches:
perf record -e cs -c 1 -ga --timehist
Today the 'perf report -D' option gives output in the form
0 182392227877 0x17d8f8 [0xd8]: PERF_RECORD_SAMPLE(IP, 1): \
1861/1861: 0xffffffff810348d2 period: 2057672
... chain: nr:21
..... 0: ffffffffffffff80
..... 1: ffffffff810348d2
..... 2: ffffffff8139c02b
..... 3: ffffffffa004b4a3
..... 4: ffffffff8139c79c
..... 5: ffffffff8139c83d
..... 6: ffffffffa004b43f
..... 7: ffffffffa00ec135
..... 8: ffffffffa00ec214
..... 9: ffffffffa00f01ec
..... 10: ffffffffa00f0439
..... 11: ffffffffa00d6e8b
..... 12: ffffffff8110b763
..... 13: ffffffff8110b883
..... 14: ffffffff8110ddcd
..... 15: ffffffff8110e45d
..... 16: ffffffff81101f36
..... 17: ffffffff81101ff3
..... 18: ffffffff81002c82
..... 19: fffffffffffffe00
..... 20: 00002acfd31ce050
... thread: cc1:1861
...... dso: /lib/modules/2.6.37-rc5/build/vmlinux
With the timehist option 'perf report --timehist' displays the same sample as
(lines wrapped to < 80 and truncated here):
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff810348d2 perf_event_task_sched_out ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff8139c02b schedule ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffffa004b4a3 rpc_wait_bit_killable (/lib/modules/2.6.37-rc5/...
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff8139c79c __wait_on_bit ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff8139c83d out_of_line_wait_on_bit ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffffa004b43f __rpc_wait_for_completion_task (/lib/modules/...
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffffa00ec135 nfs4_wait_for_completion_rpc_task (/lib/modules/...
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffffa00ec214 nfs4_run_open_task (/lib/modules/2.6.37-rc5/...
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffffa00f01ec nfs4_do_open.clone.21 (/lib/modules/2.6.37-rc5/...
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffffa00f0439 nfs4_atomic_open (/lib/modules/2.6.37-rc5/...
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffffa00d6e8b nfs_open_revalidate (/lib/modules/2.6.37-rc5/...
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff8110b763 do_revalidate ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff8110b883 do_lookup ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff8110ddcd do_last ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff8110e45d do_filp_open ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff81101f36 do_sys_open ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff81101ff3 sys_open ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
ffffffff81002c82 system_call ([kernel.kallsyms])
17:11:10.174502 182392227877 0 cc1 1861 \
2acfd31ce050 __GI___libc_open (/lib64/libc-2.12.90.so)
While the line lengths are horrible for a git commit message and email,
the data is indispensable. It allows you to see each schedule out point,
where the process is when it gets scheduled out and, with additional
processing on the timehist output, time between schedule-in events and
time on the processor.
Signed-off-by: David Ahern <[email protected]>
---
tools/perf/builtin-record.c | 8 +++
tools/perf/builtin-report.c | 131 ++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 137 insertions(+), 2 deletions(-)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 699dd21..cf30c98 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -51,6 +51,7 @@ static int realtime_prio = 0;
static bool raw_samples = false;
static bool sample_id_all_avail = true;
static bool system_wide = false;
+static bool time_history = false;
static pid_t target_pid = -1;
static pid_t target_tid = -1;
static pid_t *all_tids = NULL;
@@ -288,6 +289,11 @@ static void create_counter(int counter, int cpu)
if (sample_time)
attr->sample_type |= PERF_SAMPLE_TIME;
+ if (time_history) {
+ attr->sample_type |= PERF_SAMPLE_TIME;
+ attr->sample_type |= PERF_SAMPLE_CPU;
+ }
+
if (raw_samples) {
attr->sample_type |= PERF_SAMPLE_TIME;
attr->sample_type |= PERF_SAMPLE_RAW;
@@ -863,6 +869,8 @@ const struct option record_options[] = {
"do not update the buildid cache"),
OPT_BOOLEAN('B', "no-buildid", &no_buildid,
"do not collect buildids in perf.data"),
+ OPT_BOOLEAN(0, "timehist", &time_history,
+ "collect data for time history report"),
OPT_END()
};
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 904519f..bcde3c2 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -35,6 +35,7 @@ static char const *input_name = "perf.data";
static bool force, use_tui, use_stdio;
static bool hide_unresolved;
static bool dont_use_callchains;
+static bool time_history;
static bool show_threads;
static struct perf_read_values show_threads_values;
@@ -124,6 +125,117 @@ out_free_syms:
return err;
}
+static const char *timestr(u64 timestamp, struct perf_session *session)
+{
+ struct tm ltime;
+ u64 dt;
+ struct timeval tv_dt, tv_res;
+ static char tstr[64];
+
+ dt = timestamp - session->header.nsec_ref;
+ tv_dt.tv_sec = (time_t) (dt / NSEC_PER_SEC);
+ tv_dt.tv_usec = (suseconds_t) ((dt - tv_dt.tv_sec * NSEC_PER_SEC) / 1000);
+
+ timeradd(&session->header.tv_ref, &tv_dt, &tv_res);
+
+ if ((session->header.nsec_ref == 0) ||
+ (localtime_r(&tv_res.tv_sec, <ime) == NULL)) {
+ snprintf(tstr, sizeof(tstr), "00:00:00.000000 %16Ld", timestamp);
+ } else {
+ char date[64];
+ strftime(date, sizeof(date), "%H:%M:%S", <ime);
+ snprintf(tstr, sizeof(tstr), "%s.%06ld %16Ld",
+ date, tv_res.tv_usec, timestamp);
+ }
+
+ return tstr;
+}
+
+#define TIMEHIST_FMT "%32s %3d %-16s %5d %16Lx %s (%s)\n"
+
+static void timehist_header(void)
+{
+ printf("%32s %3s %-16s %5s %16s %s (%s)\n",
+ " Time-of-Day Kernel Timestamp",
+ "cpu", "Command Name", "PID",
+ " IP ", "Symbol Name", "DSO Name");
+}
+
+static int perf_session__print_sample(struct perf_session *session,
+ struct addr_location *al,
+ struct sample_data *data)
+{
+ static int show_timehist_error = 1;
+ u64 timestamp = 0;
+ struct map_symbol *syms = NULL;
+ struct symbol *parent = NULL;
+ const char *tstr;
+ bool need_spacer = false;
+
+ if (show_timehist_error && ((data->cpu == (u32) -1) ||
+ !(session->sample_type & PERF_SAMPLE_TIME))) {
+ pr_err("Data for time history missing in perf event samples.\n"
+ "Did you record with --timehist option?\n");
+ show_timehist_error = 0;
+ }
+
+ if (session->sample_type & PERF_SAMPLE_TIME)
+ timestamp = data->time;
+
+ tstr = timestr(timestamp, session);
+
+
+ if ((symbol_conf.use_callchain) && data->callchain) {
+ u64 i;
+
+ syms = perf_session__resolve_callchain(session, al->thread,
+ data->callchain, &parent);
+ if (syms == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < data->callchain->nr; ++i) {
+ const char *symname = "", *dsoname = "";
+
+ if (data->callchain->ips[i] >= PERF_CONTEXT_MAX)
+ continue;
+
+ if (syms[i].sym && syms[i].sym->name)
+ symname = syms[i].sym->name;
+ else if (hide_unresolved)
+ continue;
+
+ if (syms[i].map && syms[i].map->dso &&
+ syms[i].map->dso->name)
+ dsoname = syms[i].map->dso->name;
+ else if (hide_unresolved)
+ continue;
+
+ printf(TIMEHIST_FMT,
+ tstr, data->cpu,
+ al->thread->comm, al->thread->pid,
+ data->callchain->ips[i], symname, dsoname);
+
+ need_spacer = true;
+ }
+
+ free(syms);
+
+ } else {
+ printf(TIMEHIST_FMT,
+ tstr, data->cpu,
+ al->thread->comm, al->thread->pid, al->addr,
+ al->sym->name, al->map->dso->name);
+
+ need_spacer = true;
+ }
+
+ /* put a gap between records */
+ if (need_spacer)
+ printf("\n");
+
+ return 0;
+}
+
static int add_event_total(struct perf_session *session,
struct sample_data *data,
struct perf_event_attr *attr)
@@ -165,7 +277,9 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
if (al.filtered || (hide_unresolved && al.sym == NULL))
return 0;
- if (perf_session__add_hist_entry(session, &al, sample)) {
+ if (time_history) {
+ perf_session__print_sample(session, &al, sample);
+ } else if (perf_session__add_hist_entry(session, &al, sample)) {
pr_debug("problem incrementing symbol period, skipping event\n");
return -1;
}
@@ -319,6 +433,14 @@ static int __cmd_report(void)
if (ret)
goto out_delete;
+ if (time_history) {
+ if (!session->header.nsec_ref) {
+ pr_err("Reference timestamp missing in perf.data file.\n"
+ "Cannot convert kernel timestamps to time-of-day.\n");
+ }
+ timehist_header();
+ }
+
ret = perf_session__process_events(session, &event_ops);
if (ret)
goto out_delete;
@@ -334,6 +456,9 @@ static int __cmd_report(void)
if (verbose > 2)
perf_session__fprintf_dsos(session, stdout);
+ if (time_history)
+ goto out_delete;
+
next = rb_first(&session->hists_tree);
while (next) {
struct hists *hists;
@@ -479,6 +604,8 @@ static const struct option options[] = {
"columns '.' is reserved."),
OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved,
"Only display entries resolved to a symbol"),
+ OPT_BOOLEAN(0, "timehist", &time_history,
+ "Dump time history of event samples"),
OPT_END()
};
@@ -486,7 +613,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
{
argc = parse_options(argc, argv, options, report_usage, 0);
- if (use_stdio)
+ if (use_stdio || time_history)
use_browser = 0;
else if (use_tui)
use_browser = 1;
--
1.7.2.3
Em Tue, Dec 07, 2010 at 06:54:45PM -0700, David Ahern escreveu:
> Add a timehist option to perf record to save cpu and kernel timestamp
> with each sample.
>
> Add timehist option to perf report to display a pretty-print format
> of the timestamp, cpu, and event data each event rather than
> generating a histogram. The timehist option leverages the reference
> timestamp from the perf header to correlate kernel timestamps to
> time-of-day which allows correlating with external (non-perf)
> log files.
>
> As an example, consider recording context switches:
> perf record -e cs -c 1 -ga --timehist
>
> Today the 'perf report -D' option gives output in the form
> 0 182392227877 0x17d8f8 [0xd8]: PERF_RECORD_SAMPLE(IP, 1): \
> 1861/1861: 0xffffffff810348d2 period: 2057672
> ... chain: nr:21
> ..... 0: ffffffffffffff80
> ..... 1: ffffffff810348d2
> ..... 2: ffffffff8139c02b
> ..... 3: ffffffffa004b4a3
> ..... 4: ffffffff8139c79c
> ..... 5: ffffffff8139c83d
> ..... 6: ffffffffa004b43f
> ..... 7: ffffffffa00ec135
> ..... 8: ffffffffa00ec214
> ..... 9: ffffffffa00f01ec
> ..... 10: ffffffffa00f0439
> ..... 11: ffffffffa00d6e8b
> ..... 12: ffffffff8110b763
> ..... 13: ffffffff8110b883
> ..... 14: ffffffff8110ddcd
> ..... 15: ffffffff8110e45d
> ..... 16: ffffffff81101f36
> ..... 17: ffffffff81101ff3
> ..... 18: ffffffff81002c82
> ..... 19: fffffffffffffe00
> ..... 20: 00002acfd31ce050
> ... thread: cc1:1861
> ...... dso: /lib/modules/2.6.37-rc5/build/vmlinux
>
> With the timehist option 'perf report --timehist' displays the same sample as
> (lines wrapped to < 80 and truncated here):
You can instead compress the same info as:
-------------------------------
With the timehist option 'perf report --timehist' displays the same
sample as (some fields removed/compressed to make it fit under 80 columns)
17:11:10.1 cc1 61 ff810348d2 perf_event_task_sched_out ([kernel.kallsyms])
17:11:10.1 cc1 61 ff8139c02b schedule ([kernel.kallsyms])
17:11:10.1 cc1 61 ffa004b4a3 rpc_wait_bit_killable (/lib/modules/2.6.37-rc5/...
17:11:10.1 cc1 61 ff8139c79c __wait_on_bit ([kernel.kallsyms])
17:11:10.1 cc1 61 ff8139c83d out_of_line_wait_on_bit ([kernel.kallsyms])
17:11:10.1 cc1 61 ffa004b43f __rpc_wait_for_completion_task (/lib/modules/...
17:11:10.1 cc1 61 ffa00ec135 nfs4_wait_for_completion_rpc_task (/lib/modules/...
17:11:10.1 cc1 61 ffa00ec214 nfs4_run_open_task (/lib/modules/2.6.37-rc5/...
17:11:10.1 cc1 61 ffa00f01ec nfs4_do_open.clone.21 (/lib/modules/2.6.37-rc5/...
17:11:10.1 cc1 61 ffa00f0439 nfs4_atomic_open (/lib/modules/2.6.37-rc5/...
17:11:10.1 cc1 61 ffa00d6e8b nfs_open_revalidate (/lib/modules/2.6.37-rc5/...
17:11:10.1 cc1 61 ff8110b763 do_revalidate ([kernel.kallsyms])
17:11:10.1 cc1 61 ff8110b883 do_lookup ([kernel.kallsyms])
17:11:10.1 cc1 61 ff8110ddcd do_last ([kernel.kallsyms])
17:11:10.1 cc1 61 ff8110e45d do_filp_open ([kernel.kallsyms])
17:11:10.1 cc1 61 ff81101f36 do_sys_open ([kernel.kallsyms])
17:11:10.1 cc1 61 ff81101ff3 sys_open ([kernel.kallsyms])
17:11:10.1 cc1 61 ff81002c82 system_call ([kernel.kallsyms])
17:11:10.1 cc1 61 1ce050 __GI___libc_open (/lib64/libc-2.12.90.so)
It allows you to see each schedule out point, where the process is when
it gets scheduled out and, with additional processing on the timehist
output, time between schedule-in events and time on the processor.
-----------------------------
changing the PID and addresses on the 'perf record -D' output to match the
shorter PID, etc, helps too.
will continue reviewing later, thanks
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff810348d2 perf_event_task_sched_out ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8139c02b schedule ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa004b4a3 rpc_wait_bit_killable (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8139c79c __wait_on_bit ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8139c83d out_of_line_wait_on_bit ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa004b43f __rpc_wait_for_completion_task (/lib/modules/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00ec135 nfs4_wait_for_completion_rpc_task (/lib/modules/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00ec214 nfs4_run_open_task (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00f01ec nfs4_do_open.clone.21 (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00f0439 nfs4_atomic_open (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00d6e8b nfs_open_revalidate (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8110b763 do_revalidate ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8110b883 do_lookup ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8110ddcd do_last ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8110e45d do_filp_open ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff81101f36 do_sys_open ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff81101ff3 sys_open ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff81002c82 system_call ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> 2acfd31ce050 __GI___libc_open (/lib64/libc-2.12.90.so)
>
> While the line lengths are horrible for a git commit message and email,
> the data is indispensable. It allows you to see each schedule out point,
> where the process is when it gets scheduled out and, with additional
> processing on the timehist output, time between schedule-in events and
> time on the processor.
>
> Signed-off-by: David Ahern <[email protected]>
> ---
> tools/perf/builtin-record.c | 8 +++
> tools/perf/builtin-report.c | 131 ++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 137 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 699dd21..cf30c98 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -51,6 +51,7 @@ static int realtime_prio = 0;
> static bool raw_samples = false;
> static bool sample_id_all_avail = true;
> static bool system_wide = false;
> +static bool time_history = false;
> static pid_t target_pid = -1;
> static pid_t target_tid = -1;
> static pid_t *all_tids = NULL;
> @@ -288,6 +289,11 @@ static void create_counter(int counter, int cpu)
> if (sample_time)
> attr->sample_type |= PERF_SAMPLE_TIME;
>
> + if (time_history) {
> + attr->sample_type |= PERF_SAMPLE_TIME;
> + attr->sample_type |= PERF_SAMPLE_CPU;
> + }
> +
> if (raw_samples) {
> attr->sample_type |= PERF_SAMPLE_TIME;
> attr->sample_type |= PERF_SAMPLE_RAW;
> @@ -863,6 +869,8 @@ const struct option record_options[] = {
> "do not update the buildid cache"),
> OPT_BOOLEAN('B', "no-buildid", &no_buildid,
> "do not collect buildids in perf.data"),
> + OPT_BOOLEAN(0, "timehist", &time_history,
> + "collect data for time history report"),
> OPT_END()
> };
>
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 904519f..bcde3c2 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -35,6 +35,7 @@ static char const *input_name = "perf.data";
> static bool force, use_tui, use_stdio;
> static bool hide_unresolved;
> static bool dont_use_callchains;
> +static bool time_history;
>
> static bool show_threads;
> static struct perf_read_values show_threads_values;
> @@ -124,6 +125,117 @@ out_free_syms:
> return err;
> }
>
> +static const char *timestr(u64 timestamp, struct perf_session *session)
> +{
> + struct tm ltime;
> + u64 dt;
> + struct timeval tv_dt, tv_res;
> + static char tstr[64];
> +
> + dt = timestamp - session->header.nsec_ref;
> + tv_dt.tv_sec = (time_t) (dt / NSEC_PER_SEC);
> + tv_dt.tv_usec = (suseconds_t) ((dt - tv_dt.tv_sec * NSEC_PER_SEC) / 1000);
> +
> + timeradd(&session->header.tv_ref, &tv_dt, &tv_res);
> +
> + if ((session->header.nsec_ref == 0) ||
> + (localtime_r(&tv_res.tv_sec, <ime) == NULL)) {
> + snprintf(tstr, sizeof(tstr), "00:00:00.000000 %16Ld", timestamp);
> + } else {
> + char date[64];
> + strftime(date, sizeof(date), "%H:%M:%S", <ime);
> + snprintf(tstr, sizeof(tstr), "%s.%06ld %16Ld",
> + date, tv_res.tv_usec, timestamp);
> + }
> +
> + return tstr;
> +}
> +
> +#define TIMEHIST_FMT "%32s %3d %-16s %5d %16Lx %s (%s)\n"
> +
> +static void timehist_header(void)
> +{
> + printf("%32s %3s %-16s %5s %16s %s (%s)\n",
> + " Time-of-Day Kernel Timestamp",
> + "cpu", "Command Name", "PID",
> + " IP ", "Symbol Name", "DSO Name");
> +}
> +
> +static int perf_session__print_sample(struct perf_session *session,
> + struct addr_location *al,
> + struct sample_data *data)
> +{
> + static int show_timehist_error = 1;
> + u64 timestamp = 0;
> + struct map_symbol *syms = NULL;
> + struct symbol *parent = NULL;
> + const char *tstr;
> + bool need_spacer = false;
> +
> + if (show_timehist_error && ((data->cpu == (u32) -1) ||
> + !(session->sample_type & PERF_SAMPLE_TIME))) {
> + pr_err("Data for time history missing in perf event samples.\n"
> + "Did you record with --timehist option?\n");
> + show_timehist_error = 0;
> + }
> +
> + if (session->sample_type & PERF_SAMPLE_TIME)
> + timestamp = data->time;
> +
> + tstr = timestr(timestamp, session);
> +
> +
> + if ((symbol_conf.use_callchain) && data->callchain) {
> + u64 i;
> +
> + syms = perf_session__resolve_callchain(session, al->thread,
> + data->callchain, &parent);
> + if (syms == NULL)
> + return -ENOMEM;
> +
> + for (i = 0; i < data->callchain->nr; ++i) {
> + const char *symname = "", *dsoname = "";
> +
> + if (data->callchain->ips[i] >= PERF_CONTEXT_MAX)
> + continue;
> +
> + if (syms[i].sym && syms[i].sym->name)
> + symname = syms[i].sym->name;
> + else if (hide_unresolved)
> + continue;
> +
> + if (syms[i].map && syms[i].map->dso &&
> + syms[i].map->dso->name)
> + dsoname = syms[i].map->dso->name;
> + else if (hide_unresolved)
> + continue;
> +
> + printf(TIMEHIST_FMT,
> + tstr, data->cpu,
> + al->thread->comm, al->thread->pid,
> + data->callchain->ips[i], symname, dsoname);
> +
> + need_spacer = true;
> + }
> +
> + free(syms);
> +
> + } else {
> + printf(TIMEHIST_FMT,
> + tstr, data->cpu,
> + al->thread->comm, al->thread->pid, al->addr,
> + al->sym->name, al->map->dso->name);
> +
> + need_spacer = true;
> + }
> +
> + /* put a gap between records */
> + if (need_spacer)
> + printf("\n");
> +
> + return 0;
> +}
> +
> static int add_event_total(struct perf_session *session,
> struct sample_data *data,
> struct perf_event_attr *attr)
> @@ -165,7 +277,9 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
> if (al.filtered || (hide_unresolved && al.sym == NULL))
> return 0;
>
> - if (perf_session__add_hist_entry(session, &al, sample)) {
> + if (time_history) {
> + perf_session__print_sample(session, &al, sample);
> + } else if (perf_session__add_hist_entry(session, &al, sample)) {
> pr_debug("problem incrementing symbol period, skipping event\n");
> return -1;
> }
> @@ -319,6 +433,14 @@ static int __cmd_report(void)
> if (ret)
> goto out_delete;
>
> + if (time_history) {
> + if (!session->header.nsec_ref) {
> + pr_err("Reference timestamp missing in perf.data file.\n"
> + "Cannot convert kernel timestamps to time-of-day.\n");
> + }
> + timehist_header();
> + }
> +
> ret = perf_session__process_events(session, &event_ops);
> if (ret)
> goto out_delete;
> @@ -334,6 +456,9 @@ static int __cmd_report(void)
> if (verbose > 2)
> perf_session__fprintf_dsos(session, stdout);
>
> + if (time_history)
> + goto out_delete;
> +
> next = rb_first(&session->hists_tree);
> while (next) {
> struct hists *hists;
> @@ -479,6 +604,8 @@ static const struct option options[] = {
> "columns '.' is reserved."),
> OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved,
> "Only display entries resolved to a symbol"),
> + OPT_BOOLEAN(0, "timehist", &time_history,
> + "Dump time history of event samples"),
> OPT_END()
> };
>
> @@ -486,7 +613,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
> {
> argc = parse_options(argc, argv, options, report_usage, 0);
>
> - if (use_stdio)
> + if (use_stdio || time_history)
> use_browser = 0;
> else if (use_tui)
> use_browser = 1;
> --
> 1.7.2.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-perf-users" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Dec 07, 2010 at 06:54:44PM -0700, David Ahern wrote:
> Add a reference timestamp to the perf header - snapshotting kernel
> time to gettimeofday. This allows 'perf report' to convert kernel
> timestamps to time-of-day which is convenient for comparing to other
> (non perf related) log files.
>
> The timestamp is added using a feature bit for compatibility with
> older binaries and data files.
>
> Signed-off-by: David Ahern <[email protected]>
> ---
> tools/perf/util/header.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
> tools/perf/util/header.h | 3 ++
> tools/perf/util/session.c | 26 +++++++++++++++++++++++
> 3 files changed, 80 insertions(+), 0 deletions(-)
>
> diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
> index 76e949a..2fe893b 100644
> --- a/tools/perf/util/header.c
> +++ b/tools/perf/util/header.c
> @@ -191,6 +191,40 @@ static int write_padded(int fd, const void *bf, size_t count,
> return err;
> }
>
> +static int perf_header__read_ref_time(struct perf_header *header,
> + int fd, u64 offset, u64 size)
> +{
> + size_t sz_nsec = sizeof(header->nsec_ref);
> + size_t sz_tv = sizeof(header->tv_ref);
> + int err = -1;
> +
> + if (((size - offset) < (sz_nsec + sz_tv)) ||
> + (read(fd, &header->nsec_ref, sz_nsec) != (ssize_t) sz_nsec) ||
> + (read(fd, &header->tv_ref, sz_tv) != (ssize_t) sz_tv))
> + goto out;
Hmm, could we have endianness related troubles if we write the timespec on
an arch and cross read from another, or other cross read issues?
> +
> + err = 0;
> +
> +out:
> + return err;
> +}
> +
> +static int perf_header__write_ref_time(struct perf_header *header, int fd)
> +{
> + size_t sz_nsec = sizeof(header->nsec_ref);
> + size_t sz_tv = sizeof(header->tv_ref);
> + int err = -1;
> +
> + if ((write(fd, &header->nsec_ref, sz_nsec) != (ssize_t) sz_nsec) ||
> + (write(fd, &header->tv_ref, sz_tv) != (ssize_t) sz_tv))
> + goto out;
> +
> + err = 0;
> +
> +out:
> + return err;
> +}
> +
> #define dsos__for_each_with_build_id(pos, head) \
> list_for_each_entry(pos, head, node) \
> if (!pos->has_build_id) \
> @@ -483,6 +517,19 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
> perf_session__cache_build_ids(session);
> }
>
> + if (perf_header__has_feat(self, HEADER_REFERENCE_TIME)) {
> + struct perf_file_section *tref_sec;
> +
> + tref_sec = &feat_sec[idx++];
> + tref_sec->offset = lseek(fd, 0, SEEK_CUR);
> + err = perf_header__write_ref_time(self, fd);
> + if (err < 0) {
> + pr_debug("failed to write reference time\n");
> + goto out_free;
> + }
> + tref_sec->size = lseek(fd, 0, SEEK_CUR) - tref_sec->offset;
> + }
> +
> lseek(fd, sec_start, SEEK_SET);
> err = do_write(fd, feat_sec, sec_size);
> if (err < 0)
> @@ -810,6 +857,10 @@ static int perf_file_section__process(struct perf_file_section *self,
> if (perf_header__read_build_ids(ph, fd, self->offset, self->size))
> pr_debug("Failed to read buildids, continuing...\n");
> break;
> + case HEADER_REFERENCE_TIME:
> + if (perf_header__read_ref_time(ph, fd, self->offset, self->size))
> + pr_debug("Failed to read reference time, continuing...\n");
> + break;
> default:
> pr_debug("unknown feature %d, continuing...\n", feat);
> }
> diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
> index 6335965..0721c78 100644
> --- a/tools/perf/util/header.h
> +++ b/tools/perf/util/header.h
> @@ -19,6 +19,7 @@ struct perf_header_attr {
> enum {
> HEADER_TRACE_INFO = 1,
> HEADER_BUILD_ID,
> + HEADER_REFERENCE_TIME,
> HEADER_LAST_FEATURE,
> };
>
> @@ -59,6 +60,8 @@ struct perf_header {
> u64 data_size;
> u64 event_offset;
> u64 event_size;
> + u64 nsec_ref;
> + struct timeval tv_ref;
> DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
> };
>
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 3074d38..720838e 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -110,6 +110,29 @@ void perf_session__update_sample_type(struct perf_session *self)
> perf_session__id_header_size(self);
> }
>
> +static int perf_session__create_ref_time(struct perf_session *session)
> +{
> + struct timespec tp;
> +
> + /* race here between successive calls, but should be close enough */
> + if (gettimeofday(&session->header.tv_ref, NULL) != 0) {
> + pr_err("gettimeofday failed. Cannot set reference time.\n");
> + return -1;
> + }
> +
> + if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0) {
> + pr_err("clock_gettime failed. Cannot set reference time.\n");
> + return -1;
> + }
> +
> + session->header.nsec_ref = (u64) tp.tv_sec * NSEC_PER_SEC
> + + (u64) tp.tv_nsec;
> +
> + perf_header__set_feat(&session->header, HEADER_REFERENCE_TIME);
> +
> + return 0;
> +}
> +
> int perf_session__create_kernel_maps(struct perf_session *self)
> {
> int ret = machine__create_kernel_maps(&self->host_machine);
> @@ -167,6 +190,9 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
> */
> if (perf_session__create_kernel_maps(self) < 0)
> goto out_delete;
> +
> + if (perf_session__create_ref_time(self) < 0)
> + goto out_delete;
So, it does record it anytime?
> }
>
> perf_session__update_sample_type(self);
Other than that, looks good!
On Tue, Dec 07, 2010 at 06:54:45PM -0700, David Ahern wrote:
> Add a timehist option to perf record to save cpu and kernel timestamp
> with each sample.
>
> Add timehist option to perf report to display a pretty-print format
> of the timestamp, cpu, and event data each event rather than
> generating a histogram. The timehist option leverages the reference
> timestamp from the perf header to correlate kernel timestamps to
> time-of-day which allows correlating with external (non-perf)
> log files.
>
> As an example, consider recording context switches:
> perf record -e cs -c 1 -ga --timehist
>
> Today the 'perf report -D' option gives output in the form
> 0 182392227877 0x17d8f8 [0xd8]: PERF_RECORD_SAMPLE(IP, 1): \
> 1861/1861: 0xffffffff810348d2 period: 2057672
> ... chain: nr:21
> ..... 0: ffffffffffffff80
> ..... 1: ffffffff810348d2
> ..... 2: ffffffff8139c02b
> ..... 3: ffffffffa004b4a3
> ..... 4: ffffffff8139c79c
> ..... 5: ffffffff8139c83d
> ..... 6: ffffffffa004b43f
> ..... 7: ffffffffa00ec135
> ..... 8: ffffffffa00ec214
> ..... 9: ffffffffa00f01ec
> ..... 10: ffffffffa00f0439
> ..... 11: ffffffffa00d6e8b
> ..... 12: ffffffff8110b763
> ..... 13: ffffffff8110b883
> ..... 14: ffffffff8110ddcd
> ..... 15: ffffffff8110e45d
> ..... 16: ffffffff81101f36
> ..... 17: ffffffff81101ff3
> ..... 18: ffffffff81002c82
> ..... 19: fffffffffffffe00
> ..... 20: 00002acfd31ce050
> ... thread: cc1:1861
> ...... dso: /lib/modules/2.6.37-rc5/build/vmlinux
>
> With the timehist option 'perf report --timehist' displays the same sample as
> (lines wrapped to < 80 and truncated here):
>
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff810348d2 perf_event_task_sched_out ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8139c02b schedule ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa004b4a3 rpc_wait_bit_killable (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8139c79c __wait_on_bit ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8139c83d out_of_line_wait_on_bit ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa004b43f __rpc_wait_for_completion_task (/lib/modules/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00ec135 nfs4_wait_for_completion_rpc_task (/lib/modules/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00ec214 nfs4_run_open_task (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00f01ec nfs4_do_open.clone.21 (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00f0439 nfs4_atomic_open (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffffa00d6e8b nfs_open_revalidate (/lib/modules/2.6.37-rc5/...
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8110b763 do_revalidate ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8110b883 do_lookup ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8110ddcd do_last ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff8110e45d do_filp_open ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff81101f36 do_sys_open ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff81101ff3 sys_open ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> ffffffff81002c82 system_call ([kernel.kallsyms])
> 17:11:10.174502 182392227877 0 cc1 1861 \
> 2acfd31ce050 __GI___libc_open (/lib64/libc-2.12.90.so)
>
> While the line lengths are horrible for a git commit message and email,
> the data is indispensable. It allows you to see each schedule out point,
> where the process is when it gets scheduled out and, with additional
> processing on the timehist output, time between schedule-in events and
> time on the processor.
>
> Signed-off-by: David Ahern <[email protected]>
> ---
> tools/perf/builtin-record.c | 8 +++
> tools/perf/builtin-report.c | 131 ++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 137 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 699dd21..cf30c98 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -51,6 +51,7 @@ static int realtime_prio = 0;
> static bool raw_samples = false;
> static bool sample_id_all_avail = true;
> static bool system_wide = false;
> +static bool time_history = false;
> static pid_t target_pid = -1;
> static pid_t target_tid = -1;
> static pid_t *all_tids = NULL;
> @@ -288,6 +289,11 @@ static void create_counter(int counter, int cpu)
> if (sample_time)
> attr->sample_type |= PERF_SAMPLE_TIME;
>
> + if (time_history) {
> + attr->sample_type |= PERF_SAMPLE_TIME;
> + attr->sample_type |= PERF_SAMPLE_CPU;
> + }
> +
> if (raw_samples) {
> attr->sample_type |= PERF_SAMPLE_TIME;
> attr->sample_type |= PERF_SAMPLE_RAW;
> @@ -863,6 +869,8 @@ const struct option record_options[] = {
> "do not update the buildid cache"),
> OPT_BOOLEAN('B', "no-buildid", &no_buildid,
> "do not collect buildids in perf.data"),
> + OPT_BOOLEAN(0, "timehist", &time_history,
> + "collect data for time history report"),
> OPT_END()
> };
It seems you should only to the gettimeofday() / clock_gettime()
record if you have time_history set?
Thanks.
>
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 904519f..bcde3c2 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -35,6 +35,7 @@ static char const *input_name = "perf.data";
> static bool force, use_tui, use_stdio;
> static bool hide_unresolved;
> static bool dont_use_callchains;
> +static bool time_history;
>
> static bool show_threads;
> static struct perf_read_values show_threads_values;
> @@ -124,6 +125,117 @@ out_free_syms:
> return err;
> }
>
> +static const char *timestr(u64 timestamp, struct perf_session *session)
> +{
> + struct tm ltime;
> + u64 dt;
> + struct timeval tv_dt, tv_res;
> + static char tstr[64];
> +
> + dt = timestamp - session->header.nsec_ref;
> + tv_dt.tv_sec = (time_t) (dt / NSEC_PER_SEC);
> + tv_dt.tv_usec = (suseconds_t) ((dt - tv_dt.tv_sec * NSEC_PER_SEC) / 1000);
> +
> + timeradd(&session->header.tv_ref, &tv_dt, &tv_res);
> +
> + if ((session->header.nsec_ref == 0) ||
> + (localtime_r(&tv_res.tv_sec, <ime) == NULL)) {
> + snprintf(tstr, sizeof(tstr), "00:00:00.000000 %16Ld", timestamp);
> + } else {
> + char date[64];
> + strftime(date, sizeof(date), "%H:%M:%S", <ime);
> + snprintf(tstr, sizeof(tstr), "%s.%06ld %16Ld",
> + date, tv_res.tv_usec, timestamp);
> + }
> +
> + return tstr;
> +}
> +
> +#define TIMEHIST_FMT "%32s %3d %-16s %5d %16Lx %s (%s)\n"
> +
> +static void timehist_header(void)
> +{
> + printf("%32s %3s %-16s %5s %16s %s (%s)\n",
> + " Time-of-Day Kernel Timestamp",
> + "cpu", "Command Name", "PID",
> + " IP ", "Symbol Name", "DSO Name");
> +}
> +
> +static int perf_session__print_sample(struct perf_session *session,
> + struct addr_location *al,
> + struct sample_data *data)
> +{
> + static int show_timehist_error = 1;
> + u64 timestamp = 0;
> + struct map_symbol *syms = NULL;
> + struct symbol *parent = NULL;
> + const char *tstr;
> + bool need_spacer = false;
> +
> + if (show_timehist_error && ((data->cpu == (u32) -1) ||
> + !(session->sample_type & PERF_SAMPLE_TIME))) {
> + pr_err("Data for time history missing in perf event samples.\n"
> + "Did you record with --timehist option?\n");
> + show_timehist_error = 0;
> + }
> +
> + if (session->sample_type & PERF_SAMPLE_TIME)
> + timestamp = data->time;
> +
> + tstr = timestr(timestamp, session);
> +
> +
> + if ((symbol_conf.use_callchain) && data->callchain) {
> + u64 i;
> +
> + syms = perf_session__resolve_callchain(session, al->thread,
> + data->callchain, &parent);
> + if (syms == NULL)
> + return -ENOMEM;
> +
> + for (i = 0; i < data->callchain->nr; ++i) {
> + const char *symname = "", *dsoname = "";
> +
> + if (data->callchain->ips[i] >= PERF_CONTEXT_MAX)
> + continue;
> +
> + if (syms[i].sym && syms[i].sym->name)
> + symname = syms[i].sym->name;
> + else if (hide_unresolved)
> + continue;
> +
> + if (syms[i].map && syms[i].map->dso &&
> + syms[i].map->dso->name)
> + dsoname = syms[i].map->dso->name;
> + else if (hide_unresolved)
> + continue;
> +
> + printf(TIMEHIST_FMT,
> + tstr, data->cpu,
> + al->thread->comm, al->thread->pid,
> + data->callchain->ips[i], symname, dsoname);
> +
> + need_spacer = true;
> + }
> +
> + free(syms);
> +
> + } else {
> + printf(TIMEHIST_FMT,
> + tstr, data->cpu,
> + al->thread->comm, al->thread->pid, al->addr,
> + al->sym->name, al->map->dso->name);
> +
> + need_spacer = true;
> + }
> +
> + /* put a gap between records */
> + if (need_spacer)
> + printf("\n");
> +
> + return 0;
> +}
> +
> static int add_event_total(struct perf_session *session,
> struct sample_data *data,
> struct perf_event_attr *attr)
> @@ -165,7 +277,9 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
> if (al.filtered || (hide_unresolved && al.sym == NULL))
> return 0;
>
> - if (perf_session__add_hist_entry(session, &al, sample)) {
> + if (time_history) {
> + perf_session__print_sample(session, &al, sample);
> + } else if (perf_session__add_hist_entry(session, &al, sample)) {
> pr_debug("problem incrementing symbol period, skipping event\n");
> return -1;
> }
> @@ -319,6 +433,14 @@ static int __cmd_report(void)
> if (ret)
> goto out_delete;
>
> + if (time_history) {
> + if (!session->header.nsec_ref) {
> + pr_err("Reference timestamp missing in perf.data file.\n"
> + "Cannot convert kernel timestamps to time-of-day.\n");
> + }
> + timehist_header();
> + }
> +
> ret = perf_session__process_events(session, &event_ops);
> if (ret)
> goto out_delete;
> @@ -334,6 +456,9 @@ static int __cmd_report(void)
> if (verbose > 2)
> perf_session__fprintf_dsos(session, stdout);
>
> + if (time_history)
> + goto out_delete;
> +
> next = rb_first(&session->hists_tree);
> while (next) {
> struct hists *hists;
> @@ -479,6 +604,8 @@ static const struct option options[] = {
> "columns '.' is reserved."),
> OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved,
> "Only display entries resolved to a symbol"),
> + OPT_BOOLEAN(0, "timehist", &time_history,
> + "Dump time history of event samples"),
> OPT_END()
> };
>
> @@ -486,7 +613,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
> {
> argc = parse_options(argc, argv, options, report_usage, 0);
>
> - if (use_stdio)
> + if (use_stdio || time_history)
> use_browser = 0;
> else if (use_tui)
> use_browser = 1;
> --
> 1.7.2.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
Em Sun, Dec 12, 2010 at 09:16:16PM +0100, Frederic Weisbecker escreveu:
> On Tue, Dec 07, 2010 at 06:54:44PM -0700, David Ahern wrote:
> > +++ b/tools/perf/util/header.c
> > @@ -191,6 +191,40 @@ static int write_padded(int fd, const void *bf, size_t count,
> > +static int perf_header__read_ref_time(struct perf_header *header,
> > + int fd, u64 offset, u64 size)
> > +{
> > + size_t sz_nsec = sizeof(header->nsec_ref);
> > + size_t sz_tv = sizeof(header->tv_ref);
> > + int err = -1;
> > +
> > + if (((size - offset) < (sz_nsec + sz_tv)) ||
> > + (read(fd, &header->nsec_ref, sz_nsec) != (ssize_t) sz_nsec) ||
> > + (read(fd, &header->tv_ref, sz_tv) != (ssize_t) sz_tv))
> > + goto out;
>
> Hmm, could we have endianness related troubles if we write the timespec on
> an arch and cross read from another, or other cross read issues?
Well spotted, it needs to use perf_header__getbuffer64 and make sure
that nsec_ref, etc are u64 fields.
- Arnaldo
On 12/12/10 13:16, Frederic Weisbecker wrote:
>> int perf_session__create_kernel_maps(struct perf_session *self)
>> {
>> int ret = machine__create_kernel_maps(&self->host_machine);
>> @@ -167,6 +190,9 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
>> */
>> if (perf_session__create_kernel_maps(self) < 0)
>> goto out_delete;
>> +
>> + if (perf_session__create_ref_time(self) < 0)
>> + goto out_delete;
>
> So, it does record it anytime?
Simplest to always add it the header as it does no harm if not used. To
make it conditional means adding an input parameter to
perf_session__new() and then updating all the callers.
Is the preference to make it conditional?
David
>
>> }
>>
>> perf_session__update_sample_type(self);
>
>
> Other than that, looks good!
> --
> To unsubscribe from this list: send the line "unsubscribe linux-perf-users" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Dec 13, 2010 at 07:39:24AM -0700, David S. Ahern wrote:
>
>
> On 12/12/10 13:16, Frederic Weisbecker wrote:
>
> >> int perf_session__create_kernel_maps(struct perf_session *self)
> >> {
> >> int ret = machine__create_kernel_maps(&self->host_machine);
> >> @@ -167,6 +190,9 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
> >> */
> >> if (perf_session__create_kernel_maps(self) < 0)
> >> goto out_delete;
> >> +
> >> + if (perf_session__create_ref_time(self) < 0)
> >> + goto out_delete;
> >
> > So, it does record it anytime?
>
> Simplest to always add it the header as it does no harm if not used. To
> make it conditional means adding an input parameter to
> perf_session__new() and then updating all the callers.
>
> Is the preference to make it conditional?
No, I suspect the overhead is plain unnoticeable. And if it becomes
any problem one day, we can still make that conditional later.
Thanks.
> David
>
>
> >
> >> }
> >>
> >> perf_session__update_sample_type(self);
> >
> >
> > Other than that, looks good!
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-perf-users" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
Em Mon, Dec 13, 2010 at 04:54:56PM +0100, Frederic Weisbecker escreveu:
> On Mon, Dec 13, 2010 at 07:39:24AM -0700, David S. Ahern wrote:
> > On 12/12/10 13:16, Frederic Weisbecker wrote:
> > >> int perf_session__create_kernel_maps(struct perf_session *self)
> > >> {
> > >> int ret = machine__create_kernel_maps(&self->host_machine);
> > >> @@ -167,6 +190,9 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
> > >> */
> > >> if (perf_session__create_kernel_maps(self) < 0)
> > >> goto out_delete;
> > >> +
> > >> + if (perf_session__create_ref_time(self) < 0)
> > >> + goto out_delete;
> > > So, it does record it anytime?
> > Simplest to always add it the header as it does no harm if not used.
> > To make it conditional means adding an input parameter to
> > perf_session__new() and then updating all the callers.
> > Is the preference to make it conditional?
> No, I suspect the overhead is plain unnoticeable. And if it becomes
> any problem one day, we can still make that conditional later.
Right, but it just ocurred to me, can't we encode this in the file stat?
Also, how do we deal with:
$ perf record --help
<SNIP>
-A, --append
Append to the output file to do incremental profiling.
<SNIP>
?
I now think this should be implemented as a PERF_RECORD_WALL_CLOCK
synthesized event, to be inserted just before we start collecting the
other events.
That way, when --appending, we just insert another and use that from
that point on.
Look at tools/perf/util/event.h, enum perf_user_event_type and look how
those are synthesized.
Thanks,
- Arnaldo
Em Mon, Dec 13, 2010 at 11:15:57AM -0200, Arnaldo Carvalho de Melo escreveu:
> Em Sun, Dec 12, 2010 at 09:16:16PM +0100, Frederic Weisbecker escreveu:
> > On Tue, Dec 07, 2010 at 06:54:44PM -0700, David Ahern wrote:
> > > +++ b/tools/perf/util/header.c
> > > @@ -191,6 +191,40 @@ static int write_padded(int fd, const void *bf, size_t count,
> > > +static int perf_header__read_ref_time(struct perf_header *header,
> > > + int fd, u64 offset, u64 size)
> > > +{
> > > + size_t sz_nsec = sizeof(header->nsec_ref);
> > > + size_t sz_tv = sizeof(header->tv_ref);
> > > + int err = -1;
> > > +
> > > + if (((size - offset) < (sz_nsec + sz_tv)) ||
> > > + (read(fd, &header->nsec_ref, sz_nsec) != (ssize_t) sz_nsec) ||
> > > + (read(fd, &header->tv_ref, sz_tv) != (ssize_t) sz_tv))
> > > + goto out;
> >
> > Hmm, could we have endianness related troubles if we write the timespec on
> > an arch and cross read from another, or other cross read issues?
>
> Well spotted, it needs to use perf_header__getbuffer64 and make sure
> that nsec_ref, etc are u64 fields.
When implemented as a PERF_RECORD_REF_TIME or PERF_RECORD_WALL_CLOCK
this will be handled in perf_session__process_event, implementing a
event__ref_time_swap routine, etc.
- Arnaldo
On Mon, Dec 13, 2010 at 02:48:54PM -0200, Arnaldo Carvalho de Melo wrote:
> Em Mon, Dec 13, 2010 at 04:54:56PM +0100, Frederic Weisbecker escreveu:
> > On Mon, Dec 13, 2010 at 07:39:24AM -0700, David S. Ahern wrote:
> > > On 12/12/10 13:16, Frederic Weisbecker wrote:
> > > >> int perf_session__create_kernel_maps(struct perf_session *self)
> > > >> {
> > > >> int ret = machine__create_kernel_maps(&self->host_machine);
> > > >> @@ -167,6 +190,9 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
> > > >> */
> > > >> if (perf_session__create_kernel_maps(self) < 0)
> > > >> goto out_delete;
> > > >> +
> > > >> + if (perf_session__create_ref_time(self) < 0)
> > > >> + goto out_delete;
>
> > > > So, it does record it anytime?
>
> > > Simplest to always add it the header as it does no harm if not used.
> > > To make it conditional means adding an input parameter to
> > > perf_session__new() and then updating all the callers.
>
> > > Is the preference to make it conditional?
>
> > No, I suspect the overhead is plain unnoticeable. And if it becomes
> > any problem one day, we can still make that conditional later.
>
> Right, but it just ocurred to me, can't we encode this in the file stat?
> Also, how do we deal with:
>
> $ perf record --help
> <SNIP>
> -A, --append
> Append to the output file to do incremental profiling.
> <SNIP>
>
> ?
>
> I now think this should be implemented as a PERF_RECORD_WALL_CLOCK
> synthesized event, to be inserted just before we start collecting the
> other events.
>
> That way, when --appending, we just insert another and use that from
> that point on.
>
> Look at tools/perf/util/event.h, enum perf_user_event_type and look how
> those are synthesized.
>
> Thanks,
>
> - Arnaldo
(Adding peterz)
Right, but Peter wanted us to get rid of these user events types.
I guess we can't really do this as new perf tools must be able
to support old perf files.
So this should be the last one to add. Something like PERF_RECORD_GEN_EVT
that has a field in its headers containing a sub-type which can be this
wall clock but can also host about everything in the future.
This way we don't propagate more the possible overlap with the kernel.
Hm?
On Mon, 2010-12-13 at 18:09 +0100, Frederic Weisbecker wrote:
>
>
> Right, but Peter wanted us to get rid of these user events types.
Yes they suck.
> I guess we can't really do this as new perf tools must be able
> to support old perf files.
Preferably, yes, but I don't see why we can't break the data file format
if we've got good reasons to.
> So this should be the last one to add. Something like PERF_RECORD_GEN_EVT
> that has a field in its headers containing a sub-type which can be this
> wall clock but can also host about everything in the future.
>
> This way we don't propagate more the possible overlap with the kernel.
I don't see why we should add more, that's going the wrong direction.
On Mon, 2010-12-13 at 18:11 +0100, Peter Zijlstra wrote:
>
> Preferably, yes, but I don't see why we can't break the data file format
> if we've got good reasons to.
I mean, we pretty much _have_ to break data file format when we want to
do splice() support.
Em Mon, Dec 13, 2010 at 06:09:26PM +0100, Frederic Weisbecker escreveu:
> On Mon, Dec 13, 2010 at 02:48:54PM -0200, Arnaldo Carvalho de Melo wrote:
> > Em Mon, Dec 13, 2010 at 04:54:56PM +0100, Frederic Weisbecker escreveu:
> > > On Mon, Dec 13, 2010 at 07:39:24AM -0700, David S. Ahern wrote:
> > > > On 12/12/10 13:16, Frederic Weisbecker wrote:
> > > > >> int perf_session__create_kernel_maps(struct perf_session *self)
> > > > >> {
> > > > >> int ret = machine__create_kernel_maps(&self->host_machine);
> > > > >> @@ -167,6 +190,9 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
> > > > >> */
> > > > >> if (perf_session__create_kernel_maps(self) < 0)
> > > > >> goto out_delete;
> > > > >> +
> > > > >> + if (perf_session__create_ref_time(self) < 0)
> > > > >> + goto out_delete;
> >
> > > > > So, it does record it anytime?
> >
> > > > Simplest to always add it the header as it does no harm if not used.
> > > > To make it conditional means adding an input parameter to
> > > > perf_session__new() and then updating all the callers.
> >
> > > > Is the preference to make it conditional?
> >
> > > No, I suspect the overhead is plain unnoticeable. And if it becomes
> > > any problem one day, we can still make that conditional later.
> >
> > Right, but it just ocurred to me, can't we encode this in the file stat?
> > Also, how do we deal with:
> >
> > $ perf record --help
> > <SNIP>
> > -A, --append
> > Append to the output file to do incremental profiling.
> > <SNIP>
> >
> > ?
> >
> > I now think this should be implemented as a PERF_RECORD_WALL_CLOCK
> > synthesized event, to be inserted just before we start collecting the
> > other events.
> >
> > That way, when --appending, we just insert another and use that from
> > that point on.
> >
> > Look at tools/perf/util/event.h, enum perf_user_event_type and look how
> > those are synthesized.
>
> (Adding peterz)
Adding tglx
> Right, but Peter wanted us to get rid of these user events types.
> I guess we can't really do this as new perf tools must be able
> to support old perf files.
>
> So this should be the last one to add. Something like PERF_RECORD_GEN_EVT
> that has a field in its headers containing a sub-type which can be this
> wall clock but can also host about everything in the future.
>
> This way we don't propagate more the possible overlap with the kernel.
That works for me. I would call it:
PERF_RECORD_SYNTH
our ioctl ;-)
But whatever, just a name.
- Arnaldo
On Mon, Dec 13, 2010 at 06:11:29PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-12-13 at 18:09 +0100, Frederic Weisbecker wrote:
> >
> >
> > Right, but Peter wanted us to get rid of these user events types.
>
> Yes they suck.
>
> > I guess we can't really do this as new perf tools must be able
> > to support old perf files.
>
> Preferably, yes, but I don't see why we can't break the data file format
> if we've got good reasons to.
IMO we should prioritize the backward compatibility over some little code sanity.
It's worth a very small range of values to reserve in the kernel and we are done.
> > So this should be the last one to add. Something like PERF_RECORD_GEN_EVT
> > that has a field in its headers containing a sub-type which can be this
> > wall clock but can also host about everything in the future.
> >
> > This way we don't propagate more the possible overlap with the kernel.
>
> I don't see why we should add more, that's going the wrong direction.
This must be the last one.
On Mon, 2010-12-13 at 18:15 +0100, Frederic Weisbecker wrote:
> On Mon, Dec 13, 2010 at 06:11:29PM +0100, Peter Zijlstra wrote:
> > On Mon, 2010-12-13 at 18:09 +0100, Frederic Weisbecker wrote:
> > >
> > >
> > > Right, but Peter wanted us to get rid of these user events types.
> >
> > Yes they suck.
> >
> > > I guess we can't really do this as new perf tools must be able
> > > to support old perf files.
> >
> > Preferably, yes, but I don't see why we can't break the data file format
> > if we've got good reasons to.
>
> IMO we should prioritize the backward compatibility over some little code sanity.
> It's worth a very small range of values to reserve in the kernel and we are done.
Feh, excuses just grow more fungus on your code.
> > > So this should be the last one to add. Something like PERF_RECORD_GEN_EVT
> > > that has a field in its headers containing a sub-type which can be this
> > > wall clock but can also host about everything in the future.
> > >
> > > This way we don't propagate more the possible overlap with the kernel.
> >
> > I don't see why we should add more, that's going the wrong direction.
>
> This must be the last one.
No, the last one already happened, you cannot postpone the last one,
there will always be another excuse.
Em Mon, Dec 13, 2010 at 06:18:19PM +0100, Peter Zijlstra escreveu:
> On Mon, 2010-12-13 at 18:15 +0100, Frederic Weisbecker wrote:
> > On Mon, Dec 13, 2010 at 06:11:29PM +0100, Peter Zijlstra wrote:
> > > On Mon, 2010-12-13 at 18:09 +0100, Frederic Weisbecker wrote:
> > > > So this should be the last one to add. Something like PERF_RECORD_GEN_EVT
> > > > that has a field in its headers containing a sub-type which can be this
> > > > wall clock but can also host about everything in the future.
> > > >
> > > > This way we don't propagate more the possible overlap with the kernel.
> > >
> > > I don't see why we should add more, that's going the wrong direction.
> >
> > This must be the last one.
>
> No, the last one already happened, you cannot postpone the last one,
> there will always be another excuse.
Did you understand the use case? How to have multiple reference times
when appending?
"last one" means adding a multiplexor, PERF_RECORD_LAST_ONE if you will,
and inside it we add new events if the need arises. Then never again we
add a PERF_RECORD_ event in userspace.
- Arnaldo
On Mon, Dec 13, 2010 at 06:13:39PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-12-13 at 18:11 +0100, Peter Zijlstra wrote:
> >
> > Preferably, yes, but I don't see why we can't break the data file format
> > if we've got good reasons to.
>
> I mean, we pretty much _have_ to break data file format when we want to
> do splice() support.
Because we'll have one file per-cpu?
But perf.data on UP will be sensibly the same as today so I suspect
we won't need to be compatible.
But I guess I am missing something, in which case that probably doesn't
change much the picture. It's not because one day we'll need to break
the format that we can happily do so everyday. Backward compatibility
is important and we should preserve it when it's possible.
On Mon, 2010-12-13 at 15:22 -0200, Arnaldo Carvalho de Melo wrote:
> > No, the last one already happened, you cannot postpone the last one,
> > there will always be another excuse.
>
> Did you understand the use case? How to have multiple reference times
> when appending?
Yes, I understood it perfectly, I just detest the existence of these
pure user records, they need to die ASAP.
> "last one" means adding a multiplexor, PERF_RECORD_LAST_ONE if you will,
> and inside it we add new events if the need arises. Then never again we
> add a PERF_RECORD_ event in userspace.
Nah, that sucks too, the whole concept of pure user-space events in that
stream sucks.
There's multiple things you can do, you could:
- create a kernel event PERF_RECORD_NEW_BUFFER and stuff that into each
fresh buffer when its created, it could contain all kinds of 1-time
information, like:
* this CLOCK_MONOTONIC offset (for what little good that does, since
our clock isn't strictly sync'ed to CLOCK_MONOTONIC so we can incur
arbitrary drift).
* architecture details, like 64/32 host info needed for the
PERF_SAMPLE_REGS stuff.
- extend the existing header infrastructure to write a new header in
front of the new stream. The main header already has a data section that
points to the end of the stream, add a continuation header section that
points to a continuation-header used to appends and record the clock
offset data in there.
- something else entirely.
Just stop using these stupid fake events and be somewhat creative.
On 12/13/10 09:48, Arnaldo Carvalho de Melo wrote:
> Em Mon, Dec 13, 2010 at 04:54:56PM +0100, Frederic Weisbecker escreveu:
>> On Mon, Dec 13, 2010 at 07:39:24AM -0700, David S. Ahern wrote:
>>> On 12/12/10 13:16, Frederic Weisbecker wrote:
>>>>> int perf_session__create_kernel_maps(struct perf_session *self)
>>>>> {
>>>>> int ret = machine__create_kernel_maps(&self->host_machine);
>>>>> @@ -167,6 +190,9 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
>>>>> */
>>>>> if (perf_session__create_kernel_maps(self) < 0)
>>>>> goto out_delete;
>>>>> +
>>>>> + if (perf_session__create_ref_time(self) < 0)
>>>>> + goto out_delete;
>
>>>> So, it does record it anytime?
>
>>> Simplest to always add it the header as it does no harm if not used.
>>> To make it conditional means adding an input parameter to
>>> perf_session__new() and then updating all the callers.
>
>>> Is the preference to make it conditional?
>
>> No, I suspect the overhead is plain unnoticeable. And if it becomes
>> any problem one day, we can still make that conditional later.
>
> Right, but it just ocurred to me, can't we encode this in the file stat?
I would not want to rely on stat output for timestamps. eg., copy that
file off box and not have the timestamp preserved.
> Also, how do we deal with:
>
> $ perf record --help
> <SNIP>
> -A, --append
> Append to the output file to do incremental profiling.
> <SNIP>
>
> ?
Is it realistic to expect/allow an append using different perf binaries?
That just seems plain odd. Then, if one invocation used --timehist and
another did not output would be bizarre. Of course an append with a
reboot between tests would cause problems as well.
>
> I now think this should be implemented as a PERF_RECORD_WALL_CLOCK
> synthesized event, to be inserted just before we start collecting the
> other events.
>
> That way, when --appending, we just insert another and use that from
> that point on.
>
> Look at tools/perf/util/event.h, enum perf_user_event_type and look how
> those are synthesized.
I see the thread on this part. I don't have preference; just let me know
what the final decision is. ie., synthesized event versus header entry.
David
>
> Thanks,
>
> - Arnaldo
On Mon, 2010-12-13 at 18:23 +0100, Frederic Weisbecker wrote:
> On Mon, Dec 13, 2010 at 06:13:39PM +0100, Peter Zijlstra wrote:
> > On Mon, 2010-12-13 at 18:11 +0100, Peter Zijlstra wrote:
> > >
> > > Preferably, yes, but I don't see why we can't break the data file format
> > > if we've got good reasons to.
> >
> > I mean, we pretty much _have_ to break data file format when we want to
> > do splice() support.
>
> Because we'll have one file per-cpu?
Right.
> But perf.data on UP will be sensibly the same as today so I suspect
> we won't need to be compatible.
Nope, since one file simply doesn't scale, data-streams will have to
live outside of the main file.
And when you have to make that happen, there is no reason to maintain
any kind of compatibility. The only thing you could do is provide some
script to convert old data files to the new format.
> But I guess I am missing something, in which case that probably doesn't
> change much the picture. It's not because one day we'll need to break
> the format that we can happily do so everyday. Backward compatibility
> is important and we should preserve it when it's possible.
But since you're all so focused on adding warts to the thing, maybe
possible has sailed. But see my email to acme.
Em Mon, Dec 13, 2010 at 06:35:16PM +0100, Peter Zijlstra escreveu:
> On Mon, 2010-12-13 at 15:22 -0200, Arnaldo Carvalho de Melo wrote:
> > > No, the last one already happened, you cannot postpone the last one,
> > > there will always be another excuse.
> >
> > Did you understand the use case? How to have multiple reference times
> > when appending?
>
> Yes, I understood it perfectly, I just detest the existence of these
> pure user records, they need to die ASAP.
Heck, that was be the first I suggested to introduce ;-)
> > "last one" means adding a multiplexor, PERF_RECORD_LAST_ONE if you will,
> > and inside it we add new events if the need arises. Then never again we
> > add a PERF_RECORD_ event in userspace.
>
> Nah, that sucks too, the whole concept of pure user-space events in that
> stream sucks.
>
> There's multiple things you can do, you could:
>
> - create a kernel event PERF_RECORD_NEW_BUFFER and stuff that into each
> fresh buffer when its created, it could contain all kinds of 1-time
> information, like:
> * this CLOCK_MONOTONIC offset (for what little good that does, since
> our clock isn't strictly sync'ed to CLOCK_MONOTONIC so we can incur
> arbitrary drift).
> * architecture details, like 64/32 host info needed for the
> PERF_SAMPLE_REGS stuff.
>
> - extend the existing header infrastructure to write a new header in
> front of the new stream. The main header already has a data section that
Ok, so for this specific case its just a matter of s/event/header/g ?
:-)
I.e. just add a new header when appending and have the size of the
"sessions" in each header? Fine with me as well, the information will be
at the same place I intended, i.e. before each recording session.
> points to the end of the stream, add a continuation header section that
> points to a continuation-header used to appends and record the clock
> offset data in there.
>
> - something else entirely.
>
> Just stop using these stupid fake events and be somewhat creative.
Whatever, never introduced one, they are inserted, IIRC, in the
beggining of the stream, so I guess we can just do that sed and call
them headers and be done with it.
Will verify if that is the case.
- Arnaldo
On Mon, 2010-12-13 at 18:35 +0100, Peter Zijlstra wrote:
> * this CLOCK_MONOTONIC offset (for what little good that does, since
> our clock isn't strictly sync'ed to CLOCK_MONOTONIC so we can incur
> arbitrary drift).
In fact, the only sane way to do that is by creating a software counter
that represents CLOCK_MONOTONIC and sample that say once a minute (or
more often if you want smaller drift).
On Mon, 2010-12-13 at 18:35 +0100, Peter Zijlstra wrote:
>
> - extend the existing header infrastructure to write a new header in
> front of the new stream. The main header already has a data section that
> points to the end of the stream, add a continuation header section that
> points to a continuation-header used to appends and record the clock
> offset data in there.
Also, if you want to add perf-stat functionality to perf-record you
could use something similar, a tail-header, and write the stat values in
there.
On 12/13/10 10:47, Peter Zijlstra wrote:
> On Mon, 2010-12-13 at 18:35 +0100, Peter Zijlstra wrote:
>> * this CLOCK_MONOTONIC offset (for what little good that does, since
>> our clock isn't strictly sync'ed to CLOCK_MONOTONIC so we can incur
>> arbitrary drift).
>
> In fact, the only sane way to do that is by creating a software counter
> that represents CLOCK_MONOTONIC and sample that say once a minute (or
> more often if you want smaller drift).
>
>
What about creating a PERF_RECORD_TIME and generate an event when the
counter is opened? It contains a PERF_SAMPLE_TIME and say
PERF_SAMPLE_TOD (time-of-day)? We're not sending rockets to saturn; we
just need the timestamps to match other log files.
David
On Mon, Dec 13, 2010 at 06:37:40PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-12-13 at 18:23 +0100, Frederic Weisbecker wrote:
> > On Mon, Dec 13, 2010 at 06:13:39PM +0100, Peter Zijlstra wrote:
> > > On Mon, 2010-12-13 at 18:11 +0100, Peter Zijlstra wrote:
> > > >
> > > > Preferably, yes, but I don't see why we can't break the data file format
> > > > if we've got good reasons to.
> > >
> > > I mean, we pretty much _have_ to break data file format when we want to
> > > do splice() support.
> >
> > Because we'll have one file per-cpu?
>
> Right.
>
> > But perf.data on UP will be sensibly the same as today so I suspect
> > we won't need to be compatible.
>
> Nope, since one file simply doesn't scale, data-streams will have to
> live outside of the main file.
>
> And when you have to make that happen, there is no reason to maintain
> any kind of compatibility. The only thing you could do is provide some
> script to convert old data files to the new format.
Right, a script should do the trick.
Em Mon, Dec 13, 2010 at 10:36:39AM -0700, David S. Ahern escreveu:
>
>
> On 12/13/10 09:48, Arnaldo Carvalho de Melo wrote:
> > Em Mon, Dec 13, 2010 at 04:54:56PM +0100, Frederic Weisbecker escreveu:
> >> On Mon, Dec 13, 2010 at 07:39:24AM -0700, David S. Ahern wrote:
> >>> Simplest to always add it the header as it does no harm if not used.
> >>> To make it conditional means adding an input parameter to
> >>> perf_session__new() and then updating all the callers.
> >
> >>> Is the preference to make it conditional?
> >
> >> No, I suspect the overhead is plain unnoticeable. And if it becomes
> >> any problem one day, we can still make that conditional later.
> >
> > Right, but it just ocurred to me, can't we encode this in the file stat?
>
> I would not want to rely on stat output for timestamps. eg., copy that
> file off box and not have the timestamp preserved.
Yeah, I should have deleted that part, as I suggested something else
after it :-\
> > Also, how do we deal with:
> >
> > $ perf record --help
> > <SNIP>
> > -A, --append
> > Append to the output file to do incremental profiling.
> > <SNIP>
> >
> > ?
>
> Is it realistic to expect/allow an append using different perf binaries?
> That just seems plain odd. Then, if one invocation used --timehist and
> another did not output would be bizarre. Of course an append with a
Well, if you use the existence of a ref time and refuse to append to a
file if --timehist is not used, nope.
> reboot between tests would cause problems as well.
Why different perf binaries? Isn't the following scenario valid?
1. perf record --timehist
<stop recording for a while, wait some other time when we know something
interesting may happen, some periodic higher load that happens,
whatever>
2. perf record -A --timehist
3. use perf report showing the wall clock time and merge it with your
app specific log
- Arnaldo
On Mon, 2010-12-13 at 15:43 -0200, Arnaldo Carvalho de Melo wrote:
> Whatever, never introduced one, they are inserted, IIRC, in the
> beggining of the stream, so I guess we can just do that sed and call
> them headers and be done with it.
There's the flush thing from Frederic, that's all over the place.
The easiest way to kill that is to go to separate data files now.
writing to different files is better for contention anyway, we can grow
a flusher-thread per buffer and let them write to their own file.
On Mon, 2010-12-13 at 10:49 -0700, David S. Ahern wrote:
>
> On 12/13/10 10:47, Peter Zijlstra wrote:
> > On Mon, 2010-12-13 at 18:35 +0100, Peter Zijlstra wrote:
> >> * this CLOCK_MONOTONIC offset (for what little good that does, since
> >> our clock isn't strictly sync'ed to CLOCK_MONOTONIC so we can incur
> >> arbitrary drift).
> >
> > In fact, the only sane way to do that is by creating a software counter
> > that represents CLOCK_MONOTONIC and sample that say once a minute (or
> > more often if you want smaller drift).
> >
> >
>
> What about creating a PERF_RECORD_TIME and generate an event when the
> counter is opened? It contains a PERF_SAMPLE_TIME and say
> PERF_SAMPLE_TOD (time-of-day)? We're not sending rockets to saturn; we
> just need the timestamps to match other log files.
That's similar to the first thing I proposed. The problem is with long
record sessions your drift can become quite significant, then when you
merge sort your other log events stuff can get out of order. Which can
lead to some serious head-scratching..
Another problem with this approach is things like flight-recorder mode
where you constantly over-write your old data, you'd have to build some
trigger to always output a new record before you over-write the old one,
so there's always one consistent record around. Drift is an even more
serious problem here since flight-record more could be running for days
before (if ever) you dump it.
On 12/13/10 10:57, Peter Zijlstra wrote:
>> What about creating a PERF_RECORD_TIME and generate an event when the
>> counter is opened? It contains a PERF_SAMPLE_TIME and say
>> PERF_SAMPLE_TOD (time-of-day)? We're not sending rockets to saturn; we
>> just need the timestamps to match other log files.
>
> That's similar to the first thing I proposed. The problem is with long
> record sessions your drift can become quite significant, then when you
> merge sort your other log events stuff can get out of order. Which can
> lead to some serious head-scratching..
Gotcha. Missed that in the flury of emails.
Arnaldo: Are you ok with this option? This should append mode as well.
>
> Another problem with this approach is things like flight-recorder mode
> where you constantly over-write your old data, you'd have to build some
> trigger to always output a new record before you over-write the old one,
> so there's always one consistent record around. Drift is an even more
> serious problem here since flight-record more could be running for days
> before (if ever) you dump it.
Ok. I was not aware flight-recorder mode was an option today.
David
>
>
On Mon, Dec 13, 2010 at 06:51:13PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-12-13 at 15:43 -0200, Arnaldo Carvalho de Melo wrote:
> > Whatever, never introduced one, they are inserted, IIRC, in the
> > beggining of the stream, so I guess we can just do that sed and call
> > them headers and be done with it.
>
> There's the flush thing from Frederic, that's all over the place.
>
> The easiest way to kill that is to go to separate data files now.
> writing to different files is better for contention anyway, we can grow
> a flusher-thread per buffer and let them write to their own file.
Right. We need to go there in the end anyway.
I had an old experiment of this some month ago:
git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing.git
perf/percpu
There is one writer per cpu and a (stupid) multiplexing at report time.
It was working. Just the sorting on report time was (stupidly) slow.
I should perhaps rebase that branch and try to make something with it.
On Mon, 2010-12-13 at 11:01 -0700, David S. Ahern wrote:
> > Another problem with this approach is things like flight-recorder mode
> > where you constantly over-write your old data, you'd have to build some
> > trigger to always output a new record before you over-write the old one,
> > so there's always one consistent record around. Drift is an even more
> > serious problem here since flight-record more could be running for days
> > before (if ever) you dump it.
>
> Ok. I was not aware flight-recorder mode was an option today.
You get it when you mmap the fd RO -- the only thing is you don't get a
consistent tail pointer so you have to fudge a bit.
But even if we didn't have it, we'd need a solution that would work once
we did grow it ;-)
On Mon, 2010-12-13 at 19:05 +0100, Frederic Weisbecker wrote:
>
> There is one writer per cpu and a (stupid) multiplexing at report time.
> It was working. Just the sorting on report time was (stupidly) slow.
>
> I should perhaps rebase that branch and try to make something with it.
Right, so we need to have a small ~16 events sort window per file [*],
and then a merge-sort over all files, simply consumer the most recent
event across all files.
We can have out-of-order events on a single cpu due to nesting and
taking the time-stamp _before_ we reserve the buffer. We could cure some
of this by moving the perf_clock() call from
__perf_event_header__init_id()/perf_prepare_sample() to
__perf_event_output_id_sample()/perf_output_sample().
On 12/13/10 11:06, Peter Zijlstra wrote:
> On Mon, 2010-12-13 at 11:01 -0700, David S. Ahern wrote:
>>> Another problem with this approach is things like flight-recorder mode
>>> where you constantly over-write your old data, you'd have to build some
>>> trigger to always output a new record before you over-write the old one,
>>> so there's always one consistent record around. Drift is an even more
>>> serious problem here since flight-record more could be running for days
>>> before (if ever) you dump it.
>>
>> Ok. I was not aware flight-recorder mode was an option today.
>
> You get it when you mmap the fd RO -- the only thing is you don't get a
> consistent tail pointer so you have to fudge a bit.
Awesome. Flight recorder mode is on our wish list. I'll try it out when
I get some time.
David
>
> But even if we didn't have it, we'd need a solution that would work once
> we did grow it ;-)
Em Mon, Dec 13, 2010 at 11:01:13AM -0700, David S. Ahern escreveu:
> On 12/13/10 10:57, Peter Zijlstra wrote:
> >> What about creating a PERF_RECORD_TIME and generate an event when the
> >> counter is opened? It contains a PERF_SAMPLE_TIME and say
> >> PERF_SAMPLE_TOD (time-of-day)? We're not sending rockets to saturn; we
> >> just need the timestamps to match other log files.
> > That's similar to the first thing I proposed. The problem is with long
> > record sessions your drift can become quite significant, then when you
> > merge sort your other log events stuff can get out of order. Which can
> > lead to some serious head-scratching..
> Gotcha. Missed that in the flury of emails.
> Arnaldo: Are you ok with this option? This should append mode as well.
What option, this one:
-----------------------------------------------------------------------
Peter Zijlstra wrote:
> In fact, the only sane way to do that is by creating a software counter
> that represents CLOCK_MONOTONIC and sample that say once a minute (or
> more often if you want smaller drift).
-----------------------------------------------------------------------
?
Yes, that looks the best option. I.e. no changes on headers, no new
user fake events, a new software event that may be useful for other
usecases.
We have:
[acme@mica linux]$ perf list | grep -- -clock
cpu-clock [Software event]
task-clock [Software event]
[acme@mica linux]$
So we would have a new one:
monotonic-clock
Peter, agreed? I'll try to implement it now, good opportunity to learn a
bit more about soft pmus, I'd have to do that anyway for NIC stats, etc.
- Arnaldo
> We have:
>
> [acme@mica linux]$ perf list | grep -- -clock
> cpu-clock [Software event]
> task-clock [Software event]
> [acme@mica linux]$
>
> So we would have a new one:
>
> monotonic-clock
Name seems off. The 'event' is to grab time-of-day and have the 'time'
attribute set which gives the correlation between wall-clock and perf
timestamps.
David
>
> Peter, agreed? I'll try to implement it now, good opportunity to learn a
> bit more about soft pmus, I'd have to do that anyway for NIC stats, etc.
>
> - Arnaldo
On Mon, 2010-12-13 at 17:08 -0200, Arnaldo Carvalho de Melo wrote:
> Em Mon, Dec 13, 2010 at 11:01:13AM -0700, David S. Ahern escreveu:
> > On 12/13/10 10:57, Peter Zijlstra wrote:
> > >> What about creating a PERF_RECORD_TIME and generate an event when the
> > >> counter is opened? It contains a PERF_SAMPLE_TIME and say
> > >> PERF_SAMPLE_TOD (time-of-day)? We're not sending rockets to saturn; we
> > >> just need the timestamps to match other log files.
>
> > > That's similar to the first thing I proposed. The problem is with long
> > > record sessions your drift can become quite significant, then when you
> > > merge sort your other log events stuff can get out of order. Which can
> > > lead to some serious head-scratching..
>
> > Gotcha. Missed that in the flury of emails.
>
> > Arnaldo: Are you ok with this option? This should append mode as well.
>
> What option, this one:
>
> -----------------------------------------------------------------------
> Peter Zijlstra wrote:
>
> > In fact, the only sane way to do that is by creating a software counter
> > that represents CLOCK_MONOTONIC and sample that say once a minute (or
> > more often if you want smaller drift).
> -----------------------------------------------------------------------
>
> ?
>
> Yes, that looks the best option. I.e. no changes on headers, no new
> user fake events, a new software event that may be useful for other
> usecases.
>
> We have:
>
> [acme@mica linux]$ perf list | grep -- -clock
> cpu-clock [Software event]
> task-clock [Software event]
> [acme@mica linux]$
>
> So we would have a new one:
>
> monotonic-clock
>
> Peter, agreed? I'll try to implement it now, good opportunity to learn a
> bit more about soft pmus, I'd have to do that anyway for NIC stats, etc.
Right, don't look too closely at cpu-clock though, its currently a tad
broken, but it should give enough hints as how to implement what you
want.
Once you've got it working we might consider adding something like
PERF_TYPE_CLOCK, where perf_event_attr::config is the posix clock id to
read. That might (or might not) be sensible, given that they're
proposing dynamic posix clocks, which suggests there will be more than
the normal few.
On Mon, 2010-12-13 at 12:15 -0700, David S. Ahern wrote:
> > monotonic-clock
>
> Name seems off. The 'event' is to grab time-of-day and have the 'time'
> attribute set which gives the correlation between wall-clock and perf
> timestamps.
Well, you really want CLOCK_MONOTONIC, not CLOCK_REALTIME (nothing what
so ever to do with Real-Time computing, just another brilliant POSIX
misnomer), or possibly even CLOCK_MONOTONIC_RAW.
gettimeofday() usually returns CLOCK_REALTIME which is subject to both
NTP and timezone adjustments.
CLOCK_MONOTONIC is only subject to NTP
CLOCK_MONOTONIC_RAW is neither.
At some point we'll try and have CLOCK_TRACING and provide a similar
time to user-space that we currently use for perf (using the VDSO to
avoid any actual system-calls).