2015-11-24 13:56:08

by Yunlong Song

[permalink] [raw]
Subject: [PATCH] perf record: Add snapshot mode support for perf's regular events

This idea is issued and motivated from:
https://lwn.net/Articles/650499/

After the first RFC is sent:
http://lkml.iu.edu/hypermail/linux/kernel/1509.2/04347.html

Both David Ahern and Borislav Petkov have replied to that RFC:
http://lkml.iu.edu/hypermail/linux/kernel/1509.2/04350.html
http://lkml.iu.edu/hypermail/linux/kernel/1509.2/03914.html

Thanks to David's and Borislav's advice.

However, David's perf-based scheduling daemon just makes some count when
the signal triggers perf sched, with no sample recording and has nothing
to do with perf.data. As for Borislav's persistent events, when perf
record runs, it just makes fd to attach to the persistent event to read,
and all the persistent event's tracing info will still dump to perf.data
during perf's running.

As a result, neither David's nor Borislav's patches makes the similar
snapshot mode support as what aux trace does.

In our patch, we create and maintain a user space ring buffer to store
perf's tracing info, instead of directly writing to perf.data file as
before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
the tracing info currently stored in the user space ring buffer to
perf.data file.

Yunlong Song (1):
perf record: Add snapshot mode support for perf's regular events

tools/perf/builtin-record.c | 181 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 170 insertions(+), 11 deletions(-)

--
1.8.5.2


2015-11-24 13:55:58

by Yunlong Song

[permalink] [raw]
Subject: [PATCH] perf record: Add snapshot mode support for perf's regular events

For aux area tracing, there is already a snapshot mode support for
intel-pt and intel-bts events. Similarly, this patch adds a snapshot
mode for perf's regular events. A user space ring buffer is allocated to
handle the tracing data from the kernel space ring buffer, and the
tracing data will only dump to perf.data when perf receives a SIGUSR2
signal.

Similarly like '-S' in aux trace snapshot mode, '-M' enables perf's
regular event's snapshot mode by defining the size (bytes) of the user
space ring buffer.

Example 1:

$ perf record -a -M 10000000
/*
* Let perf record runs for some time before finally ends, and do not
* send any SIGUSR2 signal to perf during perf's running.
*/

$ perf report

Error:
The perf.data file has no samples!
# To display the perf.data header info, please use --header/--header-only options.

As shown above, without any SIGUSR2 signal, perf record will dump no samples
to perf.data in the snapshot mode.

Example 2:

$ perf record -a -M 10000000
/*
* Let perf record runs for some time before finally ends, and send
* several times of SIGUSR2 signal to perf during perf's running.
*/

# kill -SIGUSR2 `pidof perf`
...
# kill -SIGUSR2 `pidof perf`

$ perf report
<SNIP>
# Total Lost Samples: 0
#
# Samples: 942 of event 'cycles:pp'
# Event count (approx.): 175168972
#
# Overhead Command Shared Object Symbol
# ........ ............... ....................... .........................................
#
8.20% kworker/2:0 [kernel.kallsyms] [k] default_send_IPI_mask_allbutself_phys
6.33% swapper [kernel.kallsyms] [k] intel_idle
2.64% pidof [kernel.kallsyms] [k] arch_get_unmapped_area_topdown
2.56% pidof [kernel.kallsyms] [k] unmap_region
2.26% pidof [kernel.kallsyms] [k] memcpy
2.26% pidof libc-2.19.so [.] _IO_vfscanf
2.03% pidof [kernel.kallsyms] [k] lookup_fast
1.72% pidof [kernel.kallsyms] [k] filp_close
1.62% pidof [kernel.kallsyms] [k] apparmor_file_open
1.56% pidof [kernel.kallsyms] [k] process_measurement
1.50% pidof [kernel.kallsyms] [k] find_vma
<SNIP>

As shown above, perf record will dump samples to perf.data every time
it receives a SIGUSR2 signal in the snapshot mode.

Signed-off-by: Yunlong Song <[email protected]>
---
tools/perf/builtin-record.c | 181 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 170 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 199fc31..75606a6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,16 @@
#include <sched.h>
#include <sys/mman.h>

+static volatile int memory_enabled;
+static volatile int memory_signalled;
+/* The maximum size of one perf_event is 65536*/
+#define MEMORY_SIZE_MIN 65537
+
+struct memory {
+ void *start;
+ u64 head, tail;
+ u64 size;
+};

struct record {
struct perf_tool tool;
@@ -51,16 +61,134 @@ struct record {
bool no_buildid;
bool no_buildid_cache;
unsigned long long samples;
+ struct memory memory;
};

-static int record__write(struct record *rec, void *bf, size_t size)
+static int buf_to_file(struct record *rec, void *buf,
+ size_t size, u64 head, u64 tail)
{
- if (perf_data_file__write(rec->session->file, bf, size) < 0) {
- pr_err("failed to write perf data, error: %m\n");
+ size_t written = 0;
+
+ if (head < tail) {
+ if (perf_data_file__write(rec->session->file,
+ buf + head, tail - head) < 0)
+ goto out;
+ written += tail - head;
+ } else if (head > tail) {
+ if (perf_data_file__write(rec->session->file,
+ buf + head, size - head) < 0)
+ goto out;
+ written += size - head;
+
+ if (perf_data_file__write(rec->session->file, buf, tail) < 0)
+ goto out;
+ written += tail;
+ }
+
+ rec->bytes_written += written;
+ return 0;
+out:
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+}
+
+static int memory_to_file(struct record *rec)
+{
+ if (buf_to_file(rec, rec->memory.start, rec->memory.size,
+ rec->memory.head, rec->memory.tail) < 0)
return -1;
+ rec->memory.head = rec->memory.tail;
+
+ return 0;
+}
+
+static ssize_t perf_memory__write(struct memory *memory, void *buf, size_t size)
+{
+ void *buf_start = buf;
+ size_t left = size, written, delta, skip;
+ union perf_event *event;
+ struct perf_event_header hdr;
+ struct record *rec = container_of(memory, struct record, memory);
+
+ while (left) {
+ skip = 0;
+ written = min(left, memory->size - memory->tail);
+ if (memory->head > memory->tail)
+ delta = memory->head - memory->tail;
+ else
+ delta = memory->size - memory->tail + memory->head;
+ if (delta <= written) {
+ do {
+ if ((memory->head + skip) <= (memory->size -
+ sizeof(struct perf_event_header)))
+ event = (union perf_event *)(memory->start +
+ memory->head + skip);
+ else {
+ size_t hdr_left;
+
+ hdr_left = sizeof(struct perf_event_header) -
+ memory->size + memory->head + skip;
+ memcpy(&hdr, memory->start + memory->head + skip,
+ sizeof(struct perf_event_header) - hdr_left);
+
+ if (hdr_left <= memory->tail)
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, memory->start, hdr_left);
+ else if (!memory->tail)
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, buf, hdr_left);
+ else {
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, memory->start, memory->tail);
+ hdr_left -= memory->tail;
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, buf, hdr_left);
+ }
+
+ event = (union perf_event *)&hdr;
+ if (rec->session->header.needs_swap)
+ perf_event_header__bswap(&event->header);
+ }
+
+ if (event->header.type != PERF_RECORD_SAMPLE) {
+ if (buf_to_file(rec, memory->start, memory->size,
+ memory->head + skip, (memory->head + skip +
+ event->header.size) % memory->size) < 0)
+ return -1;
+ }
+
+ skip += event->header.size;
+ } while (skip <= written - delta);
+ }
+
+ memcpy(memory->start + memory->tail, buf, written);
+
+ memory->head = (memory->head + skip) % memory->size;
+ memory->tail = (memory->tail + written) % memory->size;
+
+ left -= written;
+ buf += written;
+ }
+
+ BUG_ON((size_t)(buf - buf_start) != size);
+ return size;
+}
+
+static int record__write(struct record *rec, void *bf, size_t size)
+{
+ if (rec->memory.size && memory_enabled) {
+ if (perf_memory__write(&rec->memory, bf, size) < 0) {
+ pr_err("failed to write memory data, error: %m\n");
+ return -1;
+ }
+ } else {
+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+ }
+ rec->bytes_written += size;
}

- rec->bytes_written += size;
return 0;
}

@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
if (old == head)
return 0;

+ memory_enabled = 1;
+
rec->samples++;

size = head - old;
@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
md->prev = old;
perf_evlist__mmap_consume(rec->evlist, idx);
out:
+ memory_enabled = 0;
return rc;
}

@@ -426,8 +557,11 @@ static int record__mmap_read_all(struct record *rec)
* Mark the round finished in case we wrote
* at least one event.
*/
- if (bytes_written != rec->bytes_written)
+ if (bytes_written != rec->bytes_written) {
+ memory_enabled = 1;
rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
+ memory_enabled = 0;
+ }

out:
return rc;
@@ -492,7 +626,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);
signal(SIGTERM, sig_handler);
- if (rec->opts.auxtrace_snapshot_mode)
+ if (rec->opts.auxtrace_snapshot_mode || rec->memory.size)
signal(SIGUSR2, snapshot_sig_handler);
else
signal(SIGUSR2, SIG_IGN);
@@ -687,6 +821,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
}
}

+ if (memory_signalled) {
+ memory_signalled = 0;
+ if (memory_to_file(rec) < 0) {
+ err = -1;
+ goto out_child;
+ }
+ }
+
if (hits == rec->samples) {
if (done || draining)
break;
@@ -1009,6 +1151,12 @@ static struct record record = {
.mmap2 = perf_event__process_mmap2,
.ordered_events = true,
},
+ .memory = {
+ .start = NULL,
+ .head = 0,
+ .tail = 0,
+ .size = 0,
+ },
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
@@ -1119,6 +1267,7 @@ struct option __record_options[] = {
OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
"options passed to clang when compiling BPF scriptlets"),
#endif
+ OPT_U64('M', "memory", &record.memory.size, "user space ring buffer memory size (bytes)"),
OPT_END()
};

@@ -1220,19 +1369,29 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
goto out_symbol_exit;
}

+ if (rec->memory.size) {
+ if (rec->memory.size < MEMORY_SIZE_MIN)
+ rec->memory.size = MEMORY_SIZE_MIN;
+ rec->memory.start = malloc(rec->memory.size);
+ }
+
err = __cmd_record(&record, argc, argv);
out_symbol_exit:
perf_evlist__delete(rec->evlist);
symbol__exit();
auxtrace_record__free(rec->itr);
+ if (rec->memory.size)
+ free(rec->memory.start);
return err;
}

static void snapshot_sig_handler(int sig __maybe_unused)
{
- if (!auxtrace_snapshot_enabled)
- return;
- auxtrace_snapshot_enabled = 0;
- auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
- auxtrace_record__snapshot_started = 1;
+ if (record.opts.auxtrace_snapshot_mode && auxtrace_snapshot_enabled) {
+ auxtrace_snapshot_enabled = 0;
+ auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
+ auxtrace_record__snapshot_started = 1;
+ }
+ if (record.memory.size && !memory_signalled)
+ memory_signalled = 1;
}
--
1.8.5.2

2015-11-24 14:30:39

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

Em Tue, Nov 24, 2015 at 10:00:32PM +0800, Yunlong Song escreveu:
> For aux area tracing, there is already a snapshot mode support for
> intel-pt and intel-bts events. Similarly, this patch adds a snapshot
> mode for perf's regular events. A user space ring buffer is allocated to
> handle the tracing data from the kernel space ring buffer, and the
> tracing data will only dump to perf.data when perf receives a SIGUSR2
> signal.
>
> Similarly like '-S' in aux trace snapshot mode, '-M' enables perf's
> regular event's snapshot mode by defining the size (bytes) of the user
> space ring buffer.

Looks like an interesting feature, if for no other reason, to match what
we have for aux area tracing.

But perhaps having an event in the stream itself that would signal when
to dump the snapshot would be a better approach?

One that perhaps you create with 'perf probe' or a tracepoint, both
could use the in-kernel filtering capabilities to specify when to
trigger that snapshot?

If this snapshotting could happen in the kernel, that would be even
better, i.e. no need for two buffers (kernel rb + userspace rb) for
achieving this? Anyway, see below some comments.

> Example 1:
>
> $ perf record -a -M 10000000
> /*
> * Let perf record runs for some time before finally ends, and do not
> * send any SIGUSR2 signal to perf during perf's running.
> */
>
> $ perf report
>
> Error:
> The perf.data file has no samples!
> # To display the perf.data header info, please use --header/--header-only options.
>
> As shown above, without any SIGUSR2 signal, perf record will dump no samples
> to perf.data in the snapshot mode.
>
> Example 2:
>
> $ perf record -a -M 10000000
> /*
> * Let perf record runs for some time before finally ends, and send
> * several times of SIGUSR2 signal to perf during perf's running.
> */
>
> # kill -SIGUSR2 `pidof perf`
> ...
> # kill -SIGUSR2 `pidof perf`
>
> $ perf report
> <SNIP>
> # Total Lost Samples: 0
> #
> # Samples: 942 of event 'cycles:pp'
> # Event count (approx.): 175168972
> #
> # Overhead Command Shared Object Symbol
> # ........ ............... ....................... .........................................
> #
> 8.20% kworker/2:0 [kernel.kallsyms] [k] default_send_IPI_mask_allbutself_phys
> 6.33% swapper [kernel.kallsyms] [k] intel_idle
> 2.64% pidof [kernel.kallsyms] [k] arch_get_unmapped_area_topdown
> 2.56% pidof [kernel.kallsyms] [k] unmap_region
> 2.26% pidof [kernel.kallsyms] [k] memcpy
> 2.26% pidof libc-2.19.so [.] _IO_vfscanf
> 2.03% pidof [kernel.kallsyms] [k] lookup_fast
> 1.72% pidof [kernel.kallsyms] [k] filp_close
> 1.62% pidof [kernel.kallsyms] [k] apparmor_file_open
> 1.56% pidof [kernel.kallsyms] [k] process_measurement
> 1.50% pidof [kernel.kallsyms] [k] find_vma
> <SNIP>
>
> As shown above, perf record will dump samples to perf.data every time
> it receives a SIGUSR2 signal in the snapshot mode.
>
> Signed-off-by: Yunlong Song <[email protected]>
> ---
> tools/perf/builtin-record.c | 181 +++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 170 insertions(+), 11 deletions(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 199fc31..75606a6 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -37,6 +37,16 @@
> #include <sched.h>
> #include <sys/mman.h>
>
> +static volatile int memory_enabled;
> +static volatile int memory_signalled;

Those are really too generic names :-\

> +/* The maximum size of one perf_event is 65536*/
> +#define MEMORY_SIZE_MIN 65537
> +
> +struct memory {

How about: snapshot_ring_buffer? If that is deemed too big, perhaps
snapshot_rb?

> + void *start;
> + u64 head, tail;
> + u64 size;
> +};
>
> struct record {
> struct perf_tool tool;
> @@ -51,16 +61,134 @@ struct record {
> bool no_buildid;
> bool no_buildid_cache;
> unsigned long long samples;
> + struct memory memory;
> };
>
> -static int record__write(struct record *rec, void *bf, size_t size)
> +static int buf_to_file(struct record *rec, void *buf,
> + size_t size, u64 head, u64 tail)


Please continue following the existing convention and name this:

static int record__write_rb()

As the buffer you're writing is a ring buffer, thus needs checking about
wrap around, like you do here.

> {
> - if (perf_data_file__write(rec->session->file, bf, size) < 0) {
> - pr_err("failed to write perf data, error: %m\n");
> + size_t written = 0;
> +
> + if (head < tail) {
> + if (perf_data_file__write(rec->session->file,
> + buf + head, tail - head) < 0)
> + goto out;
> + written += tail - head;
> + } else if (head > tail) {
> + if (perf_data_file__write(rec->session->file,
> + buf + head, size - head) < 0)
> + goto out;
> + written += size - head;
> +
> + if (perf_data_file__write(rec->session->file, buf, tail) < 0)
> + goto out;
> + written += tail;
> + }
> +
> + rec->bytes_written += written;
> + return 0;
> +out:
> + pr_err("failed to write perf data, error: %m\n");
> + return -1;
> +}
> +
> +static int memory_to_file(struct record *rec)
> +{
> + if (buf_to_file(rec, rec->memory.start, rec->memory.size,
> + rec->memory.head, rec->memory.tail) < 0)
> return -1;
> + rec->memory.head = rec->memory.tail;
> +
> + return 0;
> +}
> +
> +static ssize_t perf_memory__write(struct memory *memory, void *buf, size_t size)
> +{
> + void *buf_start = buf;
> + size_t left = size, written, delta, skip;
> + union perf_event *event;
> + struct perf_event_header hdr;
> + struct record *rec = container_of(memory, struct record, memory);
> +
> + while (left) {
> + skip = 0;
> + written = min(left, memory->size - memory->tail);
> + if (memory->head > memory->tail)
> + delta = memory->head - memory->tail;
> + else
> + delta = memory->size - memory->tail + memory->head;
> + if (delta <= written) {
> + do {
> + if ((memory->head + skip) <= (memory->size -
> + sizeof(struct perf_event_header)))
> + event = (union perf_event *)(memory->start +
> + memory->head + skip);
> + else {
> + size_t hdr_left;
> +
> + hdr_left = sizeof(struct perf_event_header) -
> + memory->size + memory->head + skip;
> + memcpy(&hdr, memory->start + memory->head + skip,
> + sizeof(struct perf_event_header) - hdr_left);
> +
> + if (hdr_left <= memory->tail)
> + memcpy((void *)&hdr + sizeof(struct perf_event_header) -
> + hdr_left, memory->start, hdr_left);
> + else if (!memory->tail)
> + memcpy((void *)&hdr + sizeof(struct perf_event_header) -
> + hdr_left, buf, hdr_left);
> + else {
> + memcpy((void *)&hdr + sizeof(struct perf_event_header) -
> + hdr_left, memory->start, memory->tail);
> + hdr_left -= memory->tail;
> + memcpy((void *)&hdr + sizeof(struct perf_event_header) -
> + hdr_left, buf, hdr_left);
> + }
> +
> + event = (union perf_event *)&hdr;
> + if (rec->session->header.needs_swap)
> + perf_event_header__bswap(&event->header);
> + }
> +
> + if (event->header.type != PERF_RECORD_SAMPLE) {
> + if (buf_to_file(rec, memory->start, memory->size,
> + memory->head + skip, (memory->head + skip +
> + event->header.size) % memory->size) < 0)
> + return -1;
> + }
> +
> + skip += event->header.size;
> + } while (skip <= written - delta);
> + }
> +
> + memcpy(memory->start + memory->tail, buf, written);
> +
> + memory->head = (memory->head + skip) % memory->size;
> + memory->tail = (memory->tail + written) % memory->size;
> +
> + left -= written;
> + buf += written;
> + }
> +
> + BUG_ON((size_t)(buf - buf_start) != size);
> + return size;
> +}
> +
> +static int record__write(struct record *rec, void *bf, size_t size)
> +{
> + if (rec->memory.size && memory_enabled) {
> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
> + pr_err("failed to write memory data, error: %m\n");
> + return -1;
> + }
> + } else {
> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
> + pr_err("failed to write perf data, error: %m\n");
> + return -1;
> + }
> + rec->bytes_written += size;
> }
>
> - rec->bytes_written += size;
> return 0;
> }
>
> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
> if (old == head)
> return 0;
>
> + memory_enabled = 1;
> +

This really looks hackish, using a global to change the behaviour of
some existing function, and a volatile at that, ouch, please change the
prototypes in some way to avoid globals like this.

> rec->samples++;
>
> size = head - old;
> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
> md->prev = old;
> perf_evlist__mmap_consume(rec->evlist, idx);
> out:
> + memory_enabled = 0;
> return rc;
> }
>
> @@ -426,8 +557,11 @@ static int record__mmap_read_all(struct record *rec)
> * Mark the round finished in case we wrote
> * at least one event.
> */
> - if (bytes_written != rec->bytes_written)
> + if (bytes_written != rec->bytes_written) {
> + memory_enabled = 1;
> rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
> + memory_enabled = 0;
> + }
>
> out:
> return rc;
> @@ -492,7 +626,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
> signal(SIGCHLD, sig_handler);
> signal(SIGINT, sig_handler);
> signal(SIGTERM, sig_handler);
> - if (rec->opts.auxtrace_snapshot_mode)
> + if (rec->opts.auxtrace_snapshot_mode || rec->memory.size)
> signal(SIGUSR2, snapshot_sig_handler);
> else
> signal(SIGUSR2, SIG_IGN);
> @@ -687,6 +821,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
> }
> }
>
> + if (memory_signalled) {
> + memory_signalled = 0;
> + if (memory_to_file(rec) < 0) {
> + err = -1;
> + goto out_child;
> + }
> + }
> +
> if (hits == rec->samples) {
> if (done || draining)
> break;
> @@ -1009,6 +1151,12 @@ static struct record record = {
> .mmap2 = perf_event__process_mmap2,
> .ordered_events = true,
> },
> + .memory = {
> + .start = NULL,
> + .head = 0,
> + .tail = 0,
> + .size = 0,
> + },
> };
>
> const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
> @@ -1119,6 +1267,7 @@ struct option __record_options[] = {
> OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
> "options passed to clang when compiling BPF scriptlets"),
> #endif
> + OPT_U64('M', "memory", &record.memory.size, "user space ring buffer memory size (bytes)"),
> OPT_END()
> };
>
> @@ -1220,19 +1369,29 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
> goto out_symbol_exit;
> }
>
> + if (rec->memory.size) {
> + if (rec->memory.size < MEMORY_SIZE_MIN)
> + rec->memory.size = MEMORY_SIZE_MIN;
> + rec->memory.start = malloc(rec->memory.size);
> + }
> +
> err = __cmd_record(&record, argc, argv);
> out_symbol_exit:
> perf_evlist__delete(rec->evlist);
> symbol__exit();
> auxtrace_record__free(rec->itr);
> + if (rec->memory.size)
> + free(rec->memory.start);
> return err;
> }
>
> static void snapshot_sig_handler(int sig __maybe_unused)
> {
> - if (!auxtrace_snapshot_enabled)
> - return;
> - auxtrace_snapshot_enabled = 0;
> - auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
> - auxtrace_record__snapshot_started = 1;
> + if (record.opts.auxtrace_snapshot_mode && auxtrace_snapshot_enabled) {
> + auxtrace_snapshot_enabled = 0;
> + auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
> + auxtrace_record__snapshot_started = 1;
> + }
> + if (record.memory.size && !memory_signalled)
> + memory_signalled = 1;
> }
> --
> 1.8.5.2

2015-11-24 15:06:46

by David Ahern

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 11/24/15 7:00 AM, Yunlong Song wrote:
> +static int record__write(struct record *rec, void *bf, size_t size)
> +{
> + if (rec->memory.size && memory_enabled) {
> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
> + pr_err("failed to write memory data, error: %m\n");
> + return -1;
> + }
> + } else {
> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
> + pr_err("failed to write perf data, error: %m\n");
> + return -1;
> + }
> + rec->bytes_written += size;
> }
>
> - rec->bytes_written += size;
> return 0;
> }
>
> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
> if (old == head)
> return 0;
>
> + memory_enabled = 1;
> +
> rec->samples++;
>
> size = head - old;
> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
> md->prev = old;
> perf_evlist__mmap_consume(rec->evlist, idx);
> out:
> + memory_enabled = 0;
> return rc;
> }
>

So you are basically ignoring all samples until SIGUSR2 is received.
That means the resulting data file will have limited history of task
events for example. And for other events the quantity is random as to
when the mmaps were last scanned.

Your cover letter mentioned my code "just makes some count when the
signal triggers perf sched, with no sample recording and has nothing to
do with perf.data". That is not correct. If you look at the perf-daemon
code I pointed you to it processes task events as they are received and
saves the last N-events after time sorting (limited by memory or time).
When a signal is received it processes the saved events and dumps them
to stdout versus writing a perf.data file.

David

2015-11-24 15:20:29

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
> On 11/24/15 7:00 AM, Yunlong Song wrote:
> >+static int record__write(struct record *rec, void *bf, size_t size)
> >+{
> >+ if (rec->memory.size && memory_enabled) {
> >+ if (perf_memory__write(&rec->memory, bf, size) < 0) {
> >+ pr_err("failed to write memory data, error: %m\n");
> >+ return -1;
> >+ }
> >+ } else {
> >+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
> >+ pr_err("failed to write perf data, error: %m\n");
> >+ return -1;
> >+ }
> >+ rec->bytes_written += size;
> > }
> >
> >- rec->bytes_written += size;
> > return 0;
> > }
> >
> >@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
> > if (old == head)
> > return 0;
> >
> >+ memory_enabled = 1;
> >+
> > rec->samples++;
> >
> > size = head - old;
> >@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
> > md->prev = old;
> > perf_evlist__mmap_consume(rec->evlist, idx);
> > out:
> >+ memory_enabled = 0;
> > return rc;
> > }
> >
>
> So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

> means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.

No?

- Arnaldo

> example. And for other events the quantity is random as to when the mmaps
> were last scanned.
>
> Your cover letter mentioned my code "just makes some count when the signal
> triggers perf sched, with no sample recording and has nothing to do with
> perf.data". That is not correct. If you look at the perf-daemon code I
> pointed you to it processes task events as they are received and saves the
> last N-events after time sorting (limited by memory or time). When a signal
> is received it processes the saved events and dumps them to stdout versus
> writing a perf.data file.
>
> David

2015-11-24 15:24:11

by David Ahern

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 11/24/15 8:20 AM, Arnaldo Carvalho de Melo wrote:
> Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
>> On 11/24/15 7:00 AM, Yunlong Song wrote:
>>> +static int record__write(struct record *rec, void *bf, size_t size)
>>> +{
>>> + if (rec->memory.size && memory_enabled) {
>>> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
>>> + pr_err("failed to write memory data, error: %m\n");
>>> + return -1;
>>> + }
>>> + } else {
>>> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
>>> + pr_err("failed to write perf data, error: %m\n");
>>> + return -1;
>>> + }
>>> + rec->bytes_written += size;
>>> }
>>>
>>> - rec->bytes_written += size;
>>> return 0;
>>> }
>>>
>>> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
>>> if (old == head)
>>> return 0;
>>>
>>> + memory_enabled = 1;
>>> +
>>> rec->samples++;
>>>
>>> size = head - old;
>>> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
>>> md->prev = old;
>>> perf_evlist__mmap_consume(rec->evlist, idx);
>>> out:
>>> + memory_enabled = 0;
>>> return rc;
>>> }
>>>
>>
>> So you are basically ignoring all samples until SIGUSR2 is received. That
>
> No, he is not, its just that his code is difficult to follow, has to be
> rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
> will..
>
>> means the resulting data file will have limited history of task events for
>
> ... have a complete history of task events, since PERF_RECORD_FORK, etc
> are not being ignored.
>
> No?

perf-record does not process events, it only writes to a file. If that
is skipped then it skips all events regardless of type.

David

2015-11-24 15:40:39

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

Em Tue, Nov 24, 2015 at 08:24:07AM -0700, David Ahern escreveu:
> On 11/24/15 8:20 AM, Arnaldo Carvalho de Melo wrote:
> >Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
> >>On 11/24/15 7:00 AM, Yunlong Song wrote:
> >>>+static int record__write(struct record *rec, void *bf, size_t size)
> >>>+{
> >>>+ if (rec->memory.size && memory_enabled) {
> >>>+ if (perf_memory__write(&rec->memory, bf, size) < 0) {
> >>>+ pr_err("failed to write memory data, error: %m\n");
> >>>+ return -1;
> >>>+ }
> >>>+ } else {
> >>>+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
> >>>+ pr_err("failed to write perf data, error: %m\n");
> >>>+ return -1;
> >>>+ }
> >>>+ rec->bytes_written += size;
> >>> }
> >>>
> >>>- rec->bytes_written += size;
> >>> return 0;
> >>> }
> >>>
> >>>@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
> >>> if (old == head)
> >>> return 0;
> >>>
> >>>+ memory_enabled = 1;
> >>>+
> >>> rec->samples++;
> >>>
> >>> size = head - old;
> >>>@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
> >>> md->prev = old;
> >>> perf_evlist__mmap_consume(rec->evlist, idx);
> >>> out:
> >>>+ memory_enabled = 0;
> >>> return rc;
> >>> }
> >>>
> >>
> >>So you are basically ignoring all samples until SIGUSR2 is received. That
> >
> >No, he is not, its just that his code is difficult to follow, has to be
> >rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
> >will..

> >>means the resulting data file will have limited history of task events for

> >... have a complete history of task events, since PERF_RECORD_FORK, etc
> >are not being ignored.

> >No?

> perf-record does not process events, it only writes to a file. If that is
> skipped then it skips all events regardless of type.

perf-record without his patch? yes, but with his patch it does:

__cmd_record()
for (;;)
record__mmap_read_all()
record__write()
perf_memory__write()
event = (union perf_event *)(memory->start + memory->head + skip);
if (event->header.type != PERF_RECORD_SAMPLE) {
if (buf_to_file(rec, memory->start, memory->size,
}

I almost thought that I had been fooled by the difficulty to follow his
patch and was forgetting that 'perf record' doesn't processes events,
and hasn't done so for a very good reason: to reduce its impact on the
observed workload, but that ain't so, no?

So, when not snapshotting, what you said remains true:

static int record__write(struct record *rec, void *bf, size_t size)
{
if (rec->memory.size && memory_enabled) {
if (perf_memory__write(&rec->memory, bf, size) < 0) {
pr_err("failed to write memory data, error: %m\n");
return -1;
}
} else {
if (perf_data_file__write(rec->session->file, bf, size) < 0) {

I'll continue taking the else branch and in that case, no events are
processed, we just dump that bf into the rec->session->file and go on
with life.

- Arnaldo

2015-11-24 16:16:15

by David Ahern

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 11/24/15 8:40 AM, Arnaldo Carvalho de Melo wrote:
> perf-record without his patch? yes, but with his patch it does:
>
> __cmd_record()
> for (;;)
> record__mmap_read_all()
> record__write()
> perf_memory__write()
> event = (union perf_event *)(memory->start + memory->head + skip);
> if (event->header.type != PERF_RECORD_SAMPLE) {
> if (buf_to_file(rec, memory->start, memory->size,
> }
>
> I almost thought that I had been fooled by the difficulty to follow his
> patch and was forgetting that 'perf record' doesn't processes events,
> and hasn't done so for a very good reason: to reduce its impact on the
> observed workload, but that ain't so, no?

exactly. And I missed the above. Thanks for pointing that out.

2015-11-25 03:54:30

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events



On 2015/11/24 23:20, Arnaldo Carvalho de Melo wrote:
> Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
>> On 11/24/15 7:00 AM, Yunlong Song wrote:
>>> +static int record__write(struct record *rec, void *bf, size_t size)
>>> +{
>>> + if (rec->memory.size && memory_enabled) {
>>> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
>>> + pr_err("failed to write memory data, error: %m\n");
>>> + return -1;
>>> + }
>>> + } else {
>>> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
>>> + pr_err("failed to write perf data, error: %m\n");
>>> + return -1;
>>> + }
>>> + rec->bytes_written += size;
>>> }
>>>
>>> - rec->bytes_written += size;
>>> return 0;
>>> }
>>>
>>> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
>>> if (old == head)
>>> return 0;
>>>
>>> + memory_enabled = 1;
>>> +
>>> rec->samples++;
>>>
>>> size = head - old;
>>> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
>>> md->prev = old;
>>> perf_evlist__mmap_consume(rec->evlist, idx);
>>> out:
>>> + memory_enabled = 0;
>>> return rc;
>>> }
>>>
>> So you are basically ignoring all samples until SIGUSR2 is received. That
> No, he is not, its just that his code is difficult to follow, has to be
> rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
> will..
>
>> means the resulting data file will have limited history of task events for
> ... have a complete history of task events, since PERF_RECORD_FORK, etc
> are not being ignored.
>
> No?

Actually we are discussing about this problem.

For such tracking events (PERF_RECORD_FORK...), we have dummy event so
it is possible for us to receive tracking events from a separated
channel, therefore we don't have to parse every events to pick those
events out. Instead, we can process tracking events differently, then
more interesting things can be done. For example, squashing those tracking
events if it takes too much memory...

Furthermore, there's another problem being discussed: if userspace
ringbuffer
is bytes based, parsing event is unavoidable. Without parsing event we are
unable to find the new 'head' pointer when overwriting. Instead, we are
thinking about a bucket-based ringbuffer that, let perf maintain a series
of bucket, each time 'poll' return, perf copies new events to the start of
a bucket. If all bucket is occupied, we drop the oldest bucket. Bucket-based
ringbuffer watest some memory but can avoid event parsing.

And there's many other problems in this patch. For example, when SIGUSR2 is
received, we need to do something to let all perf events start dumping.
Current implementation can't ensure we receive events just before the
SIGUSR2 if we not set 'no-buffer'.

Also, output events are in one perf.data, which is not user friendly.
Our final goal is to make perf a daemonized moniter, which can run 7x24
in user's environment. Each time a glitch is detected, a framework sends
a signal to perf to get a perf.data from it perf. The framework manage
those perf.data like logrotate, help developer analysis those glitch.

We are seeking the route implementing the final monitor. This patch is
an attempt to let you know what we want and get your thought about it.
Looks like you agree out basic idea. That's good. Then we decide to
start from some small feature to support the final goal. For example:
snapshot mode for specific events:

# perf record -a -e cycles/snapshot/

And when C-c is pressed, for cycles event, only those data still in
kernel would be dump.

Thank you.

2015-11-25 05:06:23

by David Ahern

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 11/24/15 8:50 PM, Wangnan (F) wrote:
> Actually we are discussing about this problem.
>
> For such tracking events (PERF_RECORD_FORK...), we have dummy event so
> it is possible for us to receive tracking events from a separated
> channel, therefore we don't have to parse every events to pick those
> events out. Instead, we can process tracking events differently, then
> more interesting things can be done. For example, squashing those tracking
> events if it takes too much memory...

If you look at my daemon code I process task events (FORK, MMAP, EXIT)
to maintain task state including flushing threads when they terminate.
This is a trade-off to having the knowledge to pretty-print addresses
(address to symbol resolution) yet not grow without bounds -- be it a
file or memory.

>
> Furthermore, there's another problem being discussed: if userspace
> ringbuffer
> is bytes based, parsing event is unavoidable. Without parsing event we are
> unable to find the new 'head' pointer when overwriting. Instead, we are
> thinking about a bucket-based ringbuffer that, let perf maintain a series
> of bucket, each time 'poll' return, perf copies new events to the start of
> a bucket. If all bucket is occupied, we drop the oldest bucket.
> Bucket-based
> ringbuffer watest some memory but can avoid event parsing.
>
> And there's many other problems in this patch. For example, when SIGUSR2 is
> received, we need to do something to let all perf events start dumping.
> Current implementation can't ensure we receive events just before the
> SIGUSR2 if we not set 'no-buffer'.
>
> Also, output events are in one perf.data, which is not user friendly.
> Our final goal is to make perf a daemonized moniter, which can run 7x24
> in user's environment. Each time a glitch is detected, a framework sends
> a signal to perf to get a perf.data from it perf. The framework manage
> those perf.data like logrotate, help developer analysis those glitch.

Exactly. And that's why my daemon is written the way it is. It is
intended to run 24x7x365. It retains the last N events which are dumped
when some external trigger tells it to.

Arnaldo: you asked about an event in the stream but that is not
possible. My scheduling daemon targets CPU usage prior to a significant
event (what was running, how long, where, etc). The significant event in
the motivating case was STP timeouts -- if stp daemon is not able to
send BPDUs why? What was running leading up to the timeout. The point is
something external to the perf daemon says 'hey, save the last N-events
for analysis'.

This case sounds like a generalization of my problem with the desire to
write a perf.data file instead of processing the events and dumping to a
file. It is doable. For example, synthesize task events for all threads
in memory and then write out the saved samples.

2015-11-25 07:26:08

by Adrian Hunter

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 25/11/15 05:50, Wangnan (F) wrote:
>
>
> On 2015/11/24 23:20, Arnaldo Carvalho de Melo wrote:
>> Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
>>> On 11/24/15 7:00 AM, Yunlong Song wrote:
>>>> +static int record__write(struct record *rec, void *bf, size_t size)
>>>> +{
>>>> + if (rec->memory.size && memory_enabled) {
>>>> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
>>>> + pr_err("failed to write memory data, error: %m\n");
>>>> + return -1;
>>>> + }
>>>> + } else {
>>>> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
>>>> + pr_err("failed to write perf data, error: %m\n");
>>>> + return -1;
>>>> + }
>>>> + rec->bytes_written += size;
>>>> }
>>>>
>>>> - rec->bytes_written += size;
>>>> return 0;
>>>> }
>>>>
>>>> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int
>>>> idx)
>>>> if (old == head)
>>>> return 0;
>>>>
>>>> + memory_enabled = 1;
>>>> +
>>>> rec->samples++;
>>>>
>>>> size = head - old;
>>>> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int
>>>> idx)
>>>> md->prev = old;
>>>> perf_evlist__mmap_consume(rec->evlist, idx);
>>>> out:
>>>> + memory_enabled = 0;
>>>> return rc;
>>>> }
>>>>
>>> So you are basically ignoring all samples until SIGUSR2 is received. That
>> No, he is not, its just that his code is difficult to follow, has to be
>> rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
>> will..
>>
>>> means the resulting data file will have limited history of task events for
>> ... have a complete history of task events, since PERF_RECORD_FORK, etc
>> are not being ignored.
>>
>> No?
>
> Actually we are discussing about this problem.
>
> For such tracking events (PERF_RECORD_FORK...), we have dummy event so
> it is possible for us to receive tracking events from a separated
> channel, therefore we don't have to parse every events to pick those
> events out. Instead, we can process tracking events differently, then
> more interesting things can be done. For example, squashing those tracking
> events if it takes too much memory...
>
> Furthermore, there's another problem being discussed: if userspace ringbuffer
> is bytes based, parsing event is unavoidable. Without parsing event we are
> unable to find the new 'head' pointer when overwriting.

Have you considered trying to find the head by trial-and-error at the time
you make the snapshot i.e. look at the first 8 bytes (event records are 8
byte aligned) and see if it is a valid record header, if not try the next 8
bytes. When you find a real event record it should parse without error and
the subsequent events should all parse without error too, all the way to the
tail. Then you can use timestamps and compare the events byte-by-byte to
avoid overlaps between 2 snapshots.

2015-11-25 07:52:08

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events



On 2015/11/25 15:22, Adrian Hunter wrote:
> On 25/11/15 05:50, Wangnan (F) wrote:
>>
>> On 2015/11/24 23:20, Arnaldo Carvalho de Melo wrote:
>>> Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
>>>> On 11/24/15 7:00 AM, Yunlong Song wrote:
>>>>> +static int record__write(struct record *rec, void *bf, size_t size)
>>>>> +{
>>>>> + if (rec->memory.size && memory_enabled) {
>>>>> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
>>>>> + pr_err("failed to write memory data, error: %m\n");
>>>>> + return -1;
>>>>> + }
>>>>> + } else {
>>>>> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
>>>>> + pr_err("failed to write perf data, error: %m\n");
>>>>> + return -1;
>>>>> + }
>>>>> + rec->bytes_written += size;
>>>>> }
>>>>>
>>>>> - rec->bytes_written += size;
>>>>> return 0;
>>>>> }
>>>>>
>>>>> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int
>>>>> idx)
>>>>> if (old == head)
>>>>> return 0;
>>>>>
>>>>> + memory_enabled = 1;
>>>>> +
>>>>> rec->samples++;
>>>>>
>>>>> size = head - old;
>>>>> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int
>>>>> idx)
>>>>> md->prev = old;
>>>>> perf_evlist__mmap_consume(rec->evlist, idx);
>>>>> out:
>>>>> + memory_enabled = 0;
>>>>> return rc;
>>>>> }
>>>>>
>>>> So you are basically ignoring all samples until SIGUSR2 is received. That
>>> No, he is not, its just that his code is difficult to follow, has to be
>>> rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
>>> will..
>>>
>>>> means the resulting data file will have limited history of task events for
>>> ... have a complete history of task events, since PERF_RECORD_FORK, etc
>>> are not being ignored.
>>>
>>> No?
>> Actually we are discussing about this problem.
>>
>> For such tracking events (PERF_RECORD_FORK...), we have dummy event so
>> it is possible for us to receive tracking events from a separated
>> channel, therefore we don't have to parse every events to pick those
>> events out. Instead, we can process tracking events differently, then
>> more interesting things can be done. For example, squashing those tracking
>> events if it takes too much memory...
>>
>> Furthermore, there's another problem being discussed: if userspace ringbuffer
>> is bytes based, parsing event is unavoidable. Without parsing event we are
>> unable to find the new 'head' pointer when overwriting.
> Have you considered trying to find the head by trial-and-error at the time
> you make the snapshot i.e. look at the first 8 bytes (event records are 8
> byte aligned) and see if it is a valid record header, if not try the next 8
> bytes. When you find a real event record it should parse without error and
> the subsequent events should all parse without error too, all the way to the
> tail. Then you can use timestamps and compare the events byte-by-byte to
> avoid overlaps between 2 snapshots.

It seems not work. Now we have BPF output event, it is possible that a
BPF program output anything through that event. Even if we have a magic
in head of each event, we can't prevent BPF output event output that
magic, except we introduce some 'escape' method to prevent BPF output
event output some data pattern. So although might work in reallife,
this solution is logically incorrect. Or am I miss someting?

Thank you.

2015-11-25 07:56:53

by Yunlong Song

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 2015/11/24 23:06, David Ahern wrote:
>
> So you are basically ignoring all samples until SIGUSR2 is received. That means the resulting data file will have limited history of task events for example. And for other events the quantity is random as to when the mmaps were last scanned.
>
> Your cover letter mentioned my code "just makes some count when the signal triggers perf sched, with no sample recording and has nothing to do with perf.data". That is not correct. If you look at the perf-daemon code I pointed you to it processes task events as they are received and saves the last N-events after time sorting (limited by memory or time). When a signal is received it processes the saved events and dumps them to stdout versus writing a perf.data file.
>
> David
>

Hi, David,

Yes, I know that your sched daemon can store and print info when the signal triggers,
however, what I mean 'makes some count' is: sched daemon parses and processes the events
to extract the tracing info related with sched, rather than a general use of perf.data
like "perf script", "perf report", "perf data convert --to-ctf", etc. And what I mean
'no sample recording and has nothing to do with perf.data' is: when perf receives a signal,
sched daemon uses timehist_print_summary and timehist_pstree to record those tracing info
related with sched to a new file rather than the raw perf event records in the perf.data.
We can not use those files generated by sched daemon to enjoy the strong functions like how
perf.data can be used in "perf script", "perf report", "perf data convert --to-ctf", etc.

Sched daemon is good, but it is carefully designed for specific use of perf sched. In general
case of perf record, with snapshot mode, we still want a perf.data as before. Your sched daemon
concurrently does the work of storing and sched-parsing action for each signal trigger. To get
a general style of perf.data, the sched-parsing semantic action may have to be removed.

--
Thanks,
Yunlong Song

2015-11-25 08:33:21

by Adrian Hunter

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 25/11/15 09:47, Wangnan (F) wrote:
>
>
> On 2015/11/25 15:22, Adrian Hunter wrote:
>> On 25/11/15 05:50, Wangnan (F) wrote:
>>>
>>> On 2015/11/24 23:20, Arnaldo Carvalho de Melo wrote:
>>>> Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
>>>>> On 11/24/15 7:00 AM, Yunlong Song wrote:
>>>>>> +static int record__write(struct record *rec, void *bf, size_t size)
>>>>>> +{
>>>>>> + if (rec->memory.size && memory_enabled) {
>>>>>> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
>>>>>> + pr_err("failed to write memory data, error: %m\n");
>>>>>> + return -1;
>>>>>> + }
>>>>>> + } else {
>>>>>> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
>>>>>> + pr_err("failed to write perf data, error: %m\n");
>>>>>> + return -1;
>>>>>> + }
>>>>>> + rec->bytes_written += size;
>>>>>> }
>>>>>>
>>>>>> - rec->bytes_written += size;
>>>>>> return 0;
>>>>>> }
>>>>>>
>>>>>> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int
>>>>>> idx)
>>>>>> if (old == head)
>>>>>> return 0;
>>>>>>
>>>>>> + memory_enabled = 1;
>>>>>> +
>>>>>> rec->samples++;
>>>>>>
>>>>>> size = head - old;
>>>>>> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int
>>>>>> idx)
>>>>>> md->prev = old;
>>>>>> perf_evlist__mmap_consume(rec->evlist, idx);
>>>>>> out:
>>>>>> + memory_enabled = 0;
>>>>>> return rc;
>>>>>> }
>>>>>>
>>>>> So you are basically ignoring all samples until SIGUSR2 is received. That
>>>> No, he is not, its just that his code is difficult to follow, has to be
>>>> rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
>>>> will..
>>>>
>>>>> means the resulting data file will have limited history of task events for
>>>> ... have a complete history of task events, since PERF_RECORD_FORK, etc
>>>> are not being ignored.
>>>>
>>>> No?
>>> Actually we are discussing about this problem.
>>>
>>> For such tracking events (PERF_RECORD_FORK...), we have dummy event so
>>> it is possible for us to receive tracking events from a separated
>>> channel, therefore we don't have to parse every events to pick those
>>> events out. Instead, we can process tracking events differently, then
>>> more interesting things can be done. For example, squashing those tracking
>>> events if it takes too much memory...
>>>
>>> Furthermore, there's another problem being discussed: if userspace
>>> ringbuffer
>>> is bytes based, parsing event is unavoidable. Without parsing event we are
>>> unable to find the new 'head' pointer when overwriting.
>> Have you considered trying to find the head by trial-and-error at the time
>> you make the snapshot i.e. look at the first 8 bytes (event records are 8
>> byte aligned) and see if it is a valid record header, if not try the next 8
>> bytes. When you find a real event record it should parse without error and
>> the subsequent events should all parse without error too, all the way to the
>> tail. Then you can use timestamps and compare the events byte-by-byte to
>> avoid overlaps between 2 snapshots.
>
> It seems not work. Now we have BPF output event, it is possible that a
> BPF program output anything through that event. Even if we have a magic
> in head of each event, we can't prevent BPF output event output that
> magic, except we introduce some 'escape' method to prevent BPF output
> event output some data pattern. So although might work in reallife,
> this solution is logically incorrect. Or am I miss someting?

When you find the head, all the events will parse correctly. It seems to me
highly unlikely that would happen if you guessed the head wrongly.
It is only incorrect if it gives the wrong result.

2015-11-25 08:57:18

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events



On 2015/11/25 16:27, Adrian Hunter wrote:
> On 25/11/15 09:47, Wangnan (F) wrote:
>>
>> On 2015/11/25 15:22, Adrian Hunter wrote:
>>> On 25/11/15 05:50, Wangnan (F) wrote:
>>>> On 2015/11/24 23:20, Arnaldo Carvalho de Melo wrote:
>>>>> Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
>>>>>> On 11/24/15 7:00 AM, Yunlong Song wrote:
>>>>>>> +static int record__write(struct record *rec, void *bf, size_t size)
>>>>>>> +{
>>>>>>> + if (rec->memory.size && memory_enabled) {
>>>>>>> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
>>>>>>> + pr_err("failed to write memory data, error: %m\n");
>>>>>>> + return -1;
>>>>>>> + }
>>>>>>> + } else {
>>>>>>> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
>>>>>>> + pr_err("failed to write perf data, error: %m\n");
>>>>>>> + return -1;
>>>>>>> + }
>>>>>>> + rec->bytes_written += size;
>>>>>>> }
>>>>>>>
>>>>>>> - rec->bytes_written += size;
>>>>>>> return 0;
>>>>>>> }
>>>>>>>
>>>>>>> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int
>>>>>>> idx)
>>>>>>> if (old == head)
>>>>>>> return 0;
>>>>>>>
>>>>>>> + memory_enabled = 1;
>>>>>>> +
>>>>>>> rec->samples++;
>>>>>>>
>>>>>>> size = head - old;
>>>>>>> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int
>>>>>>> idx)
>>>>>>> md->prev = old;
>>>>>>> perf_evlist__mmap_consume(rec->evlist, idx);
>>>>>>> out:
>>>>>>> + memory_enabled = 0;
>>>>>>> return rc;
>>>>>>> }
>>>>>>>
>>>>>> So you are basically ignoring all samples until SIGUSR2 is received. That
>>>>> No, he is not, its just that his code is difficult to follow, has to be
>>>>> rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
>>>>> will..
>>>>>
>>>>>> means the resulting data file will have limited history of task events for
>>>>> ... have a complete history of task events, since PERF_RECORD_FORK, etc
>>>>> are not being ignored.
>>>>>
>>>>> No?
>>>> Actually we are discussing about this problem.
>>>>
>>>> For such tracking events (PERF_RECORD_FORK...), we have dummy event so
>>>> it is possible for us to receive tracking events from a separated
>>>> channel, therefore we don't have to parse every events to pick those
>>>> events out. Instead, we can process tracking events differently, then
>>>> more interesting things can be done. For example, squashing those tracking
>>>> events if it takes too much memory...
>>>>
>>>> Furthermore, there's another problem being discussed: if userspace
>>>> ringbuffer
>>>> is bytes based, parsing event is unavoidable. Without parsing event we are
>>>> unable to find the new 'head' pointer when overwriting.
>>> Have you considered trying to find the head by trial-and-error at the time
>>> you make the snapshot i.e. look at the first 8 bytes (event records are 8
>>> byte aligned) and see if it is a valid record header, if not try the next 8
>>> bytes. When you find a real event record it should parse without error and
>>> the subsequent events should all parse without error too, all the way to the
>>> tail. Then you can use timestamps and compare the events byte-by-byte to
>>> avoid overlaps between 2 snapshots.
>> It seems not work. Now we have BPF output event, it is possible that a
>> BPF program output anything through that event. Even if we have a magic
>> in head of each event, we can't prevent BPF output event output that
>> magic, except we introduce some 'escape' method to prevent BPF output
>> event output some data pattern. So although might work in reallife,
>> this solution is logically incorrect. Or am I miss someting?
> When you find the head, all the events will parse correctly. It seems to me
> highly unlikely that would happen if you guessed the head wrongly.
> It is only incorrect if it gives the wrong result.

Right, so I said it might work in reallife. However, I think we
should better to try to provide some logically correct solution.
Also, 'guessing' means some sort of intelligence, or how do we
deal with guessing error? Simply drop them?

And what's your opinion on the bucket besed ring buffer? With that
design we only need to maintain a ringbuffer of pointers. It should
be much simpler. The only drawback I can image is the waste of memory
because we have to alloc buckets pessimistically. Do you think
that method have other problem I haven't considered?

Thank you.

2015-11-25 09:09:03

by Adrian Hunter

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 25/11/15 10:43, Wangnan (F) wrote:
>
>
> On 2015/11/25 16:27, Adrian Hunter wrote:
>> On 25/11/15 09:47, Wangnan (F) wrote:
>>>
>>> On 2015/11/25 15:22, Adrian Hunter wrote:
>>>> On 25/11/15 05:50, Wangnan (F) wrote:
>>>>> On 2015/11/24 23:20, Arnaldo Carvalho de Melo wrote:
>>>>>> Em Tue, Nov 24, 2015 at 08:06:41AM -0700, David Ahern escreveu:
>>>>>>> On 11/24/15 7:00 AM, Yunlong Song wrote:
>>>>>>>> +static int record__write(struct record *rec, void *bf, size_t size)
>>>>>>>> +{
>>>>>>>> + if (rec->memory.size && memory_enabled) {
>>>>>>>> + if (perf_memory__write(&rec->memory, bf, size) < 0) {
>>>>>>>> + pr_err("failed to write memory data, error: %m\n");
>>>>>>>> + return -1;
>>>>>>>> + }
>>>>>>>> + } else {
>>>>>>>> + if (perf_data_file__write(rec->session->file, bf, size) < 0) {
>>>>>>>> + pr_err("failed to write perf data, error: %m\n");
>>>>>>>> + return -1;
>>>>>>>> + }
>>>>>>>> + rec->bytes_written += size;
>>>>>>>> }
>>>>>>>>
>>>>>>>> - rec->bytes_written += size;
>>>>>>>> return 0;
>>>>>>>> }
>>>>>>>>
>>>>>>>> @@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int
>>>>>>>> idx)
>>>>>>>> if (old == head)
>>>>>>>> return 0;
>>>>>>>>
>>>>>>>> + memory_enabled = 1;
>>>>>>>> +
>>>>>>>> rec->samples++;
>>>>>>>>
>>>>>>>> size = head - old;
>>>>>>>> @@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec,
>>>>>>>> int
>>>>>>>> idx)
>>>>>>>> md->prev = old;
>>>>>>>> perf_evlist__mmap_consume(rec->evlist, idx);
>>>>>>>> out:
>>>>>>>> + memory_enabled = 0;
>>>>>>>> return rc;
>>>>>>>> }
>>>>>>>>
>>>>>>> So you are basically ignoring all samples until SIGUSR2 is received.
>>>>>>> That
>>>>>> No, he is not, its just that his code is difficult to follow, has to be
>>>>>> rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
>>>>>> will..
>>>>>>
>>>>>>> means the resulting data file will have limited history of task
>>>>>>> events for
>>>>>> ... have a complete history of task events, since PERF_RECORD_FORK, etc
>>>>>> are not being ignored.
>>>>>>
>>>>>> No?
>>>>> Actually we are discussing about this problem.
>>>>>
>>>>> For such tracking events (PERF_RECORD_FORK...), we have dummy event so
>>>>> it is possible for us to receive tracking events from a separated
>>>>> channel, therefore we don't have to parse every events to pick those
>>>>> events out. Instead, we can process tracking events differently, then
>>>>> more interesting things can be done. For example, squashing those tracking
>>>>> events if it takes too much memory...
>>>>>
>>>>> Furthermore, there's another problem being discussed: if userspace
>>>>> ringbuffer
>>>>> is bytes based, parsing event is unavoidable. Without parsing event we are
>>>>> unable to find the new 'head' pointer when overwriting.
>>>> Have you considered trying to find the head by trial-and-error at the time
>>>> you make the snapshot i.e. look at the first 8 bytes (event records are 8
>>>> byte aligned) and see if it is a valid record header, if not try the next 8
>>>> bytes. When you find a real event record it should parse without error and
>>>> the subsequent events should all parse without error too, all the way to
>>>> the
>>>> tail. Then you can use timestamps and compare the events byte-by-byte to
>>>> avoid overlaps between 2 snapshots.
>>> It seems not work. Now we have BPF output event, it is possible that a
>>> BPF program output anything through that event. Even if we have a magic
>>> in head of each event, we can't prevent BPF output event output that
>>> magic, except we introduce some 'escape' method to prevent BPF output
>>> event output some data pattern. So although might work in reallife,
>>> this solution is logically incorrect. Or am I miss someting?
>> When you find the head, all the events will parse correctly. It seems to me
>> highly unlikely that would happen if you guessed the head wrongly.
>> It is only incorrect if it gives the wrong result.
>
> Right, so I said it might work in reallife. However, I think we
> should better to try to provide some logically correct solution.
> Also, 'guessing' means some sort of intelligence, or how do we
> deal with guessing error? Simply drop them?

It is not "intelligence" it is a linear search. If it gives more than one
answer, it is a fatal error. You can mitigate that by adding more
validation of the event records.

But it is only a suggestion.

> And what's your opinion on the bucket besed ring buffer? With that
> design we only need to maintain a ringbuffer of pointers. It should
> be much simpler. The only drawback I can image is the waste of memory
> because we have to alloc buckets pessimistically. Do you think
> that method have other problem I haven't considered?

The drawback is that you have to copy all the events all the time instead of
letting the kernel ring buffer wraparound without any userspace involvement
until you make a snapshot.

2015-11-25 09:27:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On Tue, Nov 24, 2015 at 10:00:31PM +0800, Yunlong Song wrote:
> In our patch, we create and maintain a user space ring buffer to store
> perf's tracing info, instead of directly writing to perf.data file as
> before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
> the tracing info currently stored in the user space ring buffer to
> perf.data file.

I would very much like to first fix the perf overwrite mode: see
lkml.kernel.org/r/[email protected]

2015-11-25 09:44:34

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events



On 2015/11/25 17:27, Peter Zijlstra wrote:
> On Tue, Nov 24, 2015 at 10:00:31PM +0800, Yunlong Song wrote:
>> In our patch, we create and maintain a user space ring buffer to store
>> perf's tracing info, instead of directly writing to perf.data file as
>> before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
>> the tracing info currently stored in the user space ring buffer to
>> perf.data file.
> I would very much like to first fix the perf overwrite mode: see
> lkml.kernel.org/r/[email protected]

I think they can be done in parallel. We can first do something with
tracking events and perf's output file, and wait for kernel level
overwrite mode fixed, then decide whether to implement perf's own
ringbuffer.

Thank you.

2015-11-25 12:20:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On Wed, Nov 25, 2015 at 05:44:00PM +0800, Wangnan (F) wrote:
>
>
> On 2015/11/25 17:27, Peter Zijlstra wrote:
> >On Tue, Nov 24, 2015 at 10:00:31PM +0800, Yunlong Song wrote:
> >>In our patch, we create and maintain a user space ring buffer to store
> >>perf's tracing info, instead of directly writing to perf.data file as
> >>before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
> >>the tracing info currently stored in the user space ring buffer to
> >>perf.data file.
> >I would very much like to first fix the perf overwrite mode: see
> >lkml.kernel.org/r/[email protected]
>
> I think they can be done in parallel. We can first do something with
> tracking events and perf's output file, and wait for kernel level
> overwrite mode fixed, then decide whether to implement perf's own
> ringbuffer.

That seems backwards; why would you ever want to endlessly copy the
events if you're not going to use them?

2015-11-25 12:45:00

by Yunlong Song

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

On 2015/11/24 22:30, Arnaldo Carvalho de Melo wrote:

> Looks like an interesting feature, if for no other reason, to match what
> we have for aux area tracing.
>
> But perhaps having an event in the stream itself that would signal when
> to dump the snapshot would be a better approach?
>
> One that perhaps you create with 'perf probe' or a tracepoint, both
> could use the in-kernel filtering capabilities to specify when to
> trigger that snapshot?
>
> If this snapshotting could happen in the kernel, that would be even
> better, i.e. no need for two buffers (kernel rb + userspace rb) for
> achieving this? Anyway, see below some comments.

Hi, Arnaldo,

Thanks for your suggestions and comments. As for the suggestion of snapshot
mode implemented in kernel ring buffer rather than user space ring buffer, I
have thought about that before I made this patch, the problems are:

1. Memory Handle

If snapshot mode implemented in kernel space ring buffer, what's proper size of
that ring buffer? If the size is small (such as 512k), it may lose the older
part of target info which is really related with the reason causing that snapshot.
For example, a temporary process launches and runs for a while, and it cannot
terminate properly in the end. The real reason is that something is wrong during
the launching stage, however, we do not know this prior knowledge and has to
record all the information from the beginning to the end of this process's liftime.
By analyzing the log of perf.data, we can finally figure out that the problem exists
in the launching stage. Here if kernel space ring buffer is not big enough, we will
lose the older part of the logs.

In this case, we may have to set the size of kernel space ring buffer big enough, but
can we kmalloc or vmalloc such big size easily in kernel space in any case? There
may be some limitations to do this. Since allocating memory in the kernel is not as
simple as allocating memory in user space. From this point of view, I prefer to use
user space ring buffer to store the records. It's more flexible and feasible to handle
memory.

2. Overwrite Overhead

Without overwrite feature, snapshot mode implemented in kernel space ring buffer
will also lose target info. For example, the mmap page of kernel space ring buffer
is not full at time A. The target info is filled to this mmap page and then it is
full at time B, then it wakes up perf to mmap_read all its records. In snapshot mode,
all the records are totally dropped without writing to perf.data. Later at time C,
a signal triggers the perf to dump, but the older part of target info was lost at
time B. This is because kernel space ring buffer does not overwrite itself, and dump
all its content once a page is full.

So, we need "overwrite" support, but it needs further fixes and the corresponding patch
is not applied to perf now probably due to its executive overhead (in the critical path,
not very sure yet).

Thus instead of incurring any probable overhead in the kernel space and thus may cause
any problems, I prefer to implement the "overwrite" feature in the user space ring
buffer.

3. Metadata Handle

Even if problem 2 above is solved, i.e., we finally fix the overwrite feature of kernel space
ring buffer. What about metadata handling (mmap_event, comm_event, fork_event, exit_event,
etc.)? When it's time to overwrite this info during the wraparound procedure, this metadata
events have to be handled separately to wake up perf (trigger the poll) to write to perf.data.
This may increase complexity in the kernel space. However, as for the user space ring buffer
version, it directly writes to perf.data once it notices the metadata event, no action of waking
up perf by poll at all. Since the user space ring buffer design has to parse each event to
figure out its size to skip when overwriting, it can easily figure out the event type by the
way (which event is perf_record_sample, and which event is metadata).

However, for this problem, there is a under-discussing solution. As what Wang Nan replies, "For
such tracking events (PERF_RECORD_FORK...), we have dummy event so it is possible for us to receive
tracking events from a separated channel, therefore we don't have to parse every events to pick
those events out."

4. Trigger Semantic

As what you mentioned, snapshot mode implemented in kernel space ring buffer "can" use the in-kernel
filtering capabilities to specify when to trigger that snapshot. In my personal opinion, I regard
the in-kernel filtering capabilities as something all about controlling, or say, programming with
the tracepoints/trace events, which does not relate to the semantic of snapshot. You can use the
in-kernel filtering capabilities (such as eBPF) to enable/disable tracing, filter tracing, aggregate
tracing, but all this tracing info should be written to perf.data normally. However, we can decide
when to really do this writing action via snapshot mode. Snapshot mode is just a semantic in the
user space from a high level view. Without snapshot mode, those "in-kernel filtering capabilities"
can still work for the regular perf. In the production case, apps can send SIGUSR2 signal to perf to
easily record the most useful info which is strongly related with the apps performance problem, no
redundant info any more (only collect the trace at the time any problem happens).


> How about: snapshot_ring_buffer? If that is deemed too big, perhaps
> snapshot_rb?
>
> Please continue following the existing convention and name this:
>
> static int record__write_rb()
>
> As the buffer you're writing is a ring buffer, thus needs checking about
> wrap around, like you do here.
>
> This really looks hackish, using a global to change the behaviour of
> some existing function, and a volatile at that, ouch, please change the
> prototypes in some way to avoid globals like this.

Thanks for your careful review and helpful comments, my patch stands for certain kind of design for
snapshot mode. And other people may have suggestions and different designs, let's talk and compare
different designs first. If the design of this patch is taken finally, I will rewrite it then.

--
Thanks,
Yunlong Song

2015-11-25 12:55:15

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events



On 2015/11/25 20:20, Peter Zijlstra wrote:
> On Wed, Nov 25, 2015 at 05:44:00PM +0800, Wangnan (F) wrote:
>>
>> On 2015/11/25 17:27, Peter Zijlstra wrote:
>>> On Tue, Nov 24, 2015 at 10:00:31PM +0800, Yunlong Song wrote:
>>>> In our patch, we create and maintain a user space ring buffer to store
>>>> perf's tracing info, instead of directly writing to perf.data file as
>>>> before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
>>>> the tracing info currently stored in the user space ring buffer to
>>>> perf.data file.
>>> I would very much like to first fix the perf overwrite mode: see
>>> lkml.kernel.org/r/[email protected]
>> I think they can be done in parallel. We can first do something with
>> tracking events and perf's output file, and wait for kernel level
>> overwrite mode fixed, then decide whether to implement perf's own
>> ringbuffer.
> That seems backwards; why would you ever want to endlessly copy the
> events if you're not going to use them?

I agree that we need to fixing overwrite mode. However, user
space ringbuffer can be more flexible. for example, dynamically
shrinking and expansion. It would be hard in kernel I think,
and I'm not sure how much flexibility we need. Doing things
in kernel always more difficult than in userspace.

But yes, we can do that userspace ring buffer when we really
need it. At very first we can start working on perf side and
assume overwrite mode is ready.

Thank you.

2015-11-26 09:19:24

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events


* Wangnan (F) <[email protected]> wrote:

>
>
> On 2015/11/25 20:20, Peter Zijlstra wrote:
> >On Wed, Nov 25, 2015 at 05:44:00PM +0800, Wangnan (F) wrote:
> >>
> >>On 2015/11/25 17:27, Peter Zijlstra wrote:
> >>>On Tue, Nov 24, 2015 at 10:00:31PM +0800, Yunlong Song wrote:
> >>>>In our patch, we create and maintain a user space ring buffer to store
> >>>>perf's tracing info, instead of directly writing to perf.data file as
> >>>>before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
> >>>>the tracing info currently stored in the user space ring buffer to
> >>>>perf.data file.
> >>>I would very much like to first fix the perf overwrite mode: see
> >>>lkml.kernel.org/r/[email protected]
> >>I think they can be done in parallel. We can first do something with
> >>tracking events and perf's output file, and wait for kernel level
> >>overwrite mode fixed, then decide whether to implement perf's own
> >>ringbuffer.
> >That seems backwards; why would you ever want to endlessly copy the
> >events if you're not going to use them?
>
> I agree that we need to fixing overwrite mode. However, user space ringbuffer
> can be more flexible. for example, dynamically shrinking and expansion. It would
> be hard in kernel I think, and I'm not sure how much flexibility we need. Doing
> things in kernel always more difficult than in userspace.
>
> But yes, we can do that userspace ring buffer when we really need it. At very
> first we can start working on perf side and assume overwrite mode is ready.

I don't think Peter asked for much: pick up the patch he has already written and
use it, to have an even lower overhead always-enabled background tracing mode of
perf.

Resizing shouldn't be much of an issue with existing features: if events start
overflowing or some other threshold for dynamic increase of the ring-buffer is met
then the daemon should open a new set of events with a larger ring-buffer, and
close the old events once the new tracing ring-buffer is up and running.

Use event multiplexing to output all interesting events into the same single (per
CPU) ring-buffer.

Thanks,

Ingo

2015-11-26 09:24:37

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events



On 2015/11/26 17:19, Ingo Molnar wrote:
> * Wangnan (F) <[email protected]> wrote:
>
>>
>> On 2015/11/25 20:20, Peter Zijlstra wrote:
>>> On Wed, Nov 25, 2015 at 05:44:00PM +0800, Wangnan (F) wrote:
>>>> On 2015/11/25 17:27, Peter Zijlstra wrote:
>>>>> On Tue, Nov 24, 2015 at 10:00:31PM +0800, Yunlong Song wrote:
>>>>>> In our patch, we create and maintain a user space ring buffer to store
>>>>>> perf's tracing info, instead of directly writing to perf.data file as
>>>>>> before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
>>>>>> the tracing info currently stored in the user space ring buffer to
>>>>>> perf.data file.
>>>>> I would very much like to first fix the perf overwrite mode: see
>>>>> lkml.kernel.org/r/[email protected]
>>>> I think they can be done in parallel. We can first do something with
>>>> tracking events and perf's output file, and wait for kernel level
>>>> overwrite mode fixed, then decide whether to implement perf's own
>>>> ringbuffer.
>>> That seems backwards; why would you ever want to endlessly copy the
>>> events if you're not going to use them?
>> I agree that we need to fixing overwrite mode. However, user space ringbuffer
>> can be more flexible. for example, dynamically shrinking and expansion. It would
>> be hard in kernel I think, and I'm not sure how much flexibility we need. Doing
>> things in kernel always more difficult than in userspace.
>>
>> But yes, we can do that userspace ring buffer when we really need it. At very
>> first we can start working on perf side and assume overwrite mode is ready.
> I don't think Peter asked for much: pick up the patch he has already written and
> use it, to have an even lower overhead always-enabled background tracing mode of
> perf.
>
> Resizing shouldn't be much of an issue with existing features: if events start
> overflowing or some other threshold for dynamic increase of the ring-buffer is met
> then the daemon should open a new set of events with a larger ring-buffer, and
> close the old events once the new tracing ring-buffer is up and running.
>
> Use event multiplexing to output all interesting events into the same single (per
> CPU) ring-buffer.

Thank you for your reply. We'll start looking Peter's patch and try
to find what should we do to make it upstream faster. Could you please
kindly give some hints?

2015-11-26 09:27:46

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events


* Ingo Molnar <[email protected]> wrote:

> > But yes, we can do that userspace ring buffer when we really need it. At very
> > first we can start working on perf side and assume overwrite mode is ready.
>
> I don't think Peter asked for much: pick up the patch he has already written and
> use it, to have an even lower overhead always-enabled background tracing mode of
> perf.
>
> Resizing shouldn't be much of an issue with existing features: if events start
> overflowing or some other threshold for dynamic increase of the ring-buffer is
> met then the daemon should open a new set of events with a larger ring-buffer,
> and close the old events once the new tracing ring-buffer is up and running.
>
> Use event multiplexing to output all interesting events into the same single
> (per CPU) ring-buffer.

Btw., there's another trick we could use to support ftrace-alike workflows even
better: we could expose a task's active perf ring-buffers under /proc/<PID>/ and
could make it readable.

So if an overwrite-mode background tracing session is running, you don't even have
to signal it to capture the ring-buffer: just open the ring-buffer fd in procfs,
under /proc/XYZ/perf/ring-buffers/5.trace or so, and dump its current contents,
assuming the task doing that has sufficient permissions - i.e.
ptrace_may_access().

We could even pretty-print some very basic version of the records from the kernel,
via /proc/XYZ/perf/ring-buffers/5.txt, to support a tooling-less tracing modes.
This way perf based tracing could be supported even on systems that have no
writable filesystems.

I.e. in this regard perf can be made to match ftrace's tracing workflow as well -
in addition to the more traditional perf profiling workflow we all love and know!

Thanks,

Ingo

2015-11-26 09:41:07

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events


* Ingo Molnar <[email protected]> wrote:

>
> * Ingo Molnar <[email protected]> wrote:
>
> > > But yes, we can do that userspace ring buffer when we really need it. At
> > > very first we can start working on perf side and assume overwrite mode is
> > > ready.
> >
> > I don't think Peter asked for much: pick up the patch he has already written
> > and use it, to have an even lower overhead always-enabled background tracing
> > mode of perf.
> >
> > Resizing shouldn't be much of an issue with existing features: if events start
> > overflowing or some other threshold for dynamic increase of the ring-buffer is
> > met then the daemon should open a new set of events with a larger ring-buffer,
> > and close the old events once the new tracing ring-buffer is up and running.
> >
> > Use event multiplexing to output all interesting events into the same single
> > (per CPU) ring-buffer.
>
> Btw., there's another trick we could use to support ftrace-alike workflows even
> better: we could expose a task's active perf ring-buffers under /proc/<PID>/ and
> could make it readable.
>
> So if an overwrite-mode background tracing session is running, you don't even
> have to signal it to capture the ring-buffer: just open the ring-buffer fd in
> procfs, under /proc/XYZ/perf/ring-buffers/5.trace or so, and dump its current
> contents, assuming the task doing that has sufficient permissions - i.e.
> ptrace_may_access().
>
> We could even pretty-print some very basic version of the records from the
> kernel, via /proc/XYZ/perf/ring-buffers/5.txt, to support a tooling-less tracing
> modes. This way perf based tracing could be supported even on systems that have
> no writable filesystems.
>
> I.e. in this regard perf can be made to match ftrace's tracing workflow as well
> - in addition to the more traditional perf profiling workflow we all love and
> know!

Also note that if we go in this direction then with some additional changes we
could also support lightweight tracing with no tooling side at all on the traced
system: a simple kernel feature with a kernel thread could be added that takes a
list of events from sysfs or debugfs and opens them system-wide and exposes
per-cpu overwrite mode ring-buffers.

Those ring-buffers can then be accessed via procfs (and/or also be exposed in
parallel via debugfs). The kernel thread never actually does anything except set
up the events - i.e. this is a very lightweight mode of always-on tracing.

Additional debugfs toggles can be added to temporarily turn tracing on/off without
closing the events - just like ftrace.

Other toggles could be added, such as: 'stop tracing when the kernel has crashed,
or if a specific event has occured or a condition has been met'.

That way we could, among other things, capture traces on embedded systems and copy
the traces to another, larger system (or NFS-mount the target system), and run
perf tooling to analyze the traces on that more powerful system.

But it all starts with making overwrite mode work well, and working with the
kernel visible ring-buffer. That can then be exposed to user-space in very
expressive ways to turn perf into a flexible system tracing subsystem as well.

Thanks,

Ingo

2015-11-26 09:57:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events


* Ingo Molnar <[email protected]> wrote:

> So if an overwrite-mode background tracing session is running, you don't even have
> to signal it to capture the ring-buffer: just open the ring-buffer fd in procfs,
> under /proc/XYZ/perf/ring-buffers/5.trace or so, and dump its current contents,
> assuming the task doing that has sufficient permissions - i.e.
> ptrace_may_access().
>
> We could even pretty-print some very basic version of the records from the kernel,
> via /proc/XYZ/perf/ring-buffers/5.txt, to support a tooling-less tracing modes.
> This way perf based tracing could be supported even on systems that have no
> writable filesystems.

With this 'cat' could be used to look at the current trace:

cat /proc/XYZ/perf/ring-buffers/5.txt

would result in 'perf script' alike output generated by the kernel:

$ cat /proc/XYZ/perf/ring-buffers/5.txt
perf 24816 63796.780079: 1 cycles:pp: ffffffff810603f8 native_write_msr_safe
perf 24816 63796.780083: 1 cycles:pp: ffffffff810603f8 native_write_msr_safe
perf 24816 63796.780086: 8 cycles:pp: ffffffff810603f8 native_write_msr_safe
perf 24816 63796.780089: 97 cycles:pp: ffffffff810603f8 native_write_msr_safe
perf 24816 63796.780092: 1237 cycles:pp: ffffffff8103450c intel_pmu_handle_irq
perf 24816 63796.780094: 13879 cycles:pp: ffffffff81204f23 setup_new_exec
sh 24816 63796.780104: 170378 cycles:pp: ffffffff811bc437 change_protection_range
sh 24816 63796.780145: 698206 cycles:pp: ffffffff813d22d7 clear_page_c_e
sh 24816 63796.780304: 1145748 cycles:pp: 7f60aca20bdb _dl_addr
sh 24817 63796.780400: 1 cycles:pp: ffffffff810603f8 native_write_msr_safe
sh 24817 63796.780403: 1 cycles:pp: ffffffff810603f8 native_write_msr_safe
sh 24817 63796.780406: 10 cycles:pp: ffffffff810603f8 native_write_msr_safe
sh 24817 63796.780409: 118 cycles:pp: ffffffff810603f8 native_write_msr_safe

... and you could use shell scripting to analyze it - just like with ftrace.

of course this would be simplified output - and you could still access or copy the
raw trace data as well, and use all the rich tooling and visualization features of
full perf.

Thanks,

Ingo

2015-12-02 08:27:10

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH] perf record: Add snapshot mode support for perf's regular events

Hi Peter,

On 2015/11/25 17:27, Peter Zijlstra wrote:
> On Tue, Nov 24, 2015 at 10:00:31PM +0800, Yunlong Song wrote:
>> In our patch, we create and maintain a user space ring buffer to store
>> perf's tracing info, instead of directly writing to perf.data file as
>> before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
>> the tracing info currently stored in the user space ring buffer to
>> perf.data file.
> I would very much like to first fix the perf overwrite mode: see
> lkml.kernel.org/r/[email protected]
I have an idea on this problem that, is it possible to
put the size of an event at the end of it when we working
on overwrite mode? So we can backtrack every events without
too much change to ring buffer?

Thank you.

2015-12-02 13:40:43

by Wang Nan

[permalink] [raw]
Subject: [RFC PATCH] perf/core: Put size of a sample at the end of it

This is an RFC patch which is for overwrite mode ring buffer. I'd like
to discuss the correctness of this new idea for retriving as many
events as possible from overwrite mode ring buffer. If there's no
fundamental problem, I'll start perf side work.

The biggest problem for overwrite ring buffer is that it is hard to find
the start position of valid record. [1] and [2] tries to solve this
problem by introducing 'tail' and 'next_tail' into metadata page, and
update them each time the ring buffer is half full. Which adds more
instructions to event output code path, hurt performance. In addition,
even with them we still unable to recover all possible records. For
example:

data_tail head
| |
V V
+------+-------+----------+------+---+
| A | B | C | D | |
+------+-------+----------+------+---+

If a record written at head pointer and it overwrites record A:

head data_tail
| |
V V
+--+---+-------+----------+------+---+
|E |...| B | C | D | E |
+--+---+-------+----------+------+---+

Record B is still valid but we can't get it through data_tail.

This patch suggests a different solution for this problem that, by
appending the length of a record at the end of it, user program is
possible to get every possible record in a backward manner, don't
need saving tail pointer.

For example:

head
|
V
+--+---+-------+----------+------+---+
|E6|...| B 8| C 11| D 7|E..|
+--+---+-------+----------+------+---+

In this case, from the 'head' pointer provided by kernel, user program
can first see '6' by (*(head - sizeof(u16))), then it can get the start
pointer of record 'E', then it can read size and find start position
of record D, C, B in similar way.

Kernel side implementation is easy: simply adding a PERF_SAMPLE_SIZE
for the size output.

This sloution requires user program (perf) do more things. At least
following things and limitations should be considered:

1. Before reading such ring buffer, perf must ensure all events which
may output to it is already stopped, so the 'head' pointer it get
is the end of the last record.

2. We must ensure all events attached this ring buffer has
'PERF_SAMPLE_SIZE' selected.

3. There must no tracking events output to this ring buffer.

4. 2 bytes extra space is required for each record.

Further improvement can be taken:

1. If PERF_SAMPLE_SIZE is selected, we can avoid outputting the event
size in header. Which eliminate extra space cose;

2. We can find a way to append size information for tracking events
also.

[1] http://lkml.kernel.org/r/[email protected]
[2] http://lkml.kernel.org/r/[email protected]

Signed-off-by: Wang Nan <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Yunlong Song <[email protected]>
---
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 6 ++++++
2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe962..c4066da 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_SIZE = 1U << 19,

- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};

/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5854fcf..bbbacec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5473,6 +5473,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}

+ if (sample_type & PERF_SAMPLE_SIZE)
+ perf_output_put(handle, header->size);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -5592,6 +5595,9 @@ void perf_prepare_sample(struct perf_event_header *header,

header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_SIZE)
+ header->size += sizeof(u16);
}

void perf_event_output(struct perf_event *event,
--
1.8.3.4

2015-12-03 10:08:17

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH] perf/core: Put size of a sample at the end of it

On Wed, Dec 02, 2015 at 01:38:19PM +0000, Wang Nan wrote:
> This sloution requires user program (perf) do more things. At least
> following things and limitations should be considered:
>
> 1. Before reading such ring buffer, perf must ensure all events which
> may output to it is already stopped, so the 'head' pointer it get
> is the end of the last record.

Right, this is tricky, this would not allow two snapshots to happen back
to back since that would then result in a bunch of missed events.

Aside from this issue its a rather nice idea.

> 2. We must ensure all events attached this ring buffer has
> 'PERF_SAMPLE_SIZE' selected.

That can be easily enforced.

> 3. There must no tracking events output to this ring buffer.

That is rather unfortunate, we'd best fix that up.

> 4. 2 bytes extra space is required for each record.

8, perf records must be 8 byte aligned and sized.

> Further improvement can be taken:
>
> 1. If PERF_SAMPLE_SIZE is selected, we can avoid outputting the event
> size in header. Which eliminate extra space cose;

That would mandate you always parse the stream backwards. Which seems
rather unfortunate. Also, no you cannot recoup the extra space, see the
alignment and size requirement.

> 2. We can find a way to append size information for tracking events
> also.

The !sample records you mean? Yes those had better have them too.

2015-12-03 10:33:38

by Wang Nan

[permalink] [raw]
Subject: Re: [RFC PATCH] perf/core: Put size of a sample at the end of it



On 2015/12/3 18:08, Peter Zijlstra wrote:
> On Wed, Dec 02, 2015 at 01:38:19PM +0000, Wang Nan wrote:
>> This sloution requires user program (perf) do more things. At least
>> following things and limitations should be considered:
>>
>> 1. Before reading such ring buffer, perf must ensure all events which
>> may output to it is already stopped, so the 'head' pointer it get
>> is the end of the last record.
> Right, this is tricky, this would not allow two snapshots to happen back
> to back since that would then result in a bunch of missed events.
>
> Aside from this issue its a rather nice idea.

Thank you for your attitude. We can start consider it seriously.

Now I'm working on perf side code to make a workable prototype.

>> 2. We must ensure all events attached this ring buffer has
>> 'PERF_SAMPLE_SIZE' selected.
> That can be easily enforced.

Yes.

>> 3. There must no tracking events output to this ring buffer.
> That is rather unfortunate, we'd best fix that up.
>
>> 4. 2 bytes extra space is required for each record.
> 8, perf records must be 8 byte aligned and sized.

So does it means we need to pad before outputing size?

>> Further improvement can be taken:
>>
>> 1. If PERF_SAMPLE_SIZE is selected, we can avoid outputting the event
>> size in header. Which eliminate extra space cose;
> That would mandate you always parse the stream backwards. Which seems
> rather unfortunate. Also, no you cannot recoup the extra space, see the
> alignment and size requirement.

That's good. We don't need to consider this :)

Before receiving your comment I'm thinking about modifying
DEFINE_OUTPUT_COPY() to write first sizeof(header) bytes at the
end of reserved area, so it would work automatically for every
possible events.

>> 2. We can find a way to append size information for tracking events
>> also.
> The !sample records you mean? Yes those had better have them too.

Thank you.

2015-12-07 13:30:01

by Wang Nan

[permalink] [raw]
Subject: [RFC PATCH v2 0/3] perf core/perf tools: Utilizing overwrite ring buffer

This patch set explores the idea shown in [1], which puts size of every
events at the end of them in ring buffer so user space tool like perf
can parse the ring buffer backward and find the oldest event in it.

In this version:

- Kernel side, rename PERF_SAMPLE_SIZE to PERF_SAMPLE_SIZE_AT_END,
output 8 bytes to meet the alignment requirement.

- Perf side, provide a prototype utilize this feature. With this
prototype users are allowed to capture the last events in an
overwrite ring buffer using:

# ./perf record -e dummy -e syscalls:*/overwrite/ ...

[1] http://lkml.kernel.org/g/[email protected]

Wang Nan (3):
perf/core: Put size of a sample at the end of it
perf tools: Enable overwrite settings
perf record: Find tail pointer through size at end of event

include/uapi/linux/perf_event.h | 3 +-
kernel/events/core.c | 9 +++++
tools/perf/builtin-record.c | 76 +++++++++++++++++++++++++++++++++++++++--
tools/perf/perf.h | 2 ++
tools/perf/util/evlist.c | 42 +++++++++++++++++------
tools/perf/util/evsel.c | 7 ++++
tools/perf/util/evsel.h | 3 ++
tools/perf/util/parse-events.c | 14 ++++++++
tools/perf/util/parse-events.h | 4 ++-
tools/perf/util/parse-events.l | 2 ++
10 files changed, 147 insertions(+), 15 deletions(-)

--
1.8.3.4

2015-12-07 13:32:35

by Wang Nan

[permalink] [raw]
Subject: [RFC PATCH v2 1/3] perf/core: Put size of a sample at the end of it

This is an RFC patch which is for overwrite mode ring buffer. I'd like
to discuss the correctness of this new idea for retriving as many
events as possible from overwrite mode ring buffer. If there's no
fundamental problem, I'll start perf side work.

The biggest problem for overwrite ring buffer is that it is hard to find
the start position of valid record. [1] and [2] tries to solve this
problem by introducing 'tail' and 'next_tail' into metadata page, and
update them each time the ring buffer is half full. Which adds more
instructions to event output code path, hurt performance. In addition,
even with them we still unable to recover all possible records. For
example:

data_tail head
| |
V V
+------+-------+----------+------+---+
| A | B | C | D | |
+------+-------+----------+------+---+

If a record written at head pointer and it overwrites record A:

head data_tail
| |
V V
+--+---+-------+----------+------+---+
|E |...| B | C | D | E |
+--+---+-------+----------+------+---+

Record B is still valid but we can't get it through data_tail.

This patch suggests a different solution for this problem that, by
appending the length of a record at the end of it, user program is
possible to get every possible record in a backward manner, don't
need saving tail pointer.

For example:

head
|
V
+--+---+-------+----------+------+---+
|E6|...| B 8| C 11| D 7|E..|
+--+---+-------+----------+------+---+

In this case, from the 'head' pointer provided by kernel, user program
can first see '6' by (*(head - sizeof(u16))), then it can get the start
pointer of record 'E', then it can read size and find start position
of record D, C, B in similar way.

Kernel side implementation is easy: simply adding a PERF_SAMPLE_SIZE_AT_END
for the size output.

This sloution requires user program (perf) do more things. At least
following things and limitations should be considered:

1. Before reading such ring buffer, perf must ensure all events which
may output to it is already stopped, so the 'head' pointer it get
is the end of the last record.

Further improvement can be taken:

1. We must ensure all events attached this ring buffer has
'PERF_SAMPLE_SIZE_AT_END' selected.

2. 8 bytes extra space is required for each record.

3. We can find a way to append size information for tracking events.

[1] http://lkml.kernel.org/r/[email protected]
[2] http://lkml.kernel.org/r/[email protected]

Signed-off-by: Wang Nan <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Yunlong Song <[email protected]>
---
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 9 +++++++++
2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe962..e79606c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_SIZE_AT_END = 1U << 19,

- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};

/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c3d61b9..cfe9336 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5452,6 +5452,12 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}

+ /* Should be the last one */
+ if (sample_type & PERF_SAMPLE_SIZE_AT_END) {
+ perf_output_skip(handle, sizeof(u64) - sizeof(header->size));
+ perf_output_put(handle, header->size);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -5571,6 +5577,9 @@ void perf_prepare_sample(struct perf_event_header *header,

header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_SIZE_AT_END)
+ header->size += sizeof(u64);
}

void perf_event_output(struct perf_event *event,
--
1.8.3.4

2015-12-07 13:30:09

by Wang Nan

[permalink] [raw]
Subject: [RFC PATCH v2 2/3] perf tools: Enable overwrite settings

This patch allows following config terms and option:

# perf record --overwrite ...

Globally set following events to overwrite;

# perf record --event cycles/overwrite/ ...
# perf record --event cycles/no-overwrite/ ...

Set specific events to be overwrite or no-overwrite.

Signed-off-by: Wang Nan <[email protected]>
---
tools/perf/builtin-record.c | 3 +++
tools/perf/perf.h | 2 ++
tools/perf/util/evsel.c | 4 ++++
tools/perf/util/evsel.h | 3 +++
tools/perf/util/parse-events.c | 14 ++++++++++++++
tools/perf/util/parse-events.h | 4 +++-
tools/perf/util/parse-events.l | 2 ++
7 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 2230b85..ec4135c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1061,6 +1061,9 @@ struct option __record_options[] = {
OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
&record.opts.no_inherit_set,
"child tasks do not inherit counters"),
+ OPT_BOOLEAN_SET(0, "overwrite", &record.opts.overwrite,
+ &record.opts.overwrite_set,
+ "use overwrite mode"),
OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
"number of mmap data pages and AUX area tracing mmap pages",
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 90129ac..43d79fb 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -58,6 +58,8 @@ struct record_opts {
bool full_auxtrace;
bool auxtrace_snapshot_mode;
bool record_switch_events;
+ bool overwrite;
+ bool overwrite_set;
unsigned int freq;
unsigned int mmap_pages;
unsigned int auxtrace_mmap_pages;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 4dee8e3..3386437 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -668,6 +668,9 @@ static void apply_config_terms(struct perf_evsel *evsel,
*/
attr->inherit = term->val.inherit ? 1 : 0;
break;
+ case PERF_EVSEL__CONFIG_TERM_OVERWRITE:
+ evsel->overwrite = term->val.overwrite ? 1 : 0;
+ break;
default:
break;
}
@@ -743,6 +746,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)

attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1;
attr->inherit = !opts->no_inherit;
+ evsel->overwrite = opts->overwrite;

perf_evsel__set_sample_bit(evsel, IP);
perf_evsel__set_sample_bit(evsel, TID);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 51bab0f..c3b49e0 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -44,6 +44,7 @@ enum {
PERF_EVSEL__CONFIG_TERM_CALLGRAPH,
PERF_EVSEL__CONFIG_TERM_STACK_USER,
PERF_EVSEL__CONFIG_TERM_INHERIT,
+ PERF_EVSEL__CONFIG_TERM_OVERWRITE,
PERF_EVSEL__CONFIG_TERM_MAX,
};

@@ -57,6 +58,7 @@ struct perf_evsel_config_term {
char *callgraph;
u64 stack_user;
bool inherit;
+ bool overwrite;
} val;
};

@@ -115,6 +117,7 @@ struct perf_evsel {
bool tracking;
bool per_pkg;
bool precise_max;
+ bool overwrite;
/* parse modifier helper */
int exclude_GH;
int nr_members;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index c485b32..f20cc81 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -855,6 +855,12 @@ do { \
case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
CHECK_TYPE_VAL(NUM);
break;
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ CHECK_TYPE_VAL(NUM);
+ break;
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
+ CHECK_TYPE_VAL(NUM);
+ break;
case PARSE_EVENTS__TERM_TYPE_NAME:
CHECK_TYPE_VAL(STR);
break;
@@ -892,6 +898,8 @@ static int config_term_tracepoint(struct perf_event_attr *attr,
case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
case PARSE_EVENTS__TERM_TYPE_INHERIT:
case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
return config_term_common(attr, term, err);
default:
if (err) {
@@ -961,6 +969,12 @@ do { \
case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 0 : 1);
break;
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 1 : 0);
+ break;
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
+ ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 0 : 1);
+ break;
default:
break;
}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index c34615f..29cc804 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -68,7 +68,9 @@ enum {
PARSE_EVENTS__TERM_TYPE_CALLGRAPH,
PARSE_EVENTS__TERM_TYPE_STACKSIZE,
PARSE_EVENTS__TERM_TYPE_NOINHERIT,
- PARSE_EVENTS__TERM_TYPE_INHERIT
+ PARSE_EVENTS__TERM_TYPE_INHERIT,
+ PARSE_EVENTS__TERM_TYPE_NOOVERWRITE,
+ PARSE_EVENTS__TERM_TYPE_OVERWRITE,
};

struct parse_events_array {
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 27d567f..2ef6f96 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -202,6 +202,8 @@ call-graph { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CALLGRAPH); }
stack-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_STACKSIZE); }
inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_INHERIT); }
no-inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); }
+overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_OVERWRITE); }
+no-overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOOVERWRITE); }
, { return ','; }
"/" { BEGIN(INITIAL); return '/'; }
{name_minus} { return str(yyscanner, PE_NAME); }
--
1.8.3.4

2015-12-07 13:30:32

by Wang Nan

[permalink] [raw]
Subject: [RFC PATCH v2 3/3] perf record: Find tail pointer through size at end of event

This is an RFC patch which roughly shows the usage of
PERF_SAMPLE_SIZE_AT_END introduced by previous patches. In this
prototype we can use 'perf record' to capture data in overwritable
ringbuffer:

# ./perf record -g --call-graph=dwarf,128 -m 1 -e dummy -e syscalls:*/overwrite/ dd if=/dev/zero of=/dev/null count=4096 bs=1
4096+0 records in
4096+0 records out
4096 bytes (4.1 kB) copied, 8.54486 s, 0.5 kB/s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.082 MB perf.data (9 samples) ]

# ./perf script -F comm,event
dd syscalls:sys_enter_write:
dd syscalls:sys_exit_write:
dd syscalls:sys_enter_write:
dd syscalls:sys_exit_write:
dd syscalls:sys_enter_write:
dd syscalls:sys_exit_write:
dd syscalls:sys_enter_close:
dd syscalls:sys_exit_close:
dd syscalls:sys_enter_exit_group:

# ls -l ./perf.data
-rw------- 1 root root 497438 Dec 7 12:48 ./perf.data

In this case perf uses 1 page for a overwritable ringbuffer. The result is
only the last 9 samples (see the sys_enter_exit_group) are captured.

# ./perf record -g --call-graph=dwarf,128 -m 1 -e dummy -e syscalls:*/overwrite/ dd if=/dev/zero of=/dev/null count=8192 bs=1
8192+0 records in
8192+0 records out
8192 bytes (8.2 kB) copied, 16.9867 s, 0.5 kB/s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.082 MB perf.data (9 samples) ]
# ls -l ./perf.data
-rw------- 1 root root 497438 Dec 7 12:51 ./perf.data

Issuing more syscalls doesn't causes perf.data size increasing. Still 9
samples are captured.

record__search_read_start() is the core function I want to show in
this patch, which walks through the whole ring buffer backward, locates
the head of each events by the 'size' field at the tail of each event.
Finally it get the oldest event in the ring buffer, then start dump
there.

Other parts in this patch are dirty tricks and should be rewritten.

Limitation in this patch: there must have a '-e dummy' with no
overwrite set as the first event. Following command causes error:

# ./perf record -e syscalls:*/overwrite/ ...

This is because we need this dummy event capture tracking events
like fork, mmap...

We'll go back to kernel to enforce the 'PERF_SAMPLE_SIZE_AT_END' flag
as mentioned in previous email:
1. Doesn't allow mixed events attached at one ring buffer with some
events has that flag but other events no;

2. Append size to tracking events (no-sample events) so event with
attr.tracking == 1 and PERF_SAMPLE_SIZE_AT_END set is acceptable.

>From these perf size work I find following improvements should be done:
1. Overwrite should not be a attribute of evlist, but an attribute of
evsel;

2. The principle of 'channel' should be introduced, so different
events can output things to different places. Which would be useful
for:

1) separate overwrite mapping and normal mapping,
2) allow outputting tracking events (through a dummy event) to an
isolated mmap area so it would be possible for 'perf record' run
as a daemon which periodically synthesizes those event without
parsing samples,
3) allow eBPF output event with no-buffer attribute set, so eBPF
programs would be possible to control behavior of perf, for
example, shut down events.

Signed-off-by: Wang Nan <[email protected]>
---
tools/perf/builtin-record.c | 73 +++++++++++++++++++++++++++++++++++++++++++--
tools/perf/util/evlist.c | 42 +++++++++++++++++++-------
tools/perf/util/evsel.c | 3 ++
3 files changed, 105 insertions(+), 13 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index ec4135c..3817a9a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -74,6 +74,54 @@ static int process_synthesized_event(struct perf_tool *tool,
return record__write(rec, event, event->header.size);
}

+static int record__search_read_start(struct perf_mmap *md, u64 head, u64 *pstart)
+{
+ unsigned char *data = md->base + page_size;
+ u64 evt_head = head;
+ u16 *pevt_size;
+
+ while (true) {
+ struct perf_event_header *pheader;
+
+ pevt_size = (void *)&data[(evt_head - sizeof(*pevt_size)) & md->mask];
+
+ if (*pevt_size % sizeof(u16) != 0) {
+ pr_err("strange event size: %d\n", (int)(*pevt_size));
+ return -1;
+ }
+
+ if (!*pevt_size) {
+ if (evt_head) {
+ pr_err("size is 0 but evt_head (0x%lx) not 0\n",
+ (unsigned long)evt_head);
+ return -1;
+ }
+ break;
+ }
+
+ if (evt_head < *pevt_size)
+ break;
+
+ evt_head -= *pevt_size;
+ if (evt_head + (md->mask + 1) < head) {
+ evt_head += *pevt_size;
+ pr_debug("evt_head=%lx, head=%lx, size=%d\n", evt_head, head, *pevt_size);
+ break;
+ }
+
+ pheader = (struct perf_event_header *)(&data[evt_head & md->mask]);
+
+ if (pheader->size != *pevt_size) {
+ pr_err("size mismatch: %d vs %d\n",
+ (int)pheader->size, (int)(*pevt_size));
+ return -1;
+ }
+ }
+
+ *pstart = evt_head;
+ return 0;
+}
+
static int record__mmap_read(struct record *rec, int idx)
{
struct perf_mmap *md = &rec->evlist->mmap[idx];
@@ -83,6 +131,17 @@ static int record__mmap_read(struct record *rec, int idx)
unsigned long size;
void *buf;
int rc = 0;
+ bool overwrite = rec->evlist->overwrite;
+ int err;
+
+ if (idx >= rec->evlist->nr_mmaps)
+ overwrite = !overwrite;
+
+ if (overwrite) {
+ err = record__search_read_start(md, head, &old);
+ if (err)
+ return 0;
+ }

if (old == head)
return 0;
@@ -400,7 +459,7 @@ static struct perf_event_header finished_round_event = {
.type = PERF_RECORD_FINISHED_ROUND,
};

-static int record__mmap_read_all(struct record *rec)
+static int __record__mmap_read_all(struct record *rec, bool next_half)
{
u64 bytes_written = rec->bytes_written;
int i;
@@ -408,9 +467,10 @@ static int record__mmap_read_all(struct record *rec)

for (i = 0; i < rec->evlist->nr_mmaps; i++) {
struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
+ int idx = i + (next_half ? rec->evlist->nr_mmaps : 0);

- if (rec->evlist->mmap[i].base) {
- if (record__mmap_read(rec, i) != 0) {
+ if (rec->evlist->mmap[idx].base) {
+ if (record__mmap_read(rec, idx) != 0) {
rc = -1;
goto out;
}
@@ -434,6 +494,11 @@ out:
return rc;
}

+static int record__mmap_read_all(struct record *rec)
+{
+ return __record__mmap_read_all(rec, false);
+}
+
static void record__init_features(struct record *rec)
{
struct perf_session *session = rec->session;
@@ -727,6 +792,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
}
auxtrace_snapshot_enabled = 0;

+ __record__mmap_read_all(rec, true);
+
if (forks && workload_exec_errno) {
char msg[STRERR_BUFSIZE];
const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 8dd59aa..a3421ee 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -800,7 +800,9 @@ void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
{
struct perf_mmap *md = &evlist->mmap[idx];

- if (!evlist->overwrite) {
+ if ((!evlist->overwrite && (idx < evlist->nr_mmaps)) ||
+ (evlist->overwrite && (idx >= evlist->nr_mmaps))) {
+
u64 old = md->prev;

perf_mmap__write_tail(md, old);
@@ -855,7 +857,7 @@ void perf_evlist__munmap(struct perf_evlist *evlist)
if (evlist->mmap == NULL)
return;

- for (i = 0; i < evlist->nr_mmaps; i++)
+ for (i = 0; i < 2 * evlist->nr_mmaps; i++)
__perf_evlist__munmap(evlist, i);

zfree(&evlist->mmap);
@@ -866,7 +868,7 @@ static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
evlist->nr_mmaps = cpu_map__nr(evlist->cpus);
if (cpu_map__empty(evlist->cpus))
evlist->nr_mmaps = thread_map__nr(evlist->threads);
- evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
+ evlist->mmap = zalloc(2 * evlist->nr_mmaps * sizeof(struct perf_mmap));
return evlist->mmap != NULL ? 0 : -ENOMEM;
}

@@ -897,6 +899,7 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
evlist->mmap[idx].mask = mp->mask;
evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, mp->prot,
MAP_SHARED, fd, 0);
+
if (evlist->mmap[idx].base == MAP_FAILED) {
pr_debug2("failed to mmap perf event ring buffer, error %d\n",
errno);
@@ -911,18 +914,31 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
return 0;
}

-static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
- struct mmap_params *mp, int cpu,
- int thread, int *output)
+static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int _idx,
+ struct mmap_params *_mp, int cpu,
+ int thread, int *output1, int *output2)
{
struct perf_evsel *evsel;
+ int *output;
+ struct mmap_params new_mp = *_mp;

evlist__for_each(evlist, evsel) {
int fd;
+ int idx = _idx;
+ struct mmap_params *mp = _mp;

if (evsel->system_wide && thread)
continue;

+ if (evsel->overwrite ^ evlist->overwrite) {
+ output = output2;
+ new_mp.prot ^= PROT_WRITE;
+ mp = &new_mp;
+ idx = _idx + evlist->nr_mmaps;
+ } else {
+ output = output1;
+ }
+
fd = FD(evsel, cpu, thread);

if (*output == -1) {
@@ -936,6 +952,8 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
perf_evlist__mmap_get(evlist, idx);
}

+ if (evsel->overwrite)
+ goto skip_poll_add;
/*
* The system_wide flag causes a selected event to be opened
* always without a pid. Consequently it will never get a
@@ -949,6 +967,8 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
return -1;
}

+skip_poll_add:
+
if (evsel->attr.read_format & PERF_FORMAT_ID) {
if (perf_evlist__id_add_fd(evlist, evsel, cpu, thread,
fd) < 0)
@@ -970,14 +990,15 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist,

pr_debug2("perf event ring buffer mmapped per cpu\n");
for (cpu = 0; cpu < nr_cpus; cpu++) {
- int output = -1;
+ int output1 = -1;
+ int output2 = -1;

auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, cpu,
true);

for (thread = 0; thread < nr_threads; thread++) {
if (perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu,
- thread, &output))
+ thread, &output1, &output2))
goto out_unmap;
}
}
@@ -998,13 +1019,14 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist,

pr_debug2("perf event ring buffer mmapped per thread\n");
for (thread = 0; thread < nr_threads; thread++) {
- int output = -1;
+ int output1 = -1;
+ int output2 = -1;

auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread,
false);

if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread,
- &output))
+ &output1, &output2))
goto out_unmap;
}

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 3386437..8e40da9 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -910,6 +910,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
* it overloads any global configuration.
*/
apply_config_terms(evsel, opts);
+
+ if (evsel->overwrite)
+ perf_evsel__set_sample_bit(evsel, SIZE_AT_END);
}

static int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
--
1.8.3.4