From: Kan Liang <[email protected]>
perf_top__mmap_read has a severe performance issue on
Knights Landing/Mill when monitoring a heavily loaded system. It can take
several minutes to finish, which is unacceptable.
Currently, perf top uses non-overwrite mode: it tries to read everything
in the ring buffer and does not pause it. When lots of samples are
delivered persistently, the processing time can become very long. Also,
the latest samples can be lost when the ring buffer is full.
It is better to change it to overwrite mode, which takes a snapshot of
the system by pausing the ring buffer and significantly reduces the
processing time (from several minutes to several seconds).
Overwrite mode also always keeps the latest samples.
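For illustration only, a minimal sketch of the snapshot-style read loop
this series moves perf top to, using the interfaces introduced in patches
2-4 (a fragment mirroring the builtin-top.c change in patch 7; 'md' is
the per-index overwrite mmap):

	u64 start, end;
	unsigned long size;
	union perf_event *event;

	/* overwrite mode: snapshot the paused ring buffer */
	if (perf_mmap__read_catchup(md, true, &start, &end, &size) < 1)
		return;
	while ((event = perf_mmap__read_event(md, true, &start, end)) != NULL) {
		/* process the event */
		perf_mmap__consume(md, true);
	}
	perf_mmap__read_done(md);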
Patches 1-4: Modify perf_mmap__read_catchup and perf_mmap__read_backward,
             introduce perf_mmap__read_event, and
             make the generic code ready for overwrite mode.
Patches 5-7: perf top should support only one consistent mode, either
             overwrite or non-overwrite, for all events.
             Switch the default mode to overwrite mode.
             If the kernel doesn't support overwrite mode, fall back to
             non-overwrite mode.
Patch 8:     The latency could still be higher than the refresh time in
             some extreme cases. Give the user some hints to reduce the
             latency.
Changes since V1:
- New patches 4-6
- Support both overwrite mode and non-overwrite mode.
  If the kernel doesn't support the default overwrite mode, fall back to
  non-overwrite mode.
Kan Liang (8):
perf tools: remove stale perf evlist mmap read for backward
perf tools: rewrite perf mmap read for overwrite
perf tools: reuse perf_mmap__read_catchup in perf_mmap__push
perf tools: introduce perf_mmap__read_event
perf top: check per event overwrite term
perf top: add overwrite fall back
perf top: switch default mode to overwrite mode
perf top: check the latency of perf_top__mmap_read
tools/perf/builtin-top.c | 139 +++++++++++++++++++++++++++----
tools/perf/tests/backward-ring-buffer.c | 9 +-
tools/perf/ui/browsers/hists.c | 12 ++-
tools/perf/util/evlist.c | 17 ----
tools/perf/util/evlist.h | 4 -
tools/perf/util/mmap.c | 140 ++++++++++++++++++--------------
tools/perf/util/mmap.h | 10 ++-
7 files changed, 229 insertions(+), 102 deletions(-)
--
2.5.5
From: Kan Liang <[email protected]>
perf_evlist__mmap_read_catchup and perf_evlist__mmap_read_backward are
only for overwrite mode, but they read the evlist->mmap buffer, which is
for non-overwrite mode.
This has not caused any serious problem yet, because nothing uses them.
Remove the unused interfaces.
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/util/evlist.c | 17 -----------------
tools/perf/util/evlist.h | 4 ----
2 files changed, 21 deletions(-)
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 3570355..4b6a06d 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -714,28 +714,11 @@ union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int
return perf_mmap__read_forward(md);
}
-union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx)
-{
- struct perf_mmap *md = &evlist->mmap[idx];
-
- /*
- * No need to check messup for backward ring buffer:
- * We can always read arbitrary long data from a backward
- * ring buffer unless we forget to pause it before reading.
- */
- return perf_mmap__read_backward(md);
-}
-
union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
{
return perf_evlist__mmap_read_forward(evlist, idx);
}
-void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx)
-{
- perf_mmap__read_catchup(&evlist->mmap[idx]);
-}
-
void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
{
perf_mmap__consume(&evlist->mmap[idx], false);
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 7516066..a80fd47 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -132,10 +132,6 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx);
union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist,
int idx);
-union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist,
- int idx);
-void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
-
void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx);
int perf_evlist__open(struct perf_evlist *evlist);
--
2.5.5
From: Kan Liang <[email protected]>
Switch to non-overwrite mode if the kernel does not support the
overwrite ring buffer.
This only takes effect once overwrite mode is supported; there is no
change to the current behavior.
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/builtin-top.c | 35 +++++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 5e15d27..7c462d6 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -931,6 +931,27 @@ static int perf_top_overwrite_check(struct perf_top *top)
return 0;
}
+static int perf_top_overwrite_fallback(struct perf_top *top,
+ struct perf_evsel *evsel)
+{
+ struct record_opts *opts = &top->record_opts;
+ struct perf_evlist *evlist = top->evlist;
+ struct perf_evsel *counter;
+
+ if (!opts->overwrite)
+ return 0;
+
+ /* only fall back when first event fails */
+ if (evsel != perf_evlist__first(evlist))
+ return 0;
+
+ evlist__for_each_entry(evlist, counter)
+ counter->attr.write_backward = false;
+ opts->overwrite = false;
+ ui__warning("fall back to non-overwrite mode\n");
+ return 1;
+}
+
static int perf_top__start_counters(struct perf_top *top)
{
char msg[BUFSIZ];
@@ -954,6 +975,20 @@ static int perf_top__start_counters(struct perf_top *top)
try_again:
if (perf_evsel__open(counter, top->evlist->cpus,
top->evlist->threads) < 0) {
+
+ /*
+ * Specially handle overwrite fall back.
+ * Because perf top is the only tool which has
+ * overwrite mode by default, support
+ * both overwrite and non-overwrite mode, and
+ * require consistent mode for all events.
+ *
+ * May move it to generic code with more tools
+ * have similar attribute.
+ */
+ if (perf_top_overwrite_fallback(top, counter))
+ goto try_again;
+
if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
if (verbose > 0)
ui__warning("%s\n", msg);
--
2.5.5
From: Kan Liang <[email protected]>
perf_top__mmap_read has a severe performance issue on
Knights Landing/Mill when monitoring a heavily loaded system. It can take
several minutes to finish, which is unacceptable.
Currently, perf top uses non-overwrite mode: it tries to read everything
in the ring buffer and does not pause it. When lots of samples are
delivered persistently, the processing time can become very long. Also,
the latest samples can be lost when the ring buffer is full.
Overwrite mode takes a snapshot of the system by pausing the ring
buffer, which significantly reduces the processing time. It also always
keeps the latest samples.
Considering the real-time requirement of perf top, overwrite mode is the
more suitable choice.
perf top actually used to run in overwrite mode; it was changed to
non-overwrite mode by commit 93fc64f14472 ("perf top: Switch to non
overwrite mode"). It's better to change it back to overwrite mode by
default.
For kernels which don't support overwrite mode, it will fall back to
non-overwrite mode.
Some records may be lost in overwrite mode because the ring buffer is
paused. This has little impact on the accuracy of the snapshot and can
be tolerated, so the lost-events check is removed.
For overwrite mode, unconditionally wait 100 ms before each snapshot.
This also reduces the overhead caused by pausing the ring buffer,
especially on a lightly loaded system.
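For reference, the per-refresh snapshot cycle in overwrite mode then
looks like this (a condensed restatement of the diff below, not any new
code or interface):

	/* pause the backward ring buffers, drain a snapshot, then resume */
	perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING);
	for (i = 0; i < top->evlist->nr_mmaps; i++)
		perf_top__mmap_read_idx(top, i);
	perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
	perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING);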
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/builtin-top.c | 45 ++++++++++++++++++++++++------------------
tools/perf/ui/browsers/hists.c | 12 ++++++++---
2 files changed, 35 insertions(+), 22 deletions(-)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 7c462d6..02c0c7f 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -283,16 +283,6 @@ static void perf_top__print_sym_table(struct perf_top *top)
printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
- if (hists->stats.nr_lost_warned !=
- hists->stats.nr_events[PERF_RECORD_LOST]) {
- hists->stats.nr_lost_warned =
- hists->stats.nr_events[PERF_RECORD_LOST];
- color_fprintf(stdout, PERF_COLOR_RED,
- "WARNING: LOST %d chunks, Check IO/CPU overload",
- hists->stats.nr_lost_warned);
- ++printed;
- }
-
if (top->sym_filter_entry) {
perf_top__show_details(top);
return;
@@ -807,15 +797,24 @@ static void perf_event__process_sample(struct perf_tool *tool,
static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
{
+ struct record_opts *opts = &top->record_opts;
+ struct perf_evlist *evlist = top->evlist;
struct perf_sample sample;
struct perf_evsel *evsel;
+ struct perf_mmap *md;
struct perf_session *session = top->session;
union perf_event *event;
struct machine *machine;
+ unsigned long size;
+ u64 end, start;
int ret;
- while ((event = perf_evlist__mmap_read(top->evlist, idx)) != NULL) {
- ret = perf_evlist__parse_sample(top->evlist, event, &sample);
+ md = opts->overwrite ? &evlist->overwrite_mmap[idx] : &evlist->mmap[idx];
+ if (perf_mmap__read_catchup(md, opts->overwrite, &start, &end, &size) < 1)
+ return;
+
+ while ((event = perf_mmap__read_event(md, opts->overwrite, &start, end)) != NULL) {
+ ret = perf_evlist__parse_sample(evlist, event, &sample);
if (ret) {
pr_err("Can't parse sample, err = %d\n", ret);
goto next_event;
@@ -869,16 +868,28 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
} else
++session->evlist->stats.nr_unknown_events;
next_event:
- perf_evlist__mmap_consume(top->evlist, idx);
+ perf_mmap__consume(md, opts->overwrite);
}
+
+ perf_mmap__read_done(md);
}
static void perf_top__mmap_read(struct perf_top *top)
{
+ bool overwrite = top->record_opts.overwrite;
+ struct perf_evlist *evlist = top->evlist;
int i;
+ if (overwrite)
+ perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING);
+
for (i = 0; i < top->evlist->nr_mmaps; i++)
perf_top__mmap_read_idx(top, i);
+
+ if (overwrite) {
+ perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
+ perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING);
+ }
}
/*
@@ -966,11 +977,6 @@ static int perf_top__start_counters(struct perf_top *top)
goto out_err;
}
- if (opts->overwrite) {
- ui__error("not support overwrite mode yet\n");
- goto out_err;
- }
-
evlist__for_each_entry(evlist, counter) {
try_again:
if (perf_evsel__open(counter, top->evlist->cpus,
@@ -1128,7 +1134,7 @@ static int __cmd_top(struct perf_top *top)
perf_top__mmap_read(top);
- if (hits == top->samples)
+ if (opts->overwrite || (hits == top->samples))
ret = perf_evlist__poll(top->evlist, 100);
if (resize) {
@@ -1222,6 +1228,7 @@ int cmd_top(int argc, const char **argv)
.uses_mmap = true,
},
.proc_map_timeout = 500,
+ .overwrite = 1,
},
.max_stack = sysctl_perf_event_max_stack,
.sym_pcnt_filter = 5,
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 68146f4..56023e4 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -638,8 +638,13 @@ int hist_browser__run(struct hist_browser *browser, const char *help)
nr_entries = hist_browser__nr_entries(browser);
ui_browser__update_nr_entries(&browser->b, nr_entries);
- if (browser->hists->stats.nr_lost_warned !=
- browser->hists->stats.nr_events[PERF_RECORD_LOST]) {
+ /*
+ * Don't print lost events warning for perf top,
+ * because it is overwrite mode.
+ * Perf top is the only tool which has hbt timer.
+ */
+ if ((browser->hists->stats.nr_lost_warned !=
+ browser->hists->stats.nr_events[PERF_RECORD_LOST]) && !hbt) {
browser->hists->stats.nr_lost_warned =
browser->hists->stats.nr_events[PERF_RECORD_LOST];
ui_browser__warn_lost_events(&browser->b);
@@ -3203,7 +3208,8 @@ static int perf_evsel_menu__run(struct perf_evsel_menu *menu,
case K_TIMER:
hbt->timer(hbt->arg);
- if (!menu->lost_events_warned && menu->lost_events) {
+ if (!menu->lost_events_warned &&
+ menu->lost_events && !hbt) {
ui_browser__warn_lost_events(&menu->b);
menu->lost_events_warned = true;
}
--
2.5.5
From: Kan Liang <[email protected]>
The latency of perf_top__mmap_read should be lower than the refresh
time. If it is not, give the user some hints to reduce the latency.
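For example (the option values are only illustrative), the hints
correspond to invocations such as:

	perf top -c 100000    # use a larger sample period
	perf top -F 99        # use a lower sample frequency
	perf top -C 0-3       # limit the monitored CPUs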
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/builtin-top.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 02c0c7f..b6b038c 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -878,8 +878,10 @@ static void perf_top__mmap_read(struct perf_top *top)
{
bool overwrite = top->record_opts.overwrite;
struct perf_evlist *evlist = top->evlist;
+ unsigned long long start, end;
int i;
+ start = rdclock();
if (overwrite)
perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING);
@@ -890,6 +892,13 @@ static void perf_top__mmap_read(struct perf_top *top)
perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING);
}
+ end = rdclock();
+
+ if ((end - start) > (unsigned long long)top->delay_secs * NSEC_PER_SEC)
+ ui__warning("Too slow to read ring buffer.\n"
+ "Please try increasing the period (-c) or\n"
+ "decreasing the freq (-F) or\n"
+ "limiting the number of CPUs (-C)\n");
}
/*
--
2.5.5
From: Kan Liang <[email protected]>
The per-event overwrite term is not forbidden in perf top, which can
cause problems, because perf top only supports non-overwrite mode.
Check the per-event overwrite terms in the evlist and forbid inconsistent
settings. This makes it possible to support either non-overwrite or
overwrite mode consistently.
Overwrite mode is still forbidden for now; this restriction will be
removed when overwrite mode is supported later.
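For example (illustrative command lines, assuming the existing
overwrite/no-overwrite event config terms), mixing the terms is rejected:

	perf top -e cycles/overwrite/ -e instructions/no-overwrite/

while a setting that is consistent across all events passes the check
(overwrite itself only becomes usable once the later patches enable it).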
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/builtin-top.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 60 insertions(+)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 540461f..5e15d27 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -881,6 +881,56 @@ static void perf_top__mmap_read(struct perf_top *top)
perf_top__mmap_read_idx(top, i);
}
+/*
+ * Check per event overwrite term.
+ * perf top supports consistent mode for all events.
+ * Return -1 if the per event terms set but not consistent.
+ */
+static int perf_top_overwrite_check(struct perf_top *top)
+{
+ struct record_opts *opts = &top->record_opts;
+ struct perf_evlist *evlist = top->evlist;
+ struct perf_evsel_config_term *term;
+ struct list_head *config_terms;
+ struct perf_evsel *evsel;
+ int set, overwrite = -1;
+
+ evlist__for_each_entry(evlist, evsel) {
+ set = -1;
+ config_terms = &evsel->config_terms;
+ list_for_each_entry(term, config_terms, list) {
+ if (term->type == PERF_EVSEL__CONFIG_TERM_OVERWRITE)
+ set = term->val.overwrite ? 1 : 0;
+ }
+
+ /* no term for current and previous event (likely) */
+ if ((overwrite < 0) && (set < 0))
+ continue;
+
+ /* has term for both current and previous event, compare */
+ if ((overwrite >= 0) && (set >= 0) && (overwrite != set))
+ return -1;
+
+ /* no term for current event but has term for previous one */
+ if ((overwrite >= 0) && (set < 0))
+ return -1;
+
+ /* has term for current event */
+ if ((overwrite < 0) && (set >= 0)) {
+ /* if it's first event, set overwrite */
+ if (evsel == perf_evlist__first(evlist))
+ overwrite = set;
+ else
+ return -1;
+ }
+ }
+
+ if ((overwrite >= 0) && (opts->overwrite != overwrite))
+ opts->overwrite = overwrite;
+
+ return 0;
+}
+
static int perf_top__start_counters(struct perf_top *top)
{
char msg[BUFSIZ];
@@ -890,6 +940,16 @@ static int perf_top__start_counters(struct perf_top *top)
perf_evlist__config(evlist, opts, &callchain_param);
+ if (perf_top_overwrite_check(top)) {
+ ui__error("perf top support consistent mode for all events\n");
+ goto out_err;
+ }
+
+ if (opts->overwrite) {
+ ui__error("not support overwrite mode yet\n");
+ goto out_err;
+ }
+
evlist__for_each_entry(evlist, counter) {
try_again:
if (perf_evsel__open(counter, top->evlist->cpus,
--
2.5.5
From: Kan Liang <[email protected]>
Currently, there is no generic function to read events from the ring
buffer that supports both overwrite and non-overwrite mode.
Introduce perf_mmap__read_event to do so.
The usage is as below:
perf_mmap__read_catchup()
while(event = perf_mmap__read_event()) {
//process the event
perf_mmap__consume()
}
perf_mmap__read_done()
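For example (a fragment mirroring how builtin-top.c uses it later in this
series; 'evlist', 'opts', 'idx', 'start' and 'end' are set up by the
caller):

	md = opts->overwrite ? &evlist->overwrite_mmap[idx] : &evlist->mmap[idx];
	while ((event = perf_mmap__read_event(md, opts->overwrite, &start, end)) != NULL) {
		/* parse and deliver the sample */
		perf_mmap__consume(md, opts->overwrite);
	}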
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/util/mmap.c | 25 +++++++++++++++++++++++++
tools/perf/util/mmap.h | 3 +++
2 files changed, 28 insertions(+)
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 61237eb..7c4c69a 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -146,6 +146,31 @@ void perf_mmap__read_done(struct perf_mmap *map)
map->prev = perf_mmap__read_head(map);
}
+
+/*
+ * Read event from ring buffer. Return one event for each call.
+ * Support both overwirte and non-overwrite mode.
+ * The start and end are only available for overwirte mode, which
+ * pause the ringbuffer.
+ *
+ * Usage:
+ * perf_mmap__read_catchup
+ * while(event = perf_mmap__read_event) {
+ * //process the event
+ * perf_mmap__consume
+ * }
+ * perf_mmap__read_done
+ */
+union perf_event *perf_mmap__read_event(struct perf_mmap *map,
+ bool overwrite,
+ u64 *start, u64 end)
+{
+ if (overwrite)
+ return perf_mmap__read_backward(map, start, end);
+ else
+ return perf_mmap__read_forward(map);
+}
+
static bool perf_mmap__empty(struct perf_mmap *map)
{
return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base;
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index a91222e..7082a7c 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -91,6 +91,9 @@ static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
union perf_event *perf_mmap__read_backward(struct perf_mmap *map,
u64 *start, u64 end);
+union perf_event *perf_mmap__read_event(struct perf_mmap *map,
+ bool overwrite,
+ u64 *start, u64 end);
int perf_mmap__push(struct perf_mmap *md, bool backward,
void *to, int push(void *to, void *buf, size_t size));
--
2.5.5
From: Kan Liang <[email protected]>
perf_mmap__read_catchup and perf_mmap__read_backward are used to read
events one by one from the ring buffer in overwrite mode.
They always read the stale buffer which has already been processed,
because the previous location of the processed ring buffer is discarded.
Introduce perf_mmap__read_done, which updates map->prev to indicate the
position of the processed buffer.
Refine perf_mmap__read_catchup to calculate the start and the end of the
ring buffer. The calculation only needs to be done once at the beginning,
because the ring buffer is paused in overwrite mode, so
perf_mmap__read_backward no longer needs to do it.
For now, the only user is the backward-ring-buffer perf test.
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/tests/backward-ring-buffer.c | 9 +++-
tools/perf/util/mmap.c | 88 +++++++++++++++++++--------------
tools/perf/util/mmap.h | 7 ++-
3 files changed, 63 insertions(+), 41 deletions(-)
diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
index 4035d43..66d9e54 100644
--- a/tools/perf/tests/backward-ring-buffer.c
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -31,10 +31,14 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
int i;
for (i = 0; i < evlist->nr_mmaps; i++) {
+ struct perf_mmap *map = &evlist->overwrite_mmap[i];
union perf_event *event;
+ unsigned long size;
+ u64 start, end;
- perf_mmap__read_catchup(&evlist->overwrite_mmap[i]);
- while ((event = perf_mmap__read_backward(&evlist->overwrite_mmap[i])) != NULL) {
+ perf_mmap__read_catchup(map, true, &start, &end, &size);
+ while ((event = perf_mmap__read_backward(map, &start, end)) !=
+ NULL) {
const u32 type = event->header.type;
switch (type) {
@@ -49,6 +53,7 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
return TEST_FAIL;
}
}
+ perf_mmap__read_done(map);
}
return TEST_OK;
}
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 05076e6..bf67460 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -85,51 +85,65 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
return perf_mmap__read(map, old, head, &map->prev);
}
-union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
+union perf_event *perf_mmap__read_backward(struct perf_mmap *map,
+ u64 *start, u64 end)
{
- u64 head, end;
- u64 start = map->prev;
-
- /*
- * Check if event was unmapped due to a POLLHUP/POLLERR.
- */
- if (!refcount_read(&map->refcnt))
- return NULL;
-
- head = perf_mmap__read_head(map);
- if (!head)
- return NULL;
+ union perf_event *event = NULL;
- /*
- * 'head' pointer starts from 0. Kernel minus sizeof(record) form
- * it each time when kernel writes to it, so in fact 'head' is
- * negative. 'end' pointer is made manually by adding the size of
- * the ring buffer to 'head' pointer, means the validate data can
- * read is the whole ring buffer. If 'end' is positive, the ring
- * buffer has not fully filled, so we must adjust 'end' to 0.
- *
- * However, since both 'head' and 'end' is unsigned, we can't
- * simply compare 'end' against 0. Here we compare '-head' and
- * the size of the ring buffer, where -head is the number of bytes
- * kernel write to the ring buffer.
- */
- if (-head < (u64)(map->mask + 1))
- end = 0;
- else
- end = head + map->mask + 1;
+ event = perf_mmap__read(map, *start, end, &map->prev);
+ *start = map->prev;
- return perf_mmap__read(map, start, end, &map->prev);
+ return event;
}
-void perf_mmap__read_catchup(struct perf_mmap *map)
+static int overwrite_rb_find_range(void *buf, int mask, u64 head,
+ u64 *start, u64 *end);
+
+int perf_mmap__read_catchup(struct perf_mmap *map,
+ bool overwrite,
+ u64 *start, u64 *end,
+ unsigned long *size)
{
- u64 head;
+ u64 head = perf_mmap__read_head(map);
+ u64 old = map->prev;
+ unsigned char *data = map->base + page_size;
- if (!refcount_read(&map->refcnt))
- return;
+ *start = overwrite ? head : old;
+ *end = overwrite ? old : head;
- head = perf_mmap__read_head(map);
- map->prev = head;
+ if (*start == *end)
+ return 0;
+
+ *size = *end - *start;
+ if (*size > (unsigned long)(map->mask) + 1) {
+ if (!overwrite) {
+ WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
+
+ map->prev = head;
+ perf_mmap__consume(map, overwrite);
+ return 0;
+ }
+
+ /*
+ * Backward ring buffer is full. We still have a chance to read
+ * most of data from it.
+ */
+ if (overwrite_rb_find_range(data, map->mask, head, start, end))
+ return -1;
+ }
+
+ return 1;
+}
+
+/*
+ * Mandatory for overwrite mode
+ * The direction of overwrite mode is backward.
+ * The last mmap__read_event will set tail to map->prev.
+ * Need to correct the map->prev to head which is the end of next read.
+ */
+void perf_mmap__read_done(struct perf_mmap *map)
+{
+ map->prev = perf_mmap__read_head(map);
}
static bool perf_mmap__empty(struct perf_mmap *map)
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index d640273..a91222e 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -65,7 +65,9 @@ void perf_mmap__put(struct perf_mmap *map);
void perf_mmap__consume(struct perf_mmap *map, bool overwrite);
-void perf_mmap__read_catchup(struct perf_mmap *md);
+int perf_mmap__read_catchup(struct perf_mmap *map, bool overwrite,
+ u64 *start, u64 *end, unsigned long *size);
+void perf_mmap__read_done(struct perf_mmap *map);
static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
{
@@ -87,7 +89,8 @@ static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
}
union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
-union perf_event *perf_mmap__read_backward(struct perf_mmap *map);
+union perf_event *perf_mmap__read_backward(struct perf_mmap *map,
+ u64 *start, u64 end);
int perf_mmap__push(struct perf_mmap *md, bool backward,
void *to, int push(void *to, void *buf, size_t size));
--
2.5.5
From: Kan Liang <[email protected]>
perf_mmap__push uses the same code as perf_mmap__read_catchup to
calculate the ring buffer start, end and size, so reuse
perf_mmap__read_catchup there.
No functional change.
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/util/mmap.c | 31 ++++++-------------------------
1 file changed, 6 insertions(+), 25 deletions(-)
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index bf67460..61237eb 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -285,36 +285,16 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite,
void *to, int push(void *to, void *buf, size_t size))
{
u64 head = perf_mmap__read_head(md);
- u64 old = md->prev;
- u64 end = head, start = old;
+ u64 end, start;
unsigned char *data = md->base + page_size;
unsigned long size;
void *buf;
- int rc = 0;
+ int rc;
- start = overwrite ? head : old;
- end = overwrite ? old : head;
- if (start == end)
- return 0;
-
- size = end - start;
- if (size > (unsigned long)(md->mask) + 1) {
- if (!overwrite) {
- WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
-
- md->prev = head;
- perf_mmap__consume(md, overwrite);
- return 0;
- }
-
- /*
- * Backward ring buffer is full. We still have a chance to read
- * most of data from it.
- */
- if (overwrite_rb_find_range(data, md->mask, head, &start, &end))
- return -1;
- }
+ rc = perf_mmap__read_catchup(md, overwrite, &start, &end, &size);
+ if (rc < 1)
+ return rc;
if ((start & md->mask) + size != (end & md->mask)) {
buf = &data[start & md->mask];
@@ -338,6 +318,7 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite,
md->prev = head;
perf_mmap__consume(md, overwrite);
+ rc = 0;
out:
return rc;
}
--
2.5.5
On 2017/12/7 7:32, [email protected] wrote:
> From: Kan Liang <[email protected]>
>
> perf_evlist__mmap_read_catchup and perf_evlist__mmap_read_backward are
> only for overwrite mode.
> But they read the evlist->mmap buffer which is for non-overwrite mode.
>
> It did not bring any serious problem yet, because there is no one use
> it.
>
> Remove the unused interfaces.
>
> Signed-off-by: Kan Liang <[email protected]>
Acked-by: Wang Nan <[email protected]>
> ---
> tools/perf/util/evlist.c | 17 -----------------
> tools/perf/util/evlist.h | 4 ----
> 2 files changed, 21 deletions(-)
>
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index 3570355..4b6a06d 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -714,28 +714,11 @@ union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int
> return perf_mmap__read_forward(md);
> }
>
> -union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx)
> -{
> - struct perf_mmap *md = &evlist->mmap[idx];
> -
> - /*
> - * No need to check messup for backward ring buffer:
> - * We can always read arbitrary long data from a backward
> - * ring buffer unless we forget to pause it before reading.
> - */
> - return perf_mmap__read_backward(md);
> -}
> -
> union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
> {
> return perf_evlist__mmap_read_forward(evlist, idx);
> }
>
> -void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx)
> -{
> - perf_mmap__read_catchup(&evlist->mmap[idx]);
> -}
> -
> void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
> {
> perf_mmap__consume(&evlist->mmap[idx], false);
> diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
> index 7516066..a80fd47 100644
> --- a/tools/perf/util/evlist.h
> +++ b/tools/perf/util/evlist.h
> @@ -132,10 +132,6 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx);
>
> union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist,
> int idx);
> -union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist,
> - int idx);
> -void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
> -
> void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx);
>
> int perf_evlist__open(struct perf_evlist *evlist);
On 2017/12/7 7:32, [email protected] wrote:
> From: Kan Liang <[email protected]>
>
> perf_mmap__read_catchup and perf_mmap__read_backward are used to read
> events one by one from ring buffer under overwrite mode.
> It always read the stale buffer which is already processed.
> Because the previous location of processed ring buffer is discarded.
>
> Introducing the perf_mmap__read_done, which update the map->prev to
> indicate the position of processed buffer.
>
> Refining perf_mmap__read_catchup to calculate the start and the end of
> ring buffer. It only need to do the calculation once at the beginning,
> because the ring buffer is paused in overwrite mode.
> Doesn't need to do the calculation in perf_mmap__read_backward.
>
> For now, the only user is backward-ring-buffer in perf test.
>
> Signed-off-by: Kan Liang <[email protected]>
> ---
> tools/perf/tests/backward-ring-buffer.c | 9 +++-
> tools/perf/util/mmap.c | 88 +++++++++++++++++++--------------
> tools/perf/util/mmap.h | 7 ++-
> 3 files changed, 63 insertions(+), 41 deletions(-)
>
> diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
> index 4035d43..66d9e54 100644
> --- a/tools/perf/tests/backward-ring-buffer.c
> +++ b/tools/perf/tests/backward-ring-buffer.c
> @@ -31,10 +31,14 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
> int i;
>
> for (i = 0; i < evlist->nr_mmaps; i++) {
> + struct perf_mmap *map = &evlist->overwrite_mmap[i];
> union perf_event *event;
> + unsigned long size;
> + u64 start, end;
>
> - perf_mmap__read_catchup(&evlist->overwrite_mmap[i]);
> - while ((event = perf_mmap__read_backward(&evlist->overwrite_mmap[i])) != NULL) {
> + perf_mmap__read_catchup(map, true, &start, &end, &size);
> + while ((event = perf_mmap__read_backward(map, &start, end)) !=
> + NULL) {
> const u32 type = event->header.type;
>
> switch (type) {
> @@ -49,6 +53,7 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
> return TEST_FAIL;
> }
> }
> + perf_mmap__read_done(map);
> }
> return TEST_OK;
> }
> diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
> index 05076e6..bf67460 100644
> --- a/tools/perf/util/mmap.c
> +++ b/tools/perf/util/mmap.c
> @@ -85,51 +85,65 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
> return perf_mmap__read(map, old, head, &map->prev);
> }
>
> -union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
> +union perf_event *perf_mmap__read_backward(struct perf_mmap *map,
> + u64 *start, u64 end)
> {
> - u64 head, end;
> - u64 start = map->prev;
> -
> - /*
> - * Check if event was unmapped due to a POLLHUP/POLLERR.
> - */
> - if (!refcount_read(&map->refcnt))
> - return NULL;
> -
Is removing this check safe?
> - head = perf_mmap__read_head(map);
> - if (!head)
> - return NULL;
> + union perf_event *event = NULL;
>
> - /*
> - * 'head' pointer starts from 0. Kernel minus sizeof(record) form
> - * it each time when kernel writes to it, so in fact 'head' is
> - * negative. 'end' pointer is made manually by adding the size of
> - * the ring buffer to 'head' pointer, means the validate data can
> - * read is the whole ring buffer. If 'end' is positive, the ring
> - * buffer has not fully filled, so we must adjust 'end' to 0.
> - *
> - * However, since both 'head' and 'end' is unsigned, we can't
> - * simply compare 'end' against 0. Here we compare '-head' and
> - * the size of the ring buffer, where -head is the number of bytes
> - * kernel write to the ring buffer.
> - */
> - if (-head < (u64)(map->mask + 1))
> - end = 0;
> - else
> - end = head + map->mask + 1;
> + event = perf_mmap__read(map, *start, end, &map->prev);
> + *start = map->prev;
>
> - return perf_mmap__read(map, start, end, &map->prev);
> + return event;
> }
perf_mmap__read_backward() and perf_mmap__read_forward() become
asymmetrical, but their names are symmetrical.
>
> -void perf_mmap__read_catchup(struct perf_mmap *map)
> +static int overwrite_rb_find_range(void *buf, int mask, u64 head,
> + u64 *start, u64 *end);
> +
> +int perf_mmap__read_catchup(struct perf_mmap *map,
> + bool overwrite,
> + u64 *start, u64 *end,
> + unsigned long *size)
> {
> - u64 head;
> + u64 head = perf_mmap__read_head(map);
> + u64 old = map->prev;
> + unsigned char *data = map->base + page_size;
>
> - if (!refcount_read(&map->refcnt))
> - return;
> + *start = overwrite ? head : old;
> + *end = overwrite ? old : head;
>
> - head = perf_mmap__read_head(map);
> - map->prev = head;
> + if (*start == *end)
> + return 0;
> +
> + *size = *end - *start;
> + if (*size > (unsigned long)(map->mask) + 1) {
> + if (!overwrite) {
> + WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
> +
> + map->prev = head;
> + perf_mmap__consume(map, overwrite);
> + return 0;
> + }
> +
> + /*
> + * Backward ring buffer is full. We still have a chance to read
> + * most of data from it.
> + */
> + if (overwrite_rb_find_range(data, map->mask, head, start, end))
> + return -1;
> + }
> +
> + return 1;
Coding suggestion: this function returns two different values (1 and -1)
on two fail paths. It should return 0 for success and -1 for failure.
You totally redefine perf_mmap__read_catchup(). The original purpose of
this function is to adjust md->prev so that a following perf_mmap__read()
can read events one by one safely. Only the backward ring buffer needs to
catch up: the kernel's 'head' pointer keeps moving (backward), while
md->prev stays unchanged. For the forward ring buffer, md->prev catches
up each time the ring buffer is read.
Your patch totally changes its meaning. Now it finds the available data
in a ring buffer (reporting start and end). At least we need a better
name. In addition, if I understand your code correctly, we don't need to
report 'size' to the caller, because the size is determined by start and
end.
In addition, since the concepts of backward and overwrite are now
unified, we can avoid updating map->prev during one-by-one reading
(because backward reading doesn't need a consume operation). I think we
can decouple the updating of map->prev from these two basic read
functions. This makes the operations on map->prev clearer. Right now the
moving direction of map->prev is confusing for the backward ring buffer:
it moves forward during one-by-one reading, and moves backward during
block reading (in perf record) and when syncing.
Then the core of the two reading paths becomes uniform. Let's get rid of
the duplication.
To summarize:
1. Compute both start and end before reading, for both forward and
backward reading (perf_mmap__read_catchup, but it needs a better name).
2. Don't update map->prev in perf_mmap__read(). Let perf_mmap__read()
update the start pointer, so it becomes stateless and suits both backward
and forward reading.
3. Update md->prev for forward reading. Merge that into
perf_mmap__consume?
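A rough sketch, for illustration only, of how those three points could
fit together (the _init/_done names and exact signatures here are
placeholders, not a final proposal):

	u64 start, end;
	union perf_event *event;

	perf_mmap__read_init(map, overwrite, &start, &end);  /* compute start/end once */
	while ((event = perf_mmap__read(map, &start, end)) != NULL) {
		/* process the event; map->prev is not touched here */
		perf_mmap__consume(map, overwrite);  /* forward: advance map->prev here */
	}
	perf_mmap__read_done(map);  /* backward/overwrite: sync map->prev */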
Thank you.
On 2017/12/7 7:33, [email protected] wrote:
> From: Kan Liang <[email protected]>
>
> Switch to non-overwrite mode if kernel doesnot support overwrite
> ringbuffer.
>
> It's only effect when overwrite mode is supported.
> No change to current behavior.
>
> Signed-off-by: Kan Liang <[email protected]>
> ---
> tools/perf/builtin-top.c | 35 +++++++++++++++++++++++++++++++++++
> 1 file changed, 35 insertions(+)
>
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 5e15d27..7c462d6 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -931,6 +931,27 @@ static int perf_top_overwrite_check(struct perf_top *top)
> return 0;
> }
>
> +static int perf_top_overwrite_fallback(struct perf_top *top,
> + struct perf_evsel *evsel)
> +{
> + struct record_opts *opts = &top->record_opts;
> + struct perf_evlist *evlist = top->evlist;
> + struct perf_evsel *counter;
> +
> + if (!opts->overwrite)
> + return 0;
> +
> + /* only fall back when first event fails */
> + if (evsel != perf_evlist__first(evlist))
> + return 0;
> +
> + evlist__for_each_entry(evlist, counter)
> + counter->attr.write_backward = false;
> + opts->overwrite = false;
> + ui__warning("fall back to non-overwrite mode\n");
> + return 1;
You can check perf_missing_features.write_backward here. Only do the
fallback when we know the problem is caused by the missing
write_backward support.
> +}
> +
> static int perf_top__start_counters(struct perf_top *top)
> {
> char msg[BUFSIZ];
> @@ -954,6 +975,20 @@ static int perf_top__start_counters(struct perf_top *top)
> try_again:
> if (perf_evsel__open(counter, top->evlist->cpus,
> top->evlist->threads) < 0) {
> +
> + /*
> + * Specially handle overwrite fall back.
> + * Because perf top is the only tool which has
> + * overwrite mode by default, support
> + * both overwrite and non-overwrite mode, and
> + * require consistent mode for all events.
> + *
> + * May move it to generic code with more tools
> + * have similar attribute.
> + */
> + if (perf_top_overwrite_fallback(top, counter))
> + goto try_again;
> +
It will unconditionally remove the backward bit even if the problem is
caused by some other missing feature.
Thank you.
> > -union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
> > +union perf_event *perf_mmap__read_backward(struct perf_mmap *map,
> > + u64 *start, u64 end)
> > {
> > - u64 head, end;
> > - u64 start = map->prev;
> > -
> > - /*
> > - * Check if event was unmapped due to a POLLHUP/POLLERR.
> > - */
> > - if (!refcount_read(&map->refcnt))
> > - return NULL;
> > -
>
> Is removing this checking safe?
It was a duplicate of the one in perf_mmap__read_catchup.
I originally planned to remove only one, but it looks like I removed both
accidentally. I will keep one in V3.
>
> > - head = perf_mmap__read_head(map);
> > - if (!head)
> > - return NULL;
> > + union perf_event *event = NULL;
> >
> > - /*
> > - * 'head' pointer starts from 0. Kernel minus sizeof(record) form
> > - * it each time when kernel writes to it, so in fact 'head' is
> > - * negative. 'end' pointer is made manually by adding the size of
> > - * the ring buffer to 'head' pointer, means the validate data can
> > - * read is the whole ring buffer. If 'end' is positive, the ring
> > - * buffer has not fully filled, so we must adjust 'end' to 0.
> > - *
> > - * However, since both 'head' and 'end' is unsigned, we can't
> > - * simply compare 'end' against 0. Here we compare '-head' and
> > - * the size of the ring buffer, where -head is the number of bytes
> > - * kernel write to the ring buffer.
> > - */
> > - if (-head < (u64)(map->mask + 1))
> > - end = 0;
> > - else
> > - end = head + map->mask + 1;
> > + event = perf_mmap__read(map, *start, end, &map->prev);
> > + *start = map->prev;
> >
> > - return perf_mmap__read(map, start, end, &map->prev);
> > + return event;
> > }
>
> perf_mmap__read_backward() and perf_mmap__read_forward() become
> asymmetrical,
> but their names are symmetrical.
>
>
> >
> > -void perf_mmap__read_catchup(struct perf_mmap *map)
> > +static int overwrite_rb_find_range(void *buf, int mask, u64 head,
> > + u64 *start, u64 *end);
> > +
> > +int perf_mmap__read_catchup(struct perf_mmap *map,
> > + bool overwrite,
> > + u64 *start, u64 *end,
> > + unsigned long *size)
> > {
> > - u64 head;
> > + u64 head = perf_mmap__read_head(map);
> > + u64 old = map->prev;
> > + unsigned char *data = map->base + page_size;
> >
> > - if (!refcount_read(&map->refcnt))
> > - return;
> > + *start = overwrite ? head : old;
> > + *end = overwrite ? old : head;
> >
> > - head = perf_mmap__read_head(map);
> > - map->prev = head;
> > + if (*start == *end)
> > + return 0;
> > +
> > + *size = *end - *start;
> > + if (*size > (unsigned long)(map->mask) + 1) {
> > + if (!overwrite) {
> > + WARN_ONCE(1, "failed to keep up with mmap data.
> (warn only once)\n");
> > +
> > + map->prev = head;
> > + perf_mmap__consume(map, overwrite);
> > + return 0;
> > + }
> > +
> > + /*
> > + * Backward ring buffer is full. We still have a chance to read
> > + * most of data from it.
> > + */
> > + if (overwrite_rb_find_range(data, map->mask, head, start,
> end))
> > + return -1;
> > + }
> > +
> > + return 1;
>
> Coding suggestion: this function returns 2 different value (1 and -1) in
> two fail path. Should return 0 for success and -1 for failure.
>
For perf top, -1 is enough for failure.
But for perf_mmap__push, it looks like two fail paths are needed, aren't they?
> You totally redefine perf_mmap__read_catchup(). The original meaning of
> this function is adjust md->prev so following perf_mmap__read() can read
> events one by one safely. Only backward ring buffer needs catchup:
> kernel's 'head' pointer keeps moving (backward), while md->prev keeps
> unchanged. For forward ring buffer, md->prev will catchup each time when
> reading from the ring buffer.
>
> Your patch totally changes its meaning. Now its meaning is finding the
> available data from a ring buffer (report start and end). At least we
> need a better name.
Sure, I will introduce a new function to do it in V3.
> In addition, if I understand your code correctly, we
> don't need to report 'size' to caller, because size is determined by
> start and end.
Sure, I will remove the 'size' in V3.
>
> In addition, since the concept of backward and overwrite are now
> unified, we can avoid updating map->prev during one-by-one reading
> (because backward reading don't need consume operation). I think we can
> decouple the updating of map->prev and these two basic read functions.
> This can makes the operations on map->prev clearer. Now the moving
> direction of map->prev is confusing for backward ring buffer: it moves
> forward during one by one reading, and moves backward when block
> reading
> (in perf record)
For reading, I think both one-by-one reading and block reading move
forward for overwrite mode. It starts from head and ends at map->prev,
no?
For one-by-one reading, yes, it doesn't need to update map->prev for each
read. But both need to update map->prev when the block is finished.
> and when syncing.
>
> Then the core of two reading become uniform. Let's get rid of
> duplication.
>
> Summarize:
>
> 1. Compute both start and end before reading for both forward and
> backward (perf_mmap__read_catchup, but need a better name).
How about perf_mmap__read_init?
>
> 2. Don't update map->prev in perf_mmap__read. Let perf_mmap__read()
> update start pointer, so it become stateless and suit for both backward
> and forward reading.
>
> 3. Update md->prev for forward reading. Merge it into
> perf_mmap__consume?
It looks like we would have to pass the updated 'start' to
perf_mmap__consume. More interfaces like perf_evlist__mmap_read would
need to be changed, which would impact other tools which only support
non-overwrite mode.
How about updating both 'md->prev' and 'start' in perf_mmap__read(),
like the patch below?
It introduces a new perf_mmap__read_event to get rid of
perf_mmap__read_backward/forward.
perf_mmap__read_backward will be removed later.
perf_mmap__read_forward is kept since other tools still use it.
The 'end' has to be updated for non-overwrite mode on each read, since
the ring buffer is not paused.
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 484a394..74f33fd 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -22,16 +22,16 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map)
/* When check_messup is true, 'end' must points to a good entry */
static union perf_event *perf_mmap__read(struct perf_mmap *map,
- u64 start, u64 end, u64 *prev)
+ u64 *start, u64 end, u64 *prev)
{
unsigned char *data = map->base + page_size;
union perf_event *event = NULL;
- int diff = end - start;
+ int diff = end - *start;
if (diff >= (int)sizeof(event->header)) {
size_t size;
- event = (union perf_event *)&data[start & map->mask];
+ event = (union perf_event *)&data[*start & map->mask];
size = event->header.size;
if (size < sizeof(event->header) || diff < (int)size) {
@@ -43,8 +43,8 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map,
* Event straddles the mmap boundary -- header should always
* be inside due to u64 alignment of output.
*/
- if ((start & map->mask) + size != ((start + size) & map->mask)) {
- unsigned int offset = start;
+ if ((*start & map->mask) + size != ((*start + size) & map->mask)) {
+ unsigned int offset = *start;
unsigned int len = min(sizeof(*event), size), cpy;
void *dst = map->event_copy;
@@ -59,12 +59,12 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map,
event = (union perf_event *)map->event_copy;
}
- start += size;
+ *start += size;
}
broken_event:
if (prev)
- *prev = start;
+ *prev = *start;
return event;
}
@@ -132,6 +107,42 @@ void perf_mmap__read_catchup(struct perf_mmap *map)
map->prev = head;
}
+
+/*
+ * Read event from ring buffer. Return one event for each call.
+ * Support both overwrite and non-overwrite mode.
+ * The start and end are only available for overwrite mode, which
+ * pauses the ring buffer.
+ *
+ * Usage:
+ * perf_mmap__read_init
+ * while(event = perf_mmap__read_event) {
+ * //process the event
+ * perf_mmap__consume
+ * }
+ * perf_mmap__read_done
+ */
+union perf_event *perf_mmap__read_event(struct perf_mmap *map,
+ bool overwrite,
+ u64 *start, u64 end)
+{
+ /*
+ * Check if event was unmapped due to a POLLHUP/POLLERR.
+ */
+ if (!refcount_read(&map->refcnt))
+ return NULL;
+
+ if (!overwrite)
+ end = perf_mmap__read_head(map);
+
+ return perf_mmap__read(map, start, end, &map->prev);
+}
+
static bool perf_mmap__empty(struct perf_mmap *map)
{
return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base;
Thanks,
Kan
>
> On 2017/12/7 7:33, [email protected] wrote:
> > From: Kan Liang <[email protected]>
> >
> > Switch to non-overwrite mode if kernel doesnot support overwrite
> > ringbuffer.
> >
> > It's only effect when overwrite mode is supported.
> > No change to current behavior.
> >
> > Signed-off-by: Kan Liang <[email protected]>
> > ---
> > tools/perf/builtin-top.c | 35 +++++++++++++++++++++++++++++++++++
> > 1 file changed, 35 insertions(+)
> >
> > diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> > index 5e15d27..7c462d6 100644
> > --- a/tools/perf/builtin-top.c
> > +++ b/tools/perf/builtin-top.c
> > @@ -931,6 +931,27 @@ static int perf_top_overwrite_check(struct
> perf_top *top)
> > return 0;
> > }
> >
> > +static int perf_top_overwrite_fallback(struct perf_top *top,
> > + struct perf_evsel *evsel)
> > +{
> > + struct record_opts *opts = &top->record_opts;
> > + struct perf_evlist *evlist = top->evlist;
> > + struct perf_evsel *counter;
> > +
> > + if (!opts->overwrite)
> > + return 0;
> > +
> > + /* only fall back when first event fails */
> > + if (evsel != perf_evlist__first(evlist))
> > + return 0;
> > +
> > + evlist__for_each_entry(evlist, counter)
> > + counter->attr.write_backward = false;
> > + opts->overwrite = false;
> > + ui__warning("fall back to non-overwrite mode\n");
> > + return 1;
>
> You can check perf_missing_features.write_backward here. Only do the
> fallback
> when we know the problem is caused by missing of write_backward.
>
perf_missing_features is only defined in evsel.c.
I will add an inline function in evsel.c to do the check.
Thanks,
Kan
> > +}
> > +
> > static int perf_top__start_counters(struct perf_top *top)
> > {
> > char msg[BUFSIZ];
> > @@ -954,6 +975,20 @@ static int perf_top__start_counters(struct
> perf_top *top)
> > try_again:
> > if (perf_evsel__open(counter, top->evlist->cpus,
> > top->evlist->threads) < 0) {
> > +
> > + /*
> > + * Specially handle overwrite fall back.
> > + * Because perf top is the only tool which has
> > + * overwrite mode by default, support
> > + * both overwrite and non-overwrite mode, and
> > + * require consistent mode for all events.
> > + *
> > + * May move it to generic code with more tools
> > + * have similar attribute.
> > + */
> > + if (perf_top_overwrite_fallback(top, counter))
> > + goto try_again;
> > +
>
> It will unconditionally remove backward bit even if the problem
> is caused by other missing feature.
>
> Thank you.
On 2017/12/19 3:07, Liang, Kan wrote:
>>> -union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
>>> +union perf_event *perf_mmap__read_backward(struct perf_mmap *map,
>>> + u64 *start, u64 end)
>>> {
>>> - u64 head, end;
>>> - u64 start = map->prev;
>>> -
>>> - /*
>>> - * Check if event was unmapped due to a POLLHUP/POLLERR.
>>> - */
>>> - if (!refcount_read(&map->refcnt))
>>> - return NULL;
>>> -
>> Is removing this checking safe?
> It was duplicate as the one in perf_mmap__read_catchup.
> Once planned to remove one. But it looks I removed both accidently.
> Will keep one in V3.
>
>>> - head = perf_mmap__read_head(map);
>>> - if (!head)
>>> - return NULL;
>>> + union perf_event *event = NULL;
>>>
>>> - /*
>>> - * 'head' pointer starts from 0. Kernel minus sizeof(record) form
>>> - * it each time when kernel writes to it, so in fact 'head' is
>>> - * negative. 'end' pointer is made manually by adding the size of
>>> - * the ring buffer to 'head' pointer, means the validate data can
>>> - * read is the whole ring buffer. If 'end' is positive, the ring
>>> - * buffer has not fully filled, so we must adjust 'end' to 0.
>>> - *
>>> - * However, since both 'head' and 'end' is unsigned, we can't
>>> - * simply compare 'end' against 0. Here we compare '-head' and
>>> - * the size of the ring buffer, where -head is the number of bytes
>>> - * kernel write to the ring buffer.
>>> - */
>>> - if (-head < (u64)(map->mask + 1))
>>> - end = 0;
>>> - else
>>> - end = head + map->mask + 1;
>>> + event = perf_mmap__read(map, *start, end, &map->prev);
>>> + *start = map->prev;
>>>
>>> - return perf_mmap__read(map, start, end, &map->prev);
>>> + return event;
>>> }
>> perf_mmap__read_backward() and perf_mmap__read_forward() become
>> asymmetrical,
>> but their names are symmetrical.
>>
>>
>>> -void perf_mmap__read_catchup(struct perf_mmap *map)
>>> +static int overwrite_rb_find_range(void *buf, int mask, u64 head,
>>> + u64 *start, u64 *end);
>>> +
>>> +int perf_mmap__read_catchup(struct perf_mmap *map,
>>> + bool overwrite,
>>> + u64 *start, u64 *end,
>>> + unsigned long *size)
>>> {
>>> - u64 head;
>>> + u64 head = perf_mmap__read_head(map);
>>> + u64 old = map->prev;
>>> + unsigned char *data = map->base + page_size;
>>>
>>> - if (!refcount_read(&map->refcnt))
>>> - return;
>>> + *start = overwrite ? head : old;
>>> + *end = overwrite ? old : head;
>>>
>>> - head = perf_mmap__read_head(map);
>>> - map->prev = head;
>>> + if (*start == *end)
>>> + return 0;
>>> +
>>> + *size = *end - *start;
>>> + if (*size > (unsigned long)(map->mask) + 1) {
>>> + if (!overwrite) {
>>> + WARN_ONCE(1, "failed to keep up with mmap data.
>> (warn only once)\n");
>>> +
>>> + map->prev = head;
>>> + perf_mmap__consume(map, overwrite);
>>> + return 0;
>>> + }
>>> +
>>> + /*
>>> + * Backward ring buffer is full. We still have a chance to read
>>> + * most of data from it.
>>> + */
>>> + if (overwrite_rb_find_range(data, map->mask, head, start,
>> end))
>>> + return -1;
>>> + }
>>> +
>>> + return 1;
>> Coding suggestion: this function returns 2 different value (1 and -1) in
>> two fail path. Should return 0 for success and -1 for failure.
>>
> For perf top, -1 is enough for failure.
> But for perf_mmap__push, it looks two fail paths are needed, aren't they?
>
I see. I think this is not a good practice. Please try to avoid returning
three states; for example, by comparing start and end outside? If that
can't be avoided, how about returning an enum to tell the reader about
the difference?
>> You totally redefine perf_mmap__read_catchup(). The original meaning of
>> this function is adjust md->prev so following perf_mmap__read() can read
>> events one by one safely. Only backward ring buffer needs catchup:
>> kernel's 'head' pointer keeps moving (backward), while md->prev keeps
>> unchanged. For forward ring buffer, md->prev will catchup each time when
>> reading from the ring buffer.
>>
>> Your patch totally changes its meaning. Now its meaning is finding the
>> available data from a ring buffer (report start and end). At least we
>> need a better name.
> Sure, I will introduce a new function to do it in V3.
>
>> In addition, if I understand your code correctly, we
>> don't need to report 'size' to caller, because size is determined by
>> start and end.
> Sure, I will remove the 'size' in V3.
>
>> In addition, since the concept of backward and overwrite are now
>> unified, we can avoid updating map->prev during one-by-one reading
>> (because backward reading don't need consume operation). I think we can
>> decouple the updating of map->prev and these two basic read functions.
>> This can makes the operations on map->prev clearer. Now the moving
>> direction of map->prev is confusing for backward ring buffer: it moves
>> forward during one by one reading, and moves backward when block
>> reading
>> (in perf record)
> For reading, I think both one by one reading and block reading move forward
> for overwrite.
> It starts from head and ends in map->prev. No?
>
> For one-by-one reading, yes, it doesn’t need to update map->prev for each read.
> But both need to update the map->prev when the block is finished.
It depends on how you think about map->prev.
map->prev is the last position that has been processed. For the forward
ring buffer, the processing direction is the same in both one-by-one
reading and block reading, so map->prev keeps moving forward. For the
backward ring buffer, the processing direction differs: for perf record
the processing direction is backward, so map->prev moves backward; for
one-by-one reading the reading direction is the same as for the forward
ring buffer, so it moves forward. It then still needs to move backward
again to read the next block.
If we could avoid moving it forward in one-by-one reading, map->prev
would be consistent with the direction of the ring buffer in all cases.
Then we could enforce the meaning of the 'process' operation: only
perf_mmap__consume() processes a block of the ring buffer.
>> and when syncing.
>>
>> Then the core of two reading become uniform. Let's get rid of
>> duplication.
>>
>> Summarize:
>>
>> 1. Compute both start and end before reading for both forward and
>> backward (perf_mmap__read_catchup, but need a better name).
> How about perf_mmap__read_init?
Good.
>> 2. Don't update map->prev in perf_mmap__read. Let perf_mmap__read()
>> update start pointer, so it become stateless and suit for both backward
>> and forward reading.
>>
>> 3. Update md->prev for forward reading. Merge it into
>> perf_mmap__consume?
> It looks we have to pass the updated 'start' to perf_mmap__consume.
> Move interfaces like perf_evlist__mmap_read need to be changed.
> That would impacts other tools which only support non-overwrite mode.
>
> How about update both 'md->prev' and 'start' in perf_mmap__read()
> like the patch as below?
What about making perf_mmap__read() totally stateless? Don't touch
md->prev there, and make forward reading do an extra md->prev update
before consuming?
> Introducing a new perf_mmap__read_event to get rid of
> the perf_mmap__read_backward/forward.
>
> perf_mmap__read_backward will be removed later.
> Keep perf_mmap__read_forward since other tools still use it.
OK. Any reader which doesn't care about backward or forward should use
perf_mmap__read(), maintain the start position on its own, and get a
chance to adjust map->prev if the ring buffer is forward.
perf_mmap__read_forward() borrows map->prev to maintain the start
position, like this:
union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
{
....
return perf_mmap__read(map, &map->prev, head);
}
> It has to update the 'end' for non-overwrite mode in each read since the
> ringbuffer is not paused.
>
> diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
> index 484a394..74f33fd 100644
> --- a/tools/perf/util/mmap.c
> +++ b/tools/perf/util/mmap.c
> @@ -22,16 +22,16 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map)
>
> /* When check_messup is true, 'end' must points to a good entry */
> static union perf_event *perf_mmap__read(struct perf_mmap *map,
> - u64 start, u64 end, u64 *prev)
> + u64 *start, u64 end, u64 *prev)
We don't need *prev here because it can be replaced by *start.
> {
> unsigned char *data = map->base + page_size;
> union perf_event *event = NULL;
> - int diff = end - start;
> + int diff = end - *start;
>
> if (diff >= (int)sizeof(event->header)) {
> size_t size;
>
> - event = (union perf_event *)&data[start & map->mask];
> + event = (union perf_event *)&data[*start & map->mask];
> size = event->header.size;
>
> if (size < sizeof(event->header) || diff < (int)size) {
> @@ -43,8 +43,8 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map,
> * Event straddles the mmap boundary -- header should always
> * be inside due to u64 alignment of output.
> */
> - if ((start & map->mask) + size != ((start + size) & map->mask)) {
> - unsigned int offset = start;
> + if ((*start & map->mask) + size != ((*start + size) & map->mask)) {
> + unsigned int offset = *start;
> unsigned int len = min(sizeof(*event), size), cpy;
> void *dst = map->event_copy;
>
> @@ -59,12 +59,12 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map,
> event = (union perf_event *)map->event_copy;
> }
>
> - start += size;
> + *start += size;
> }
>
> broken_event:
> if (prev)
> - *prev = start;
> + *prev = *start;
>
> return event;
> }
> @@ -132,6 +107,42 @@ void perf_mmap__read_catchup(struct perf_mmap *map)
> map->prev = head;
> }
> +
> +/*
> + * Read event from ring buffer. Return one event for each call.
> + * Support both overwrite and non-overwrite mode.
> + * The start and end are only available for overwrite mode, which
> + * pauses the ring buffer.
> + *
> + * Usage:
> + * perf_mmap__read_init
> + * while(event = perf_mmap__read_event) {
> + * //process the event
> + * perf_mmap__consume
We need a transparent way to adjust md->prev before perf_mmap__consume
if the ring buffer is forward. Or let's wrap perf_mmap__consume to do it
if you don't want to break its API?
Thank you.
> >>> +int perf_mmap__read_catchup(struct perf_mmap *map,
> >>> + bool overwrite,
> >>> + u64 *start, u64 *end,
> >>> + unsigned long *size)
> >>> {
> >>> - u64 head;
> >>> + u64 head = perf_mmap__read_head(map);
> >>> + u64 old = map->prev;
> >>> + unsigned char *data = map->base + page_size;
> >>>
> >>> - if (!refcount_read(&map->refcnt))
> >>> - return;
> >>> + *start = overwrite ? head : old;
> >>> + *end = overwrite ? old : head;
> >>>
> >>> - head = perf_mmap__read_head(map);
> >>> - map->prev = head;
> >>> + if (*start == *end)
> >>> + return 0;
> >>> +
> >>> + *size = *end - *start;
> >>> + if (*size > (unsigned long)(map->mask) + 1) {
> >>> + if (!overwrite) {
> >>> + WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
> >>> +
> >>> + map->prev = head;
> >>> + perf_mmap__consume(map, overwrite);
> >>> + return 0;
> >>> + }
> >>> +
> >>> + /*
> >>> + * Backward ring buffer is full. We still have a chance to read
> >>> + * most of data from it.
> >>> + */
> >>> + if (overwrite_rb_find_range(data, map->mask, head, start, end))
> >>> + return -1;
> >>> + }
> >>> +
> >>> + return 1;
> >> Coding suggestion: this function returns 2 different values (1 and -1)
> >> on two fail paths. It should return 0 for success and -1 for failure.
> >>
> > For perf top, -1 is enough for failure.
> > But for perf_mmap__push, it looks like two fail paths are needed,
> > aren't they?
> >
>
> I see. I think this is not good practice. Please try to avoid returning
> 3 states. For example, compare start and end outside? If that can't be
> avoided, how about returning an enum to make the difference clear to
> the reader?
OK. Will avoid it in V3.
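As an illustration of the enum suggestion above, here is a small,
self-contained sketch; the names and states are hypothetical, not the
actual V3 interface.

#include <stdio.h>

/* Hypothetical three-state result for an init/catchup-style helper;
 * the real V3 code may pick different names or drop a state. */
enum ringbuf_init_result {
	RB_INIT_EMPTY,		/* nothing to read: start == end */
	RB_INIT_READY,		/* start/end describe a readable block */
	RB_INIT_LOST,		/* backward buffer overwritten, data lost */
};

static enum ringbuf_init_result mock_read_init(unsigned long start,
					       unsigned long end,
					       int recoverable)
{
	if (start == end)
		return RB_INIT_EMPTY;
	if (!recoverable)
		return RB_INIT_LOST;
	return RB_INIT_READY;
}

int main(void)
{
	switch (mock_read_init(0, 64, 1)) {
	case RB_INIT_EMPTY:
		printf("nothing to read\n");
		break;
	case RB_INIT_READY:
		printf("read the block\n");
		break;
	case RB_INIT_LOST:
		printf("data lost\n");
		break;
	}
	return 0;
}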
> >> 2. Don't update map->prev in perf_mmap__read. Let perf_mmap__read()
> >> update the start pointer, so it becomes stateless and suits both
> >> backward and forward reading.
> >>
> >> 3. Update md->prev for forward reading. Merge it into
> >> perf_mmap__consume?
> > It looks like we have to pass the updated 'start' to perf_mmap__consume.
> > More interfaces like perf_evlist__mmap_read would need to be changed.
> > That would impact other tools which only support non-overwrite mode.
> >
> > How about updating both 'md->prev' and 'start' in perf_mmap__read(),
> > like the patch below?
>
> What about making perf_mmap__read() totally stateless? Don't touch
> md->prev there, and make forward reading do an extra md->prev update
> before consuming?
We can move the update in the new interface perf_mmap__read_event.
>
> > Introducing a new perf_mmap__read_event to get rid of
> > the perf_mmap__read_backward/forward.
> >
> > perf_mmap__read_backward will be removed later.
> > Keep perf_mmap__read_forward since other tools still use it.
>
>
> OK. Any reader that doesn't care about backward or forward should use
> perf_mmap__read(), maintain the start position on its own, and get a
> chance to adjust map->prev if the ring buffer is forward.
>
> perf_mmap__read_forward() borrows map->prev to maintain the start
> position, like this:
>
> union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
> {
> ....
> return perf_mmap__read(map, &map->prev, head);
> }
>
>
Yes, we can do that for the legacy interface.
> > It has to update the 'end' for non-overwrite mode in each read since the
> > ringbuffer is not paused.
> >
> > diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
> > index 484a394..74f33fd 100644
> > --- a/tools/perf/util/mmap.c
> > +++ b/tools/perf/util/mmap.c
> > @@ -22,16 +22,16 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map)
> >
> > /* When check_messup is true, 'end' must points to a good entry */
> > static union perf_event *perf_mmap__read(struct perf_mmap *map,
> > - u64 start, u64 end, u64 *prev)
> > + u64 *start, u64 end, u64 *prev)
>
>
> Don't need *prev because it can be replaced by *start.
>
> > {
> > unsigned char *data = map->base + page_size;
> > union perf_event *event = NULL;
> > - int diff = end - start;
> > + int diff = end - *start;
> >
> > if (diff >= (int)sizeof(event->header)) {
> > size_t size;
> >
> > - event = (union perf_event *)&data[start & map->mask];
> > + event = (union perf_event *)&data[*start & map->mask];
> > size = event->header.size;
> >
> > if (size < sizeof(event->header) || diff < (int)size) {
> > @@ -43,8 +43,8 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map,
> > * Event straddles the mmap boundary -- header should always
> > * be inside due to u64 alignment of output.
> > */
> > - if ((start & map->mask) + size != ((start + size) & map->mask)) {
> > - unsigned int offset = start;
> > + if ((*start & map->mask) + size != ((*start + size) & map->mask)) {
> > + unsigned int offset = *start;
> > unsigned int len = min(sizeof(*event), size), cpy;
> > void *dst = map->event_copy;
> >
> > @@ -59,12 +59,12 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map,
> > event = (union perf_event *)map->event_copy;
> > }
> >
> > - start += size;
> > + *start += size;
> > }
> >
> > broken_event:
> > if (prev)
> > - *prev = start;
> > + *prev = *start;
> >
> > return event;
> > }
> > @@ -132,6 +107,42 @@ void perf_mmap__read_catchup(struct perf_mmap *map)
> > map->prev = head;
> > }
> > +
> > +/*
> > + * Read event from ring buffer. Return one event for each call.
> > + * Support both overwrite and non-overwrite mode.
> > + * The start and end are only available for overwrite mode, which
> > + * pauses the ring buffer.
> > + *
> > + * Usage:
> > + * perf_mmap__read_init
> > + * while(event = perf_mmap__read_event) {
> > + * //process the event
> > + * perf_mmap__consume
>
> We need a transparent way to adjust md->prev before perf_mmap__consume
> if the ring buffer is forward. Or let's wrap perf_mmap__consume to do it
> if you don't want to break its API?
I think the best place is perf_mmap__read_event, if you don't want to
update it in perf_mmap__read.
Wrapping perf_mmap__consume would still need 'start' to be passed as a
parameter, and would add a new wrapper interface.
In perf_mmap__read_event, 'start' is already there.
We only need to do this:
+ if (!overwrite)
+ map->prev = *start;
Thanks,
Kan
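For reference, here is a compilable mock of the usage pattern this
thread converges on (a read_init / read_event / consume loop, with the
forward-mode map->prev update folded into read_event). The names and
details are assumptions drawn from the discussion above, not the final
upstream code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mock stand-ins for struct perf_mmap and union perf_event. */
struct mock_mmap {
	uint64_t prev;				/* last committed position */
	uint64_t head;				/* kernel write position */
};

struct mock_event {
	uint64_t pos;
};

/* read_init: pick the block to read; as in the quoted catchup code,
 * overwrite mode swaps start/end (this mock only walks the forward case). */
static void mock_read_init(struct mock_mmap *map, bool overwrite,
			   uint64_t *start, uint64_t *end)
{
	*start = overwrite ? map->head : map->prev;
	*end   = overwrite ? map->prev : map->head;
}

/* read_event: one event per call; in non-overwrite (forward) mode also
 * advance map->prev, as proposed at the end of the thread. */
static struct mock_event *mock_read_event(struct mock_mmap *map,
					  bool overwrite,
					  uint64_t *start, uint64_t end)
{
	static struct mock_event ev;

	if (*start >= end)
		return NULL;
	ev.pos = *start;
	*start += 8;				/* pretend events are 8 bytes */
	if (!overwrite)
		map->prev = *start;		/* the "map->prev = *start" hunk */
	return &ev;
}

/* consume: the real code would update the tail / un-pause the buffer. */
static void mock_consume(struct mock_mmap *map, bool overwrite)
{
	(void)map;
	(void)overwrite;
}

int main(void)
{
	struct mock_mmap map = { .prev = 0, .head = 32 };
	struct mock_event *event;
	uint64_t start, end;
	bool overwrite = false;

	mock_read_init(&map, overwrite, &start, &end);
	while ((event = mock_read_event(&map, overwrite, &start, end))) {
		printf("event at %llu\n", (unsigned long long)event->pos);
		mock_consume(&map, overwrite);
	}
	return 0;
}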