Hi,
Here are 2 small patches which try to fulfill a point in the perf todo
list:
* Forward port the page fault tracepoints and use it in 'trace'.
http://git.kernel.org/?p=linux/kernel/git/acme/linux.git;a=commitdiff;h=d53b11976093b6d8afeb8181db53aaffc754920d;hp=32ba4abf60ae1b710d22a75725491815de649bc5
There are some questionable points:
* With luck I think I found the patch related with the todo item (the
link in the todo wiki page is broken); I hope I am not wrong...
* In the patch mentioned above, I found only changes related with
tracepoints creations and calls; the tracepoints were declared
generic (in include/trace/events/kmem.h) but were only called in x86
(arch/x86/mm/fault.c); as in x86, the tracepoint
"mm_pagefault_start" looks fairly like "page_fault_user" and
"page_fault_kernel", I decided to only create one x86-specific
tracepoint: "page_fault_exit"; maybe, you would prefer declaring
generic tracepoints;
* No option has been added for activating page-fault duration
calculation: if the needed tracepoints are available, durations will
be printed; maybe, that was not what you were looking for.
The patches were generated against tip/perf/core.
Alexis.
Alexis Berlemont (2):
perf, x86-mm: Add exit-fault tracing
perf: add page fault duration measures in perf trace
arch/x86/include/asm/trace/exceptions.h | 21 +++
arch/x86/mm/fault.c | 1 +
tools/perf/builtin-trace.c | 225 ++++++++++++++++++++++++++++----
3 files changed, 221 insertions(+), 26 deletions(-)
--
2.10.1
Signed-off-by: Alexis Berlemont <[email protected]>
---
tools/perf/Documentation/perf-trace.txt | 4 +-
tools/perf/builtin-trace.c | 225 ++++++++++++++++++++++++++++----
2 files changed, 202 insertions(+), 27 deletions(-)
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 781b019..53c103c 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -117,7 +117,9 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
-F=[all|min|maj]::
--pf=[all|min|maj]::
Trace pagefaults. Optionally, you can specify whether you want minor,
- major or all pagefaults. Default value is maj.
+ major or all pagefaults. Default value is maj. Durations of
+ page-fault handling will be printed if possible (need for some
+ architecture-dependent tracepoints).
--syscalls::
Trace system calls. This options is enabled by default.
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 5f45166..100c28a 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -861,6 +861,8 @@ struct thread_trace {
} paths;
struct intlist *syscall_stats;
+ u64 pgfault_entry_time;
+ char *pgfault_entry_str;
};
static struct thread_trace *thread_trace__new(void)
@@ -1797,21 +1799,56 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
return 0;
}
-static void print_location(FILE *f, struct perf_sample *sample,
- struct addr_location *al,
- bool print_dso, bool print_sym)
+static int trace__pgfault_enter(struct trace *trace,
+ struct perf_evsel *evsel __maybe_unused,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample)
{
+ struct thread *thread;
+ struct thread_trace *ttrace;
+ int err = -1;
+
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+ if (!thread)
+ goto out;
+
+ ttrace = thread__trace(thread, trace->output);
+ if (ttrace == NULL)
+ goto out_put;
+
+ ttrace->pgfault_entry_time = sample->time;
+
+out:
+ err = 0;
+out_put:
+ thread__put(thread);
+ return err;
+}
+
+static size_t scnprintf_location(char *bf, size_t size,
+ struct perf_sample *sample,
+ struct addr_location *al,
+ bool print_dso, bool print_sym)
+{
+ size_t printed = 0;
if ((verbose || print_dso) && al->map)
- fprintf(f, "%s@", al->map->dso->long_name);
+ printed += scnprintf(bf + printed,
+ size - printed, "%s@", al->map->dso->long_name);
if ((verbose || print_sym) && al->sym)
- fprintf(f, "%s+0x%" PRIx64, al->sym->name,
- al->addr - al->sym->start);
+ printed += scnprintf(bf + printed,
+ size - printed,
+ "%s+0x%" PRIx64, al->sym->name,
+ al->addr - al->sym->start);
else if (al->map)
- fprintf(f, "0x%" PRIx64, al->addr);
+ printed += scnprintf(bf + printed,
+ size - printed, "0x%" PRIx64, al->addr);
else
- fprintf(f, "0x%" PRIx64, sample->addr);
+ printed += scnprintf(bf + printed,
+ size - printed, "0x%" PRIx64, sample->addr);
+
+ return printed;
}
static int trace__pgfault(struct trace *trace,
@@ -1823,13 +1860,22 @@ static int trace__pgfault(struct trace *trace,
struct addr_location al;
char map_type = 'd';
struct thread_trace *ttrace;
+ size_t printed = 0;
int err = -1;
int callchain_ret = 0;
thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+ if (!thread)
+ goto out;
- if (sample->callchain) {
- callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+ ttrace = thread__trace(thread, trace->output);
+ if (ttrace == NULL)
+ goto out_put;
+
+ if (!ttrace->pgfault_entry_time && sample->callchain) {
+ callchain_ret =
+ trace__resolve_callchain(trace, evsel,
+ sample, &callchain_cursor);
if (callchain_ret == 0) {
if (callchain_cursor.nr < trace->min_stack)
goto out_put;
@@ -1837,10 +1883,6 @@ static int trace__pgfault(struct trace *trace,
}
}
- ttrace = thread__trace(thread, trace->output);
- if (ttrace == NULL)
- goto out_put;
-
if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
ttrace->pfmaj++;
else
@@ -1849,18 +1891,27 @@ static int trace__pgfault(struct trace *trace,
if (trace->summary_only)
goto out;
+ if (ttrace->pgfault_entry_str == NULL) {
+ ttrace->pgfault_entry_str = malloc(trace__entry_str_size);
+ if (!ttrace->pgfault_entry_str)
+ goto out_put;
+ ttrace->pgfault_entry_str[0] = '\0';
+ }
+
thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
sample->ip, &al);
- trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
-
- fprintf(trace->output, "%sfault [",
- evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
- "maj" : "min");
+ printed += scnprintf(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed, "%sfault [",
+ evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
+ "maj" : "min");
- print_location(trace->output, sample, &al, false, true);
+ printed += scnprintf_location(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed,
+ sample, &al, false, true);
- fprintf(trace->output, "] => ");
+ printed += scnprintf(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed, "] => ");
thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
sample->addr, &al);
@@ -1875,14 +1926,97 @@ static int trace__pgfault(struct trace *trace,
map_type = '?';
}
- print_location(trace->output, sample, &al, true, false);
+ printed += scnprintf_location(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed,
+ sample, &al, true, false);
- fprintf(trace->output, " (%c%c)\n", map_type, al.level);
+ printed += scnprintf(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed,
+ " (%c%c)", map_type, al.level);
+
+ if (!ttrace->pgfault_entry_time) {
+ trace__fprintf_entry_head(trace, thread,
+ 0, sample->time, trace->output);
+ fprintf(trace->output, "%-70s\n", ttrace->pgfault_entry_str);
+
+ if (callchain_ret > 0)
+ trace__fprintf_callchain(trace, sample);
+ else if (callchain_ret < 0)
+ pr_err("Problem processing %s callchain, skipping...\n",
+ perf_evsel__name(evsel));
+ }
+
+out:
+ err = 0;
+out_put:
+ thread__put(thread);
+ return err;
+}
+
+static int trace__pgfault_exit(struct trace *trace, struct perf_evsel *evsel,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample)
+{
+ struct thread *thread;
+ struct thread_trace *ttrace;
+ u64 duration = 0;
+ int err = -1, callchain_ret = 0;
+
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+
+ ttrace = thread__priv(thread);
+ if (!ttrace)
+ goto out_put;
+
+ if (!ttrace->pgfault_entry_time)
+ goto out;
+
+ /*
+ * The check below is necessary for a specific case: it is
+ * possible to enable only major (or minor) page faults
+ * software events but it is impossible to filter page-fault
+ * related tracepoints according to the major / minor
+ * characteristic.
+ */
+
+ if (!ttrace->pgfault_entry_str ||
+ strlen(ttrace->pgfault_entry_str) == 0)
+ goto out;
+
+ if (sample->callchain) {
+ callchain_ret =
+ trace__resolve_callchain(trace, evsel,
+ sample, &callchain_cursor);
+ if (callchain_ret == 0) {
+ if (callchain_cursor.nr < trace->min_stack)
+ goto out;
+ callchain_ret = 1;
+ }
+ }
+
+ if (ttrace->pgfault_entry_time) {
+ duration = sample->time - ttrace->pgfault_entry_time;
+ if (trace__filter_duration(trace, duration))
+ goto out;
+ }
+
+ trace__fprintf_entry_head(trace, thread,
+ duration, sample->time, trace->output);
+
+ fprintf(trace->output, "%-70s\n", ttrace->pgfault_entry_str);
+
+ /*
+ * Once the string is printed; clear it just in case the next
+ * software events is filtered (because of major / minor.
+ */
+ ttrace->pgfault_entry_str[0] = '\0';
if (callchain_ret > 0)
trace__fprintf_callchain(trace, sample);
else if (callchain_ret < 0)
- pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
+ pr_err("Problem processing %s callchain, skipping...\n",
+ perf_evsel__name(evsel));
+
out:
err = 0;
out_put:
@@ -2062,6 +2196,37 @@ static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
return evsel;
}
+static int trace__add_pgfault_newtp(struct trace *trace)
+{
+ int err = -1;
+ struct perf_evlist *evlist = trace->evlist;
+
+ /*
+ * The tracepoints exceptions::page_fault_* are not available
+ * on all architecture.; so, we consider it is not an error if
+ * the 1st one's initialization is KO...
+ */
+ err = perf_evlist__add_newtp(evlist, "exceptions",
+ "page_fault_exit", trace__pgfault_exit);
+ if (err < 0)
+ return 0;
+
+ /*
+ * ...however, if the 2nd or the 3rd init is KO; there is
+ * definitely something wrong not related with the absence of
+ * tracepoints
+ */
+ err = perf_evlist__add_newtp(evlist, "exceptions",
+ "page_fault_kernel", trace__pgfault_enter);
+ if (err < 0)
+ return -1;
+
+ err = perf_evlist__add_newtp(evlist, "exceptions",
+ "page_fault_user", trace__pgfault_enter);
+
+ return err;
+}
+
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
const u32 type = event->header.type;
@@ -2193,9 +2358,12 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
perf_evlist__add(evlist, pgfault_min);
}
+ if (trace->trace_pgfaults && trace__add_pgfault_newtp(trace))
+ goto out_error_pgfault_tp;
+
if (trace->sched &&
- perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
- trace__sched_stat_runtime))
+ perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
+ trace__sched_stat_runtime))
goto out_error_sched_stat_runtime;
err = perf_evlist__create_maps(evlist, &trace->opts.target);
@@ -2399,6 +2567,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
goto out_error;
+out_error_pgfault_tp:
+ tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf),
+ "exceptions", "page_fault_(kernel|user)");
+ goto out_error;
+
out_error_mmap:
perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
goto out_error;
--
2.10.1
Signed-off-by: Alexis Berlemont <[email protected]>
---
arch/x86/include/asm/trace/exceptions.h | 21 +++++++++++++++++++++
arch/x86/mm/fault.c | 1 +
2 files changed, 22 insertions(+)
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 2fbc66c..39f78bb 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -43,6 +43,27 @@ DEFINE_EVENT_FN(x86_exceptions, name, \
DEFINE_PAGE_FAULT_EVENT(page_fault_user);
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
+TRACE_EVENT_FN(page_fault_exit,
+
+ TP_PROTO(unsigned long address),
+
+ TP_ARGS(address),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, address)
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ ),
+
+ TP_printk("address=%lx", __entry->address),
+
+ trace_irq_vector_regfunc,
+
+ trace_irq_vector_unregfunc
+);
+
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE exceptions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9f72ca3..e31e8ef 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1488,6 +1488,7 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
prev_state = exception_enter();
trace_page_fault_entries(address, regs, error_code);
__do_page_fault(regs, error_code, address);
+ trace_page_fault_exit(address);
exception_exit(prev_state);
}
NOKPROBE_SYMBOL(trace_do_page_fault);
--
2.10.1
On Wed, Oct 26, 2016 at 01:51:58AM +0200, Alexis Berlemont wrote:
> Hi,
>
> Here are 2 small patches which try to fulfill a point in the perf todo
> list:
There's a todo list?
> * Forward port the page fault tracepoints and use it in 'trace'.
> http://git.kernel.org/?p=linux/kernel/git/acme/linux.git;a=commitdiff;h=d53b11976093b6d8afeb8181db53aaffc754920d;hp=32ba4abf60ae1b710d22a75725491815de649bc5
dead link
On Wed, Oct 26, 2016 at 01:51:59AM +0200, Alexis Berlemont wrote:
-ENOCHANGELOG
> Signed-off-by: Alexis Berlemont <[email protected]>
> ---
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 9f72ca3..e31e8ef 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1488,6 +1488,7 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
> prev_state = exception_enter();
> trace_page_fault_entries(address, regs, error_code);
> __do_page_fault(regs, error_code, address);
> + trace_page_fault_exit(address);
Aside from my general hatred of tracepoint, it bugs me that its not
symmetric like the irq vector ones. But I'll leave that to x86 people.
> exception_exit(prev_state);
> }
> NOKPROBE_SYMBOL(trace_do_page_fault);
> --
> 2.10.1
>
Em Wed, Oct 26, 2016 at 10:46:18AM +0200, Peter Zijlstra escreveu:
> On Wed, Oct 26, 2016 at 01:51:58AM +0200, Alexis Berlemont wrote:
> > Hi,
> >
> > Here are 2 small patches which try to fulfill a point in the perf todo
> > list:
>
> There's a todo list?
https://perf.wiki.kernel.org/index.php/Todo
> > * Forward port the page fault tracepoints and use it in 'trace'.
> > http://git.kernel.org/?p=linux/kernel/git/acme/linux.git;a=commitdiff;h=d53b11976093b6d8afeb8181db53aaffc754920d;hp=32ba4abf60ae1b710d22a75725491815de649bc5
>
> dead link
I guess this is the one:
http://git.kernel.org/cgit/linux/kernel/git/acme/linux.git/commit/?id=eea86c6e06c241667c96c8e87e43c0870e1d6285
author Frederic Weisbecker <[email protected]> 2010-11-12 04:35:06 (GMT)
committer Ingo Molnar <[email protected]> 2011-05-07 16:10:43 (GMT)
commit eea86c6e06c241667c96c8e87e43c0870e1d6285 (patch)
tree b00ce7ef9a6219f4df6b0e1ffe453d2d5baf3593
parent 57d524154ffe99d27fb55e0e30ddbad9f4c35806 (diff)
perf, mm: Add fault tracing
Part of that are two modified patches from Jiri Olsa who added the fault
tracepoints. I had to split them in two tracepoints so that we get the
faults
handling duration.
Originally-from: Frederic Weisbecker <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
--------------------
After that entry was added to the todo we got --pf, but no duration:
[root@jouet ~]# perf trace --pf maj --no-syscalls
0.000 ( 0.000 ms): perf/20412 majfault [__memcmp_sse4_1+0xbc6] => 0x7f1e0010c000 (?.)
1.156 ( 0.000 ms): perf/20412 majfault [__memcmp_sse4_1+0xbc6] => /usr/bin/mv@0x0 (d.)
56.907 ( 0.000 ms): perf/20412 majfault [__memcpy_avx_unaligned+0x1e7] => /usr/bin/mv@0x207a0 (d.)
57.683 ( 0.000 ms): perf/20412 majfault [__memcmp_sse4_1+0xbc6] => /usr/lib/debug/usr/bin/mv.debug@0x0 (d.)
60.916 ( 0.000 ms): perf/20412 majfault [__memcpy_avx_unaligned+0x2b6] => /usr/lib/debug/usr/bin/mv.debug@0x6ace0 (d.)
18446744073708.766 ( 0.000 ms): systemd-journa/578 majfault [0x14972] => 0x7f25bdd9ebd0 (?.)
70.193 ( 0.000 ms): perf/20412 majfault [__memcmp_sse4_1+0xbc6] => /usr/lib/debug/usr/lib/systemd/systemd-journald.debug@0x0 (d.)
71.176 ( 0.000 ms): perf/20412 majfault [__memcpy_avx_unaligned+0x2b6] => /usr/lib/debug/usr/lib/systemd/systemd-journald.debug@0x1221e0 (d.)
^C[root@jouet ~]# trace -h --pf
Usage: perf trace [<options>] [<command>]
or: perf trace [<options>] -- <command> [<options>]
or: perf trace record [<options>] [<command>]
or: perf trace record [<options>] -- <command> [<options>]
-F, --pf <all|maj|min>
Trace pagefaults
[root@jouet ~]#
Em Wed, Oct 26, 2016 at 10:51:16AM +0200, Peter Zijlstra escreveu:
> On Wed, Oct 26, 2016 at 01:51:59AM +0200, Alexis Berlemont wrote:
>
> -ENOCHANGELOG
Yeah, please add one explaining why we want this, how we can use it, for
example, in 'perf trace', to measure how long each page fault took,
examples of it in use, etc.
> > Signed-off-by: Alexis Berlemont <[email protected]>
> > ---
>
> > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> > index 9f72ca3..e31e8ef 100644
> > --- a/arch/x86/mm/fault.c
> > +++ b/arch/x86/mm/fault.c
> > @@ -1488,6 +1488,7 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
> > prev_state = exception_enter();
> > trace_page_fault_entries(address, regs, error_code);
> > __do_page_fault(regs, error_code, address);
> > + trace_page_fault_exit(address);
>
> Aside from my general hatred of tracepoint, it bugs me that its not
> symmetric like the irq vector ones. But I'll leave that to x86 people.
What is the simmetry problem, you think we should have:
exceptions:page_fault_kernel_exit
exceptions:page_fault_user_exit
To be counterparts of:
[root@jouet ~]# perf list exceptions:*
List of pre-defined events (to be used in -e):
exceptions:page_fault_kernel [Tracepoint event]
exceptions:page_fault_user [Tracepoint event]
[root@jouet ~]#
?
For perf usage yeah, the good thing would be to have just
exceptions:page_fault_entry and exceptions:page_fault_exit, and then use
perf_event_attr fields to filter what kind was desired, with a
header.misc stating where it took place, just like with other events,
but by now we have two for entry, so two for exit?
- Arnaldo
> > exception_exit(prev_state);
> > }
> > NOKPROBE_SYMBOL(trace_do_page_fault);
> > --
> > 2.10.1
> >
On Wed, Oct 26, 2016 at 04:53:39PM -0200, Arnaldo Carvalho de Melo wrote:
> > > +++ b/arch/x86/mm/fault.c
> > > @@ -1488,6 +1488,7 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
> > > prev_state = exception_enter();
> > > trace_page_fault_entries(address, regs, error_code);
> > > __do_page_fault(regs, error_code, address);
> > > + trace_page_fault_exit(address);
> >
> > Aside from my general hatred of tracepoint, it bugs me that its not
> > symmetric like the irq vector ones. But I'll leave that to x86 people.
>
> What is the simmetry problem, you think we should have:
Look at arch/x86/include/asm/trace/irq_vectors.h and
$ git grep "trace.*_VECTOR"
The entry and exit tracepoints are fully symmetric and generate from a
single macro.
On Wed, Oct 26, 2016 at 04:53:39PM -0200, Arnaldo Carvalho de Melo wrote:
> Em Wed, Oct 26, 2016 at 10:51:16AM +0200, Peter Zijlstra escreveu:
> > On Wed, Oct 26, 2016 at 01:51:59AM +0200, Alexis Berlemont wrote:
> >
> > -ENOCHANGELOG
>
> Yeah, please add one explaining why we want this, how we can use it, for
> example, in 'perf trace', to measure how long each page fault took,
> examples of it in use, etc.
Note that 'to measure page-fault duration' is not a reason, its a what.
This needs a proper reason.
So, would you be OK with the following patch ?
There is a symmetry just like irq_vectors tracepoints:
# perf list | grep -E "exception|irq_vectors"
exceptions:page_fault_kernel_entry [Tracepoint event]
exceptions:page_fault_kernel_exit [Tracepoint event]
exceptions:page_fault_user_entry [Tracepoint event]
exceptions:page_fault_user_exit [Tracepoint event]
irq_vectors:call_function_entry [Tracepoint event]
irq_vectors:call_function_exit [Tracepoint event]
...
Maybe 2 tracepoints (instead of 4) should have been enough; however,
there were already 1 tracepoint per mode before.
Alexis.
Alexis Berlemont (1):
perf, x86-mm: declare page-faults tracepoints like irq-vectors ones
arch/x86/include/asm/trace/exceptions.h | 17 ++++++++++++++++-
arch/x86/mm/fault.c | 17 ++++++++++++++---
2 files changed, 30 insertions(+), 4 deletions(-)
--
2.10.1
The tracepoints "exceptions:page_fault_(user|kernel)" were renamed
"exceptions:page_fault_(user|kernel)_entry". Two more tracepoints were
added: "exceptions:page_fault_(user|kernel)_exit". These events are
called right before and after __do_page_fault().
Signed-off-by: Alexis Berlemont <[email protected]>
---
arch/x86/include/asm/trace/exceptions.h | 17 ++++++++++++++++-
arch/x86/mm/fault.c | 17 ++++++++++++++---
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 2fbc66c..ecbe64e 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -33,14 +33,29 @@ DECLARE_EVENT_CLASS(x86_exceptions,
__entry->error_code) );
#define DEFINE_PAGE_FAULT_EVENT(name) \
-DEFINE_EVENT_FN(x86_exceptions, name, \
+DEFINE_EVENT_FN(x86_exceptions, name##_entry, \
+ TP_PROTO(unsigned long address, struct pt_regs *regs, \
+ unsigned long error_code), \
+ TP_ARGS(address, regs, error_code), \
+ trace_irq_vector_regfunc, \
+ trace_irq_vector_unregfunc); \
+DEFINE_EVENT_FN(x86_exceptions, name##_exit, \
TP_PROTO(unsigned long address, struct pt_regs *regs, \
unsigned long error_code), \
TP_ARGS(address, regs, error_code), \
trace_irq_vector_regfunc, \
trace_irq_vector_unregfunc);
+/*
+ * page_fault_user - called when entering/exiting a page fault in
+ * user-space
+ */
DEFINE_PAGE_FAULT_EVENT(page_fault_user);
+
+/*
+ * page_fault_kernel - called when entering/exiting a page fault in
+ * kernel-space
+ */
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9f72ca3..2ed2716 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1465,12 +1465,22 @@ NOKPROBE_SYMBOL(do_page_fault);
#ifdef CONFIG_TRACING
static nokprobe_inline void
trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+ unsigned long error_code)
{
if (user_mode(regs))
- trace_page_fault_user(address, regs, error_code);
+ trace_page_fault_user_entry(address, regs, error_code);
else
- trace_page_fault_kernel(address, regs, error_code);
+ trace_page_fault_kernel_entry(address, regs, error_code);
+}
+
+static nokprobe_inline void
+trace_page_fault_exits(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_page_fault_user_exit(address, regs, error_code);
+ else
+ trace_page_fault_kernel_exit(address, regs, error_code);
}
dotraplinkage void notrace
@@ -1488,6 +1498,7 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
prev_state = exception_enter();
trace_page_fault_entries(address, regs, error_code);
__do_page_fault(regs, error_code, address);
+ trace_page_fault_exits(address, regs, error_code);
exception_exit(prev_state);
}
NOKPROBE_SYMBOL(trace_do_page_fault);
--
2.10.1
Hi,
Here is another attempt to make the perf-trace subcommand print the
page fault durations.
This solution is based on static tracepoints and it was necessary to
add 2 more tracepoints. I understood that adding tracepoints is a
questionable solution; I can try something else if someone has another
idea.
Alexis.
Alexis Berlemont (2):
perf, x86-mm: declare page-faults tracepoints like irq-vectors ones
perf: add page fault duration measures in perf trace
arch/x86/include/asm/trace/exceptions.h | 17 ++-
arch/x86/mm/fault.c | 17 ++-
tools/perf/Documentation/perf-trace.txt | 4 +-
tools/perf/builtin-trace.c | 231 ++++++++++++++++++++++++++++----
4 files changed, 238 insertions(+), 31 deletions(-)
--
2.10.2
Thanks to the tracepoints exceptions:page_fault_kernel_exit and
exceptions:page_fault_user_exit, the subcommand perf-trace can now
print the duration of page-fault exception handling just like it is
done for syscalls.
Here is the result; the page fault durations are indicated in
parenthesis (like syscalls):
Here is an example; the durations are indicated in parenthesis just
like with syscalls:
0.053 ( 0.053 ms): ls/152 minfault [__clear_user+0x25] => 0x694e7e (?k)
0.114 ( 0.027 ms): ls/152 minfault [__clear_user+0x25] => 0x7fb912eaafc0 (?k)
0.201 ( 0.030 ms): ls/152 minfault [_start+0x0] => /lib/ld-2.23.so@0xc70 (x.)
...
Signed-off-by: Alexis Berlemont <[email protected]>
---
tools/perf/Documentation/perf-trace.txt | 4 +-
tools/perf/builtin-trace.c | 231 ++++++++++++++++++++++++++++----
2 files changed, 208 insertions(+), 27 deletions(-)
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 781b019..53c103c 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -117,7 +117,9 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
-F=[all|min|maj]::
--pf=[all|min|maj]::
Trace pagefaults. Optionally, you can specify whether you want minor,
- major or all pagefaults. Default value is maj.
+ major or all pagefaults. Default value is maj. Durations of
+ page-fault handling will be printed if possible (need for some
+ architecture-dependent tracepoints).
--syscalls::
Trace system calls. This options is enabled by default.
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 5f45166..f0957a8 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -861,6 +861,8 @@ struct thread_trace {
} paths;
struct intlist *syscall_stats;
+ u64 pgfault_entry_time;
+ char *pgfault_entry_str;
};
static struct thread_trace *thread_trace__new(void)
@@ -1797,21 +1799,56 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
return 0;
}
-static void print_location(FILE *f, struct perf_sample *sample,
- struct addr_location *al,
- bool print_dso, bool print_sym)
+static int trace__pgfault_enter(struct trace *trace,
+ struct perf_evsel *evsel __maybe_unused,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample)
{
+ struct thread *thread;
+ struct thread_trace *ttrace;
+ int err = -1;
+
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+ if (!thread)
+ goto out;
+
+ ttrace = thread__trace(thread, trace->output);
+ if (ttrace == NULL)
+ goto out_put;
+
+ ttrace->pgfault_entry_time = sample->time;
+
+out:
+ err = 0;
+out_put:
+ thread__put(thread);
+ return err;
+}
+
+static size_t scnprintf_location(char *bf, size_t size,
+ struct perf_sample *sample,
+ struct addr_location *al,
+ bool print_dso, bool print_sym)
+{
+ size_t printed = 0;
if ((verbose || print_dso) && al->map)
- fprintf(f, "%s@", al->map->dso->long_name);
+ printed += scnprintf(bf + printed,
+ size - printed, "%s@", al->map->dso->long_name);
if ((verbose || print_sym) && al->sym)
- fprintf(f, "%s+0x%" PRIx64, al->sym->name,
- al->addr - al->sym->start);
+ printed += scnprintf(bf + printed,
+ size - printed,
+ "%s+0x%" PRIx64, al->sym->name,
+ al->addr - al->sym->start);
else if (al->map)
- fprintf(f, "0x%" PRIx64, al->addr);
+ printed += scnprintf(bf + printed,
+ size - printed, "0x%" PRIx64, al->addr);
else
- fprintf(f, "0x%" PRIx64, sample->addr);
+ printed += scnprintf(bf + printed,
+ size - printed, "0x%" PRIx64, sample->addr);
+
+ return printed;
}
static int trace__pgfault(struct trace *trace,
@@ -1823,13 +1860,22 @@ static int trace__pgfault(struct trace *trace,
struct addr_location al;
char map_type = 'd';
struct thread_trace *ttrace;
+ size_t printed = 0;
int err = -1;
int callchain_ret = 0;
thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+ if (!thread)
+ goto out;
- if (sample->callchain) {
- callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+ ttrace = thread__trace(thread, trace->output);
+ if (ttrace == NULL)
+ goto out_put;
+
+ if (!ttrace->pgfault_entry_time && sample->callchain) {
+ callchain_ret =
+ trace__resolve_callchain(trace, evsel,
+ sample, &callchain_cursor);
if (callchain_ret == 0) {
if (callchain_cursor.nr < trace->min_stack)
goto out_put;
@@ -1837,10 +1883,6 @@ static int trace__pgfault(struct trace *trace,
}
}
- ttrace = thread__trace(thread, trace->output);
- if (ttrace == NULL)
- goto out_put;
-
if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
ttrace->pfmaj++;
else
@@ -1849,18 +1891,27 @@ static int trace__pgfault(struct trace *trace,
if (trace->summary_only)
goto out;
+ if (ttrace->pgfault_entry_str == NULL) {
+ ttrace->pgfault_entry_str = malloc(trace__entry_str_size);
+ if (!ttrace->pgfault_entry_str)
+ goto out_put;
+ ttrace->pgfault_entry_str[0] = '\0';
+ }
+
thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
sample->ip, &al);
- trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
-
- fprintf(trace->output, "%sfault [",
- evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
- "maj" : "min");
+ printed += scnprintf(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed, "%sfault [",
+ evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
+ "maj" : "min");
- print_location(trace->output, sample, &al, false, true);
+ printed += scnprintf_location(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed,
+ sample, &al, false, true);
- fprintf(trace->output, "] => ");
+ printed += scnprintf(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed, "] => ");
thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
sample->addr, &al);
@@ -1875,14 +1926,97 @@ static int trace__pgfault(struct trace *trace,
map_type = '?';
}
- print_location(trace->output, sample, &al, true, false);
+ printed += scnprintf_location(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed,
+ sample, &al, true, false);
+
+ printed += scnprintf(ttrace->pgfault_entry_str + printed,
+ trace__entry_str_size - printed,
+ " (%c%c)", map_type, al.level);
+
+ if (!ttrace->pgfault_entry_time) {
+ trace__fprintf_entry_head(trace, thread,
+ 0, sample->time, trace->output);
+ fprintf(trace->output, "%-70s\n", ttrace->pgfault_entry_str);
+
+ if (callchain_ret > 0)
+ trace__fprintf_callchain(trace, sample);
+ else if (callchain_ret < 0)
+ pr_err("Problem processing %s callchain, skipping...\n",
+ perf_evsel__name(evsel));
+ }
+
+out:
+ err = 0;
+out_put:
+ thread__put(thread);
+ return err;
+}
+
+static int trace__pgfault_exit(struct trace *trace, struct perf_evsel *evsel,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample)
+{
+ struct thread *thread;
+ struct thread_trace *ttrace;
+ u64 duration = 0;
+ int err = -1, callchain_ret = 0;
+
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+
+ ttrace = thread__priv(thread);
+ if (!ttrace)
+ goto out_put;
+
+ if (!ttrace->pgfault_entry_time)
+ goto out;
+
+ /*
+ * The check below is necessary for a specific case: it is
+ * possible to enable only major (or minor) page faults
+ * software events but it is impossible to filter page-fault
+ * related tracepoints according to the major / minor
+ * characteristic.
+ */
+
+ if (!ttrace->pgfault_entry_str ||
+ strlen(ttrace->pgfault_entry_str) == 0)
+ goto out;
+
+ if (sample->callchain) {
+ callchain_ret =
+ trace__resolve_callchain(trace, evsel,
+ sample, &callchain_cursor);
+ if (callchain_ret == 0) {
+ if (callchain_cursor.nr < trace->min_stack)
+ goto out;
+ callchain_ret = 1;
+ }
+ }
+
+ if (ttrace->pgfault_entry_time) {
+ duration = sample->time - ttrace->pgfault_entry_time;
+ if (trace__filter_duration(trace, duration))
+ goto out;
+ }
+
+ trace__fprintf_entry_head(trace, thread,
+ duration, sample->time, trace->output);
- fprintf(trace->output, " (%c%c)\n", map_type, al.level);
+ fprintf(trace->output, "%-70s\n", ttrace->pgfault_entry_str);
+
+ /*
+ * Once the string is printed; clear it just in case the next
+ * software events is filtered (because of major / minor.
+ */
+ ttrace->pgfault_entry_str[0] = '\0';
if (callchain_ret > 0)
trace__fprintf_callchain(trace, sample);
else if (callchain_ret < 0)
- pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
+ pr_err("Problem processing %s callchain, skipping...\n",
+ perf_evsel__name(evsel));
+
out:
err = 0;
out_put:
@@ -2062,6 +2196,43 @@ static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
return evsel;
}
+static int trace__add_pgfault_newtp(struct trace *trace)
+{
+ int err = -1;
+ struct perf_evlist *evlist = trace->evlist;
+
+ /*
+ * The tracepoints exceptions::page_fault_* are not available
+ * on all architecture.; so, we consider it is not an error if
+ * the 1st one's initialization is KO...
+ */
+ err = perf_evlist__add_newtp(evlist, "exceptions",
+ "page_fault_kernel_exit", trace__pgfault_exit);
+ if (err < 0)
+ return 0;
+
+ /*
+ * ...however, if the 2nd, 3rd or 4th init is KO; there is
+ * definitely something wrong not related with the absence of
+ * tracepoints
+ */
+ err = perf_evlist__add_newtp(evlist, "exceptions",
+ "page_fault_user_exit", trace__pgfault_exit);
+ if (err < 0)
+ return -1;
+
+ err = perf_evlist__add_newtp(evlist, "exceptions",
+ "page_fault_kernel_entry",
+ trace__pgfault_enter);
+ if (err < 0)
+ return -1;
+
+ err = perf_evlist__add_newtp(evlist, "exceptions",
+ "page_fault_user_entry", trace__pgfault_enter);
+
+ return err;
+}
+
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
const u32 type = event->header.type;
@@ -2193,9 +2364,12 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
perf_evlist__add(evlist, pgfault_min);
}
+ if (trace->trace_pgfaults && trace__add_pgfault_newtp(trace))
+ goto out_error_pgfault_tp;
+
if (trace->sched &&
- perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
- trace__sched_stat_runtime))
+ perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
+ trace__sched_stat_runtime))
goto out_error_sched_stat_runtime;
err = perf_evlist__create_maps(evlist, &trace->opts.target);
@@ -2399,6 +2573,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
goto out_error;
+out_error_pgfault_tp:
+ tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf),
+ "exceptions", "page_fault_(kernel|user)");
+ goto out_error;
+
out_error_mmap:
perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
goto out_error;
--
2.10.2
The tracepoints "exceptions:page_fault_(user|kernel)" were renamed
"exceptions:page_fault_(user|kernel)_entry". Two more tracepoints were
added: "exceptions:page_fault_(user|kernel)_exit". These events are
called right before and after __do_page_fault().
Signed-off-by: Alexis Berlemont <[email protected]>
---
arch/x86/include/asm/trace/exceptions.h | 17 ++++++++++++++++-
arch/x86/mm/fault.c | 17 ++++++++++++++---
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 2fbc66c..ecbe64e 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -33,14 +33,29 @@ DECLARE_EVENT_CLASS(x86_exceptions,
__entry->error_code) );
#define DEFINE_PAGE_FAULT_EVENT(name) \
-DEFINE_EVENT_FN(x86_exceptions, name, \
+DEFINE_EVENT_FN(x86_exceptions, name##_entry, \
+ TP_PROTO(unsigned long address, struct pt_regs *regs, \
+ unsigned long error_code), \
+ TP_ARGS(address, regs, error_code), \
+ trace_irq_vector_regfunc, \
+ trace_irq_vector_unregfunc); \
+DEFINE_EVENT_FN(x86_exceptions, name##_exit, \
TP_PROTO(unsigned long address, struct pt_regs *regs, \
unsigned long error_code), \
TP_ARGS(address, regs, error_code), \
trace_irq_vector_regfunc, \
trace_irq_vector_unregfunc);
+/*
+ * page_fault_user - called when entering/exiting a page fault in
+ * user-space
+ */
DEFINE_PAGE_FAULT_EVENT(page_fault_user);
+
+/*
+ * page_fault_kernel - called when entering/exiting a page fault in
+ * kernel-space
+ */
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9f72ca3..2ed2716 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1465,12 +1465,22 @@ NOKPROBE_SYMBOL(do_page_fault);
#ifdef CONFIG_TRACING
static nokprobe_inline void
trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+ unsigned long error_code)
{
if (user_mode(regs))
- trace_page_fault_user(address, regs, error_code);
+ trace_page_fault_user_entry(address, regs, error_code);
else
- trace_page_fault_kernel(address, regs, error_code);
+ trace_page_fault_kernel_entry(address, regs, error_code);
+}
+
+static nokprobe_inline void
+trace_page_fault_exits(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_page_fault_user_exit(address, regs, error_code);
+ else
+ trace_page_fault_kernel_exit(address, regs, error_code);
}
dotraplinkage void notrace
@@ -1488,6 +1498,7 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
prev_state = exception_enter();
trace_page_fault_entries(address, regs, error_code);
__do_page_fault(regs, error_code, address);
+ trace_page_fault_exits(address, regs, error_code);
exception_exit(prev_state);
}
NOKPROBE_SYMBOL(trace_do_page_fault);
--
2.10.2
Alexis Berlemont wrote:
> Hi,
>
> Here is another attempt to make the perf-trace subcommand print the
> page fault durations.
>
> This solution is based on static tracepoints and it was necessary to
> add 2 more tracepoints. I understood that adding tracepoints is a
> questionable solution; I can try something else if someone has another
> idea.
>
Could you indicate me a way to improve these patches so as to move
forward ?
Regards,
Alexis.
> Alexis.
>
> Alexis Berlemont (2):
> perf, x86-mm: declare page-faults tracepoints like irq-vectors ones
> perf: add page fault duration measures in perf trace
>
> arch/x86/include/asm/trace/exceptions.h | 17 ++-
> arch/x86/mm/fault.c | 17 ++-
> tools/perf/Documentation/perf-trace.txt | 4 +-
> tools/perf/builtin-trace.c | 231 ++++++++++++++++++++++++++++----
> 4 files changed, 238 insertions(+), 31 deletions(-)
>
> --
> 2.10.2
>
Em Tue, Dec 06, 2016 at 12:40:06AM +0100, Alexis Berlemont escreveu:
> Alexis Berlemont wrote:
> > Hi,
> >
> > Here is another attempt to make the perf-trace subcommand print the
> > page fault durations.
> >
> > This solution is based on static tracepoints and it was necessary to
> > add 2 more tracepoints. I understood that adding tracepoints is a
> > questionable solution; I can try something else if someone has another
> > idea.
> >
>
> Could you indicate me a way to improve these patches so as to move
> forward ?
I have to test this, and review PeterZ latest comment about it, to see
if all is well with him by now, Peter?
- Arnaldo
> Regards,
>
> Alexis.
>
> > Alexis.
> >
> > Alexis Berlemont (2):
> > perf, x86-mm: declare page-faults tracepoints like irq-vectors ones
> > perf: add page fault duration measures in perf trace
> >
> > arch/x86/include/asm/trace/exceptions.h | 17 ++-
> > arch/x86/mm/fault.c | 17 ++-
> > tools/perf/Documentation/perf-trace.txt | 4 +-
> > tools/perf/builtin-trace.c | 231 ++++++++++++++++++++++++++++----
> > 4 files changed, 238 insertions(+), 31 deletions(-)
> >
> > --
> > 2.10.2
> >