2012-08-07 12:56:15

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

This functionality helps to analyze where a task sleeps or waits on locks.
This feature can help to investigate scalability problems.

The main idea is that we can combine sched_switch and sched_stat_sleep events.
sched_switch contains a callchain, when a task starts sleeping.
sched_stat_sleep contains a time period for which a task slept.

This series teaches "perf inject" to combine these events.

All kernel-related patches were committed in 3.6-rc1.

Here is an example of a report:
$ cat ~/foo.c
....
for (i = 0; i < 10; i++) {
ts1.tv_sec = 0;
ts1.tv_nsec = 10000000;
nanosleep(&ts1, NULL);

tv1.tv_sec = 0;
tv1.tv_usec = 40000;
select(0, NULL, NULL, NULL,&tv1);
}
...

$ ./perf record -e sched:sched_stat_sleep -e sched:sched_switch \
-e sched:sched_process_exit -gP -o ~/perf.data.raw ~/foo
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.015 MB /root/perf.data.raw (~661 samples) ]
$ ./perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
$ ./perf report -i ~/perf.data
# Samples: 40 of event 'sched:sched_switch'
# Event count (approx.): 1005527702
#
# Overhead Command Shared Object Symbol
# ........ ....... ................. ..............
#
100.00% foo [kernel.kallsyms] [k] __schedule
|
--- __schedule
schedule
|
|--79.81%-- schedule_hrtimeout_range_clock
| schedule_hrtimeout_range
| poll_schedule_timeout
| do_select
| core_sys_select
| sys_select
| system_call_fastpath
| __select
| __libc_start_main
|
--20.19%-- do_nanosleep
hrtimer_nanosleep
sys_nanosleep
system_call_fastpath
__GI___libc_nanosleep
__libc_start_main

Andrew Vagin (3):
perf: teach "perf inject" to work with files
perf: teach perf inject to merge sched_stat_* and sched_switch events
perf: mark a dso if it's used

tools/perf/builtin-inject.c | 139 ++++++++++++++++++++++++++++++++++++++++---
tools/perf/util/build-id.c | 2 +-
tools/perf/util/build-id.h | 5 ++
3 files changed, 137 insertions(+), 9 deletions(-)


2012-08-07 12:56:13

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH 1/4] perf: teach "perf inject" to work with files (v2)

Before this patch "perf inject" could only handle data from a pipe.

I want to use "perf inject" for reworking events. Look at my following patch.

v2: add information about new options in tools/perf/Documentation/

Signed-off-by: Andrew Vagin <[email protected]>
---
tools/perf/Documentation/perf-inject.txt | 6 +++++
tools/perf/builtin-inject.c | 33 ++++++++++++++++++++++++++++-
2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index 025630d..6be2101 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -29,6 +29,12 @@ OPTIONS
-v::
--verbose::
Be more verbose.
+-i::
+--input=::
+ Input file name. (default: stdin)
+-o::
+--output=::
+ Output file name. (default: stdout)

SEE ALSO
--------
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 3beab48..4c9cdbb 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -14,7 +14,12 @@

#include "util/parse-options.h"

-static char const *input_name = "-";
+static const char *input_name = "-";
+static const char *output_name = "-";
+static int pipe_output;
+static int output;
+static u64 bytes_written;
+
static bool inject_build_ids;

static int perf_event__repipe_synth(struct perf_tool *tool __used,
@@ -27,12 +32,14 @@ static int perf_event__repipe_synth(struct perf_tool *tool __used,
size = event->header.size;

while (size) {
- int ret = write(STDOUT_FILENO, buf, size);
+ int ret = write(output, buf, size);
if (ret < 0)
return -errno;

size -= ret;
buf += ret;
+
+ bytes_written += ret;
}

return 0;
@@ -244,8 +251,14 @@ static int __cmd_inject(void)
if (session == NULL)
return -ENOMEM;

+ if (!pipe_output)
+ lseek(output, session->header.data_offset, SEEK_SET);
ret = perf_session__process_events(session, &perf_inject);

+ if (!pipe_output) {
+ session->header.data_size = bytes_written;
+ perf_session__write_header(session, session->evlist, output, true);
+ }
perf_session__delete(session);

return ret;
@@ -259,6 +272,10 @@ static const char * const report_usage[] = {
static const struct option options[] = {
OPT_BOOLEAN('b', "build-ids", &inject_build_ids,
"Inject build-ids into the output stream"),
+ OPT_STRING('i', "input", &input_name, "file",
+ "input file name"),
+ OPT_STRING('o', "output", &output_name, "file",
+ "output file name"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show build ids, etc)"),
OPT_END()
@@ -274,6 +291,18 @@ int cmd_inject(int argc, const char **argv, const char *prefix __used)
if (argc)
usage_with_options(report_usage, options);

+ if (!strcmp(output_name, "-")) {
+ pipe_output = 1;
+ output = STDOUT_FILENO;
+ } else {
+ output = open(output_name, O_CREAT | O_WRONLY | O_TRUNC,
+ S_IRUSR | S_IWUSR);
+ if (output < 0) {
+ perror("failed to create output file");
+ exit(-1);
+ }
+ }
+
if (symbol__init() < 0)
return -1;

--
1.7.1

2012-08-07 12:56:12

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH 4/4] perf: mark a dso if it's used

Otherwise they will not be written to the output file.

Signed-off-by: Andrew Vagin <[email protected]>
---
tools/perf/builtin-inject.c | 10 ++++++++--
tools/perf/util/build-id.c | 2 +-
tools/perf/util/build-id.h | 5 +++++
3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 9b3393a..be2edae 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -15,6 +15,7 @@

#include "util/parse-options.h"
#include "util/trace-event.h"
+#include "util/build-id.h"

static const char *input_name = "-";
static const char *output_name = "-";
@@ -282,10 +283,12 @@ static int perf_event__sched_stat(struct perf_tool *tool,
sample_sw.time = sample->time;
perf_evsel__synthesize_sample(evsel, event_sw, &sample_sw, false);

+ build_id__mark_dso_hit(tool, event_sw, &sample_sw, evsel, machine);
perf_event__repipe(tool, event_sw, &sample_sw, machine);
return 0;
}

+ build_id__mark_dso_hit(tool, event, sample, evsel, machine);
perf_event__repipe(tool, event, sample, machine);

return 0;
@@ -320,11 +323,14 @@ static int __cmd_inject(void)

signal(SIGINT, sig_handler);

- if (inject_build_ids) {
- perf_inject.sample = perf_event__inject_buildid;
+ if (inject_build_ids | inject_sched_stat) {
perf_inject.mmap = perf_event__repipe_mmap;
perf_inject.fork = perf_event__repipe_task;
perf_inject.tracing_data = perf_event__repipe_tracing_data;
+ }
+
+ if (inject_build_ids) {
+ perf_inject.sample = perf_event__inject_buildid;
} else if (inject_sched_stat) {
perf_inject.sample = perf_event__sched_stat;
perf_inject.ordered_samples = true;
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index fd9a594..9ce0e11 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -16,7 +16,7 @@
#include "session.h"
#include "tool.h"

-static int build_id__mark_dso_hit(struct perf_tool *tool __used,
+int build_id__mark_dso_hit(struct perf_tool *tool __used,
union perf_event *event,
struct perf_sample *sample __used,
struct perf_evsel *evsel __used,
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h
index a993ba8..032a968 100644
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -7,4 +7,9 @@ extern struct perf_tool build_id__mark_dso_hit_ops;

char *dso__build_id_filename(struct dso *self, char *bf, size_t size);

+int build_id__mark_dso_hit(struct perf_tool *tool __used,
+ union perf_event *event,
+ struct perf_sample *sample __used,
+ struct perf_evsel *evsel __used,
+ struct machine *machine);
#endif
--
1.7.1

2012-08-07 12:56:11

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH 3/4] perf: teach perf inject to merge sched_stat_* and sched_switch events (v2)

You may want to know where and how long a task is sleeping. A callchain
may be found in sched_switch and a time slice in stat_iowait, so I add
a handler in perf inject for merging these events.

My code saves the sched_switch event for each process and, when it meets
stat_iowait, it reports the saved sched_switch event, because that event
contains the correct callchain. In other words, it replaces all
stat_iowait events with the proper sched_switch events.

v2: - remove the global variable "session"
- handle errors from malloc()

Signed-off-by: Andrew Vagin <[email protected]>
---
tools/perf/Documentation/perf-inject.txt | 4 ++
tools/perf/builtin-inject.c | 86 ++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+), 0 deletions(-)

diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index 6be2101..c04e0c6 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -35,6 +35,10 @@ OPTIONS
-o::
--output=::
Output file name. (default: stdout)
+-s::
+--sched-stat::
+ Merge sched_stat and sched_switch for getting events where and how long
+ tasks slept.

SEE ALSO
--------
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 4c9cdbb..9b3393a 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -8,11 +8,13 @@
#include "builtin.h"

#include "perf.h"
+#include "util/evsel.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/debug.h"

#include "util/parse-options.h"
+#include "util/trace-event.h"

static const char *input_name = "-";
static const char *output_name = "-";
@@ -21,6 +23,7 @@ static int output;
static u64 bytes_written;

static bool inject_build_ids;
+static bool inject_sched_stat;

static int perf_event__repipe_synth(struct perf_tool *tool __used,
union perf_event *event,
@@ -210,6 +213,83 @@ repipe:
return 0;
}

+struct event_entry {
+ struct list_head node;
+ u32 pid;
+ union perf_event event[0];
+};
+
+static LIST_HEAD(samples);
+
+static int perf_event__sched_stat(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ const char *evname = NULL;
+ uint32_t size;
+ struct event_entry *ent;
+ union perf_event *event_sw = NULL;
+ struct perf_sample sample_sw;
+ int sched_process_exit;
+
+ size = event->header.size;
+
+ evname = evsel->tp_format->name;
+
+ sched_process_exit = !strcmp(evname, "sched_process_exit");
+
+ if (!strcmp(evname, "sched_switch") || sched_process_exit) {
+ list_for_each_entry(ent, &samples, node)
+ if (sample->pid == ent->pid)
+ break;
+
+ if (&ent->node != &samples) {
+ list_del(&ent->node);
+ free(ent);
+ }
+
+ if (sched_process_exit)
+ return 0;
+
+ ent = malloc(size + sizeof(struct event_entry));
+ if (ent == NULL)
+ die("malloc");
+ ent->pid = sample->pid;
+ memcpy(&ent->event, event, size);
+ list_add(&ent->node, &samples);
+ return 0;
+
+ } else if (!strncmp(evname, "sched_stat_", 11)) {
+ u32 pid;
+
+ pid = raw_field_value(evsel->tp_format,
+ "pid", sample->raw_data);
+
+ list_for_each_entry(ent, &samples, node) {
+ if (pid == ent->pid)
+ break;
+ }
+
+ if (&ent->node == &samples)
+ return 0;
+
+ event_sw = &ent->event[0];
+ perf_evsel__parse_sample(evsel, event_sw, &sample_sw, false);
+
+ sample_sw.period = sample->period;
+ sample_sw.time = sample->time;
+ perf_evsel__synthesize_sample(evsel, event_sw, &sample_sw, false);
+
+ perf_event__repipe(tool, event_sw, &sample_sw, machine);
+ return 0;
+ }
+
+ perf_event__repipe(tool, event, sample, machine);
+
+ return 0;
+}
struct perf_tool perf_inject = {
.sample = perf_event__repipe_sample,
.mmap = perf_event__repipe,
@@ -245,6 +325,9 @@ static int __cmd_inject(void)
perf_inject.mmap = perf_event__repipe_mmap;
perf_inject.fork = perf_event__repipe_task;
perf_inject.tracing_data = perf_event__repipe_tracing_data;
+ } else if (inject_sched_stat) {
+ perf_inject.sample = perf_event__sched_stat;
+ perf_inject.ordered_samples = true;
}

session = perf_session__new(input_name, O_RDONLY, false, true, &perf_inject);
@@ -272,6 +355,9 @@ static const char * const report_usage[] = {
static const struct option options[] = {
OPT_BOOLEAN('b', "build-ids", &inject_build_ids,
"Inject build-ids into the output stream"),
+ OPT_BOOLEAN('s', "sched-stat", &inject_sched_stat,
+ "Merge sched-stat and sched-switch for getting events "
+ "where and how long tasks slept"),
OPT_STRING('i', "input", &input_name, "file",
"input file name"),
OPT_STRING('o', "output", &output_name, "file",
--
1.7.1

2012-08-07 12:56:59

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH 2/4] perf: synthesize_sample gets evsel instead of session

The same thing was done for perf_evsel__parse_sample

Signed-off-by: Andrew Vagin <[email protected]>
---
tools/perf/util/event.h | 4 ----
tools/perf/util/evsel.c | 4 +++-
tools/perf/util/evsel.h | 5 +++++
3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index d84870b..1ab3141 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -197,10 +197,6 @@ int perf_event__preprocess_sample(const union perf_event *self,

const char *perf_event__name(unsigned int id);

-int perf_event__synthesize_sample(union perf_event *event, u64 type,
- const struct perf_sample *sample,
- bool swapped);
-
size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_task(union perf_event *event, FILE *fp);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2eaae14..7547203 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -872,10 +872,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
return 0;
}

-int perf_event__synthesize_sample(union perf_event *event, u64 type,
+int perf_evsel__synthesize_sample(struct perf_evsel *evsel,
+ union perf_event *event,
const struct perf_sample *sample,
bool swapped)
{
+ u64 type = evsel->attr.sample_type;
u64 *array;

/*
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index a56c457..62cdd5d 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -183,4 +183,9 @@ void hists__init(struct hists *hists);

int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
struct perf_sample *sample, bool swapped);
+
+int perf_evsel__synthesize_sample(struct perf_evsel *evsel,
+ union perf_event *event,
+ const struct perf_sample *sample,
+ bool swapped);
#endif /* __PERF_EVSEL_H */
--
1.7.1

2012-08-08 00:38:29

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

Hi, Andrew

On Tue, 7 Aug 2012 16:56:01 +0400, Andrew Vagin wrote:
> This functionality helps to analize where a task sleeps or waits locks.
> This feature can help to investigate a scalability problems.
>
Looks like a nice feature.


> The main idea is that we can combine sched_switch and sched_stat_sleep events.
> sched_switch contains a callchain, when a task starts sleeping.
> sched_stat_sleep contains a time period for which a task slept.
>
> This series teaches "perf inject" to combine this events.
>
> All kernel related patches were committed committed in 3.6-rc1.
>
> Here is an example of a report:
> $ cat ~/foo.c
> ....
> for (i = 0; i < 10; i++) {
> ts1.tv_sec = 0;
> ts1.tv_nsec = 10000000;
> nanosleep(&ts1, NULL);
>
> tv1.tv_sec = 0;
> tv1.tv_usec = 40000;
> select(0, NULL, NULL, NULL,&tv1);
> }
> ...
>
> $ ./perf record -e sched:sched_stat_sleep -e sched:sched_switch \
> -e sched:sched_process_exit -gP -o ~/perf.data.raw ~/foo
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.015 MB /root/perf.data.raw (~661 samples) ]
> $ ./perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
> $ ./perf report -i ~/perf.data

The usage like this is too specific and hard to use IMHO. How about
putting it somehow into perf sched or new command?

/me don't have an idea though. :-)


> # Samples: 40 of event 'sched:sched_switch'
> # Event count (approx.): 1005527702
> #
> # Overhead Command Shared Object Symbol
> # ........ ....... ................. ..............
> #
> 100.00% foo [kernel.kallsyms] [k] __schedule
> |
> --- __schedule
> schedule
> |
> |--79.81%-- schedule_hrtimeout_range_clock
> | schedule_hrtimeout_range
> | poll_schedule_timeout
> | do_select
> | core_sys_select
> | sys_select
> | system_call_fastpath
> | __select
> | __libc_start_main
> |
> --20.19%-- do_nanosleep
> hrtimer_nanosleep
> sys_nanosleep
> system_call_fastpath
> __GI___libc_nanosleep
> __libc_start_main
>
> Andrew Vagin (3):
> perf: teach "perf inject" to work with files
> perf: teach perf inject to merge sched_stat_* and sched_switch events
> perf: mark a dso if it's used
>

Seems to be outdated.

Thanks,
Namhyung


> tools/perf/builtin-inject.c | 139 ++++++++++++++++++++++++++++++++++++++++---
> tools/perf/util/build-id.c | 2 +-
> tools/perf/util/build-id.h | 5 ++
> 3 files changed, 137 insertions(+), 9 deletions(-)

2012-08-08 05:02:21

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

2012/8/8 Namhyung Kim <[email protected]>:
>>
>> $ ./perf record -e sched:sched_stat_sleep -e sched:sched_switch \
>> -e sched:sched_process_exit -gP -o ~/perf.data.raw ~/foo

Actually this command line is not complete, because sched:sched_switch
should be filtered by state.

>> [ perf record: Woken up 1 times to write data ]
>> [ perf record: Captured and wrote 0.015 MB /root/perf.data.raw (~661 samples) ]
>> $ ./perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
>> $ ./perf report -i ~/perf.data
>
> The usage like this is too specific and hard to use IMHO. How about
> putting it somehow into perf sched or new command?
>
> /me don't have an idea though. :-)
>

I'm going to add a script, so the usage will look like this:
$ perf script record sched-stat -e sched:sched_stat_sleep <command>
This command will collect sched_stat_* and proper sched_switch events
$ perf script report sched-stat

2012-08-08 05:36:52

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

On Wed, 8 Aug 2012 09:02:18 +0400, Andrey Wagin wrote:
> 2012/8/8 Namhyung Kim <[email protected]>:
>>>
>>> $ ./perf record -e sched:sched_stat_sleep -e sched:sched_switch \
>>> -e sched:sched_process_exit -gP -o ~/perf.data.raw ~/foo
>
> Actually this string is not completed, because sched:sched_switch
> should be filtered by state.
>
>>> [ perf record: Woken up 1 times to write data ]
>>> [ perf record: Captured and wrote 0.015 MB /root/perf.data.raw (~661 samples) ]
>>> $ ./perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
>>> $ ./perf report -i ~/perf.data
>>
>> The usage like this is too specific and hard to use IMHO. How about
>> putting it somehow into perf sched or new command?
>>
>> /me don't have an idea though. :-)
>>
>
> I'm going to add a script, so the usage will look like this:
> $ perf script record sched-stat -e sched:sched_stat_sleep <command>
> This command will collect sched_stat_* and proper sched_switch events

??? That means '-e sched:sched_stat_sleep' part can be removed from
command line, no?

Thanks,
Namhyung

2012-08-08 07:24:36

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

2012/8/8 Namhyung Kim <[email protected]>:
> On Wed, 8 Aug 2012 09:02:18 +0400, Andrey Wagin wrote:
>> 2012/8/8 Namhyung Kim <[email protected]>:
>>>>
>>>> $ ./perf record -e sched:sched_stat_sleep -e sched:sched_switch \
>>>> -e sched:sched_process_exit -gP -o ~/perf.data.raw ~/foo
>>
>> Actually this string is not completed, because sched:sched_switch
>> should be filtered by state.
>>
>>>> [ perf record: Woken up 1 times to write data ]
>>>> [ perf record: Captured and wrote 0.015 MB /root/perf.data.raw (~661 samples) ]
>>>> $ ./perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
>>>> $ ./perf report -i ~/perf.data
>>>
>>> The usage like this is too specific and hard to use IMHO. How about
>>> putting it somehow into perf sched or new command?
>>>
>>> /me don't have an idea though. :-)
>>>
>>
>> I'm going to add a script, so the usage will look like this:
>> $ perf script record sched-stat -e sched:sched_stat_sleep <command>
>> This command will collect sched_stat_* and proper sched_switch events
>
> ??? That means '-e sched:sched_stat_sleep' part can be removed from
> command line, no?

No. My method works for all kinds of sched_stat_* events, so you need
to specify the event type which should be traced.

2012-08-09 00:43:55

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

On Wed, 8 Aug 2012 11:24:34 +0400, Andrey Wagin wrote:
> 2012/8/8 Namhyung Kim <[email protected]>:
>> On Wed, 8 Aug 2012 09:02:18 +0400, Andrey Wagin wrote:
>>> 2012/8/8 Namhyung Kim <[email protected]>:
>>>>>
>>>>> $ ./perf record -e sched:sched_stat_sleep -e sched:sched_switch \
>>>>> -e sched:sched_process_exit -gP -o ~/perf.data.raw ~/foo
>>>
>>> Actually this string is not completed, because sched:sched_switch
>>> should be filtered by state.
>>>
>>>>> [ perf record: Woken up 1 times to write data ]
>>>>> [ perf record: Captured and wrote 0.015 MB /root/perf.data.raw (~661 samples) ]
>>>>> $ ./perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
>>>>> $ ./perf report -i ~/perf.data
>>>>
>>>> The usage like this is too specific and hard to use IMHO. How about
>>>> putting it somehow into perf sched or new command?
>>>>
>>>> /me don't have an idea though. :-)
>>>>
>>>
>>> I'm going to add a script, so the usage will look like this:
>>> $ perf script record sched-stat -e sched:sched_stat_sleep <command>
>>> This command will collect sched_stat_* and proper sched_switch events
>>
>> ??? That means '-e sched:sched_stat_sleep' part can be removed from
>> command line, no?
>
> No. My method works for all kind of sched_stat_* events, so you need
> to specify an event type which should be traced.

Ok, so can it be like 'perf script record sched-stat -t sleep <command>'?

Thanks,
Namhyung

2012-08-09 12:56:59

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

2012/8/9 Namhyung Kim <[email protected]>:
>>>>> The usage like this is too specific and hard to use IMHO. How about
>>>>> putting it somehow into perf sched or new command?
>>>>>
>>>>> /me don't have an idea though. :-)
>>>>>
>>>>
>>>> I'm going to add a script, so the usage will look like this:
>>>> $ perf script record sched-stat -e sched:sched_stat_sleep <command>
>>>> This command will collect sched_stat_* and proper sched_switch events
>>>
>>> ??? That means '-e sched:sched_stat_sleep' part can be removed from
>>> command line, no?
>>
>> No. My method works for all kind of sched_stat_* events, so you need
>> to specify an event type which should be traced.
>
> Ok, so can it be like 'perf script record sched-stat -t sleep <command>'?

Yes, it can. Thanks for your feedback. I'm going to write the script
once this series is committed.

2012-08-24 13:32:27

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

Hello Arnaldo,

What do you think about this series?

It has been fixed according to your comments on the previous
patches. Are you going to take it?

2012/8/7 Andrew Vagin <[email protected]>:
> This functionality helps to analize where a task sleeps or waits locks.
> This feature can help to investigate a scalability problems.
>
> The main idea is that we can combine sched_switch and sched_stat_sleep events.
> sched_switch contains a callchain, when a task starts sleeping.
> sched_stat_sleep contains a time period for which a task slept.
>
> This series teaches "perf inject" to combine this events.
>
> All kernel related patches were committed committed in 3.6-rc1.
>
> Here is an example of a report:
> $ cat ~/foo.c
> ....
> for (i = 0; i < 10; i++) {
> ts1.tv_sec = 0;
> ts1.tv_nsec = 10000000;
> nanosleep(&ts1, NULL);
>
> tv1.tv_sec = 0;
> tv1.tv_usec = 40000;
> select(0, NULL, NULL, NULL,&tv1);
> }
> ...
>
> $ ./perf record -e sched:sched_stat_sleep -e sched:sched_switch \
> -e sched:sched_process_exit -gP -o ~/perf.data.raw ~/foo
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.015 MB /root/perf.data.raw (~661 samples) ]
> $ ./perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
> $ ./perf report -i ~/perf.data
> # Samples: 40 of event 'sched:sched_switch'
> # Event count (approx.): 1005527702
> #
> # Overhead Command Shared Object Symbol
> # ........ ....... ................. ..............
> #
> 100.00% foo [kernel.kallsyms] [k] __schedule
> |
> --- __schedule
> schedule
> |
> |--79.81%-- schedule_hrtimeout_range_clock
> | schedule_hrtimeout_range
> | poll_schedule_timeout
> | do_select
> | core_sys_select
> | sys_select
> | system_call_fastpath
> | __select
> | __libc_start_main
> |
> --20.19%-- do_nanosleep
> hrtimer_nanosleep
> sys_nanosleep
> system_call_fastpath
> __GI___libc_nanosleep
> __libc_start_main
>
> Andrew Vagin (3):
> perf: teach "perf inject" to work with files
> perf: teach perf inject to merge sched_stat_* and sched_switch events
> perf: mark a dso if it's used
>
> tools/perf/builtin-inject.c | 139 ++++++++++++++++++++++++++++++++++++++++---
> tools/perf/util/build-id.c | 2 +-
> tools/perf/util/build-id.h | 5 ++
> 3 files changed, 137 insertions(+), 9 deletions(-)
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2012-08-24 17:59:58

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf: Teach perf tool to profile sleep times (v2)

Em Fri, Aug 24, 2012 at 05:32:21PM +0400, Andrey Wagin escreveu:
> Hello Arnaldo,
>
> What do you think about this series?
>
> It has been fixed according with your comments to the previous
> patches. Are you going to take it?

I've put them in a perf/sleep branch in my tree, waiting on Frédéric
that is reviewing it.

- Arnaldo

> 2012/8/7 Andrew Vagin <[email protected]>:
> > This functionality helps to analize where a task sleeps or waits locks.
> > This feature can help to investigate a scalability problems.
> >
> > The main idea is that we can combine sched_switch and sched_stat_sleep events.
> > sched_switch contains a callchain, when a task starts sleeping.
> > sched_stat_sleep contains a time period for which a task slept.
> >
> > This series teaches "perf inject" to combine this events.
> >
> > All kernel related patches were committed committed in 3.6-rc1.
> >
> > Here is an example of a report:
> > $ cat ~/foo.c
> > ....
> > for (i = 0; i < 10; i++) {
> > ts1.tv_sec = 0;
> > ts1.tv_nsec = 10000000;
> > nanosleep(&ts1, NULL);
> >
> > tv1.tv_sec = 0;
> > tv1.tv_usec = 40000;
> > select(0, NULL, NULL, NULL,&tv1);
> > }
> > ...
> >
> > $ ./perf record -e sched:sched_stat_sleep -e sched:sched_switch \
> > -e sched:sched_process_exit -gP -o ~/perf.data.raw ~/foo
> > [ perf record: Woken up 1 times to write data ]
> > [ perf record: Captured and wrote 0.015 MB /root/perf.data.raw (~661 samples) ]
> > $ ./perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
> > $ ./perf report -i ~/perf.data
> > # Samples: 40 of event 'sched:sched_switch'
> > # Event count (approx.): 1005527702
> > #
> > # Overhead Command Shared Object Symbol
> > # ........ ....... ................. ..............
> > #
> > 100.00% foo [kernel.kallsyms] [k] __schedule
> > |
> > --- __schedule
> > schedule
> > |
> > |--79.81%-- schedule_hrtimeout_range_clock
> > | schedule_hrtimeout_range
> > | poll_schedule_timeout
> > | do_select
> > | core_sys_select
> > | sys_select
> > | system_call_fastpath
> > | __select
> > | __libc_start_main
> > |
> > --20.19%-- do_nanosleep
> > hrtimer_nanosleep
> > sys_nanosleep
> > system_call_fastpath
> > __GI___libc_nanosleep
> > __libc_start_main
> >
> > Andrew Vagin (3):
> > perf: teach "perf inject" to work with files
> > perf: teach perf inject to merge sched_stat_* and sched_switch events
> > perf: mark a dso if it's used
> >
> > tools/perf/builtin-inject.c | 139 ++++++++++++++++++++++++++++++++++++++++---
> > tools/perf/util/build-id.c | 2 +-
> > tools/perf/util/build-id.h | 5 ++
> > 3 files changed, 137 insertions(+), 9 deletions(-)
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at http://www.tux.org/lkml/

2012-08-25 11:48:08

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH 3/4] perf: teach perf inject to merge sched_stat_* and sched_switch events (v2)

On Tue, Aug 07, 2012 at 04:56:04PM +0400, Andrew Vagin wrote:
> +struct event_entry {
> + struct list_head node;
> + u32 pid;
> + union perf_event event[0];
> +};
> +
> +static LIST_HEAD(samples);
> +
> +static int perf_event__sched_stat(struct perf_tool *tool,
> + union perf_event *event,
> + struct perf_sample *sample,
> + struct perf_evsel *evsel,
> + struct machine *machine)
> +{
> + const char *evname = NULL;
> + uint32_t size;
> + struct event_entry *ent;
> + union perf_event *event_sw = NULL;
> + struct perf_sample sample_sw;
> + int sched_process_exit;
> +
> + size = event->header.size;
> +
> + evname = evsel->tp_format->name;
> +
> + sched_process_exit = !strcmp(evname, "sched_process_exit");
> +
> + if (!strcmp(evname, "sched_switch") || sched_process_exit) {
> + list_for_each_entry(ent, &samples, node)
> + if (sample->pid == ent->pid)

I suspect what you're rather interested in is the sample tid.

> + break;
> +
> + if (&ent->node != &samples) {
> + list_del(&ent->node);
> + free(ent);
> + }
> +
> + if (sched_process_exit)
> + return 0;
> +
> + ent = malloc(size + sizeof(struct event_entry));
> + if (ent == NULL)
> + die("malloc");
> + ent->pid = sample->pid;

Ditto.

> + memcpy(&ent->event, event, size);
> + list_add(&ent->node, &samples);
> + return 0;
> +
> + } else if (!strncmp(evname, "sched_stat_", 11)) {
> + u32 pid;
> +
> + pid = raw_field_value(evsel->tp_format,
> + "pid", sample->raw_data);

There you parse the pid from the trace content. That's fine because
it's actually the tid that is saved on the trace event. But this one
is not pid-namespace safe (it saves current->pid directly) while
sample->tid is pid-namespace safe (it uses task_pid_nr_ns).

So I suggest you use sample->tid instead, plus that's going to be
consistent with what you did above.

Thanks.

2012-08-27 07:22:27

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH 3/4] perf: teach perf inject to merge sched_stat_* and sched_switch events (v2)

2012/8/25 Frederic Weisbecker <[email protected]>:
>> + if (!strcmp(evname, "sched_switch") || sched_process_exit) {
>> + list_for_each_entry(ent, &samples, node)
>> + if (sample->pid == ent->pid)
>
> I suspect what you're rather interested in is the sample tid.

Yes, you are right.

>
>> +
>> + } else if (!strncmp(evname, "sched_stat_", 11)) {
>> + u32 pid;
>> +
>> + pid = raw_field_value(evsel->tp_format,
>> + "pid", sample->raw_data);
>
> There you parse the pid from the trace content. That's fine because
> it's actually the tid that is saved on the trace event. But this one
> is not pid-namespace safe (it saves current->pid directly) while
> sample->tid is pid-namespace safe (it uses task_pid_nr_ns).
>
> So I suggest you to use sample->tid instead, plus that's going to be
> consistant with what you did above.

There is a problem here, because the pid from the trace content and
sample->tid are not the same.
The kernel wakes up task1 and does so from the context of another task, task2.
In this case the pid from the trace content is the pid of task1 and
sample->tid is the pid of task2.

DECLARE_EVENT_CLASS(sched_stat_template,
TP_PROTO(struct task_struct *tsk, u64 delay),
...
__entry->pid = tsk->pid;
...


Here is the patch that allows us to get "foreign" events:
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e6dab5ffab59e910ec0e3355f4a6f29f7a7be474

2012-08-27 20:51:27

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 3/4] perf: teach perf inject to merge sched_stat_* and sched_switch events (v2)

Andrew Vagin <[email protected]> writes:
>
> My code saves sched_switch event for each process and when it meets
> stat_iowait, it reports the sched_switch event, because this event
> contains a correct callchain. By another words it replaces all
> stat_iowait events on proper sched_switch events.

Is there any way to generalize this to arbitrary events? I found myself
writing scripts to compute lengths between various events. But I
presume this could be done in a more general way. The advantage
of perf-inject over my script is that the output can be viewed
in the normal browser.

-Andi

--
[email protected] -- Speaking for myself only

2012-08-27 21:56:29

by David Ahern

[permalink] [raw]
Subject: Re: [PATCH 3/4] perf: teach perf inject to merge sched_stat_* and sched_switch events (v2)

On 8/27/12 2:51 PM, Andi Kleen wrote:
> Andrew Vagin <[email protected]> writes:
>>
>> My code saves sched_switch event for each process and when it meets
>> stat_iowait, it reports the sched_switch event, because this event
>> contains a correct callchain. By another words it replaces all
>> stat_iowait events on proper sched_switch events.
>
> Is there any way to generalize this to arbitary events? I found myself
> writing scripts to compute lengths between various events. But I

You mean delta-time between events? I have toyed around with patches to
perf-script for this -- dt between same event on a cpu, dt between
consecutive events on a cpu, dt between events for a thread, etc.

David

2012-08-27 22:14:29

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 3/4] perf: teach perf inject to merge sched_stat_* and sched_switch events (v2)

On Mon, Aug 27, 2012 at 03:56:23PM -0600, David Ahern wrote:
> On 8/27/12 2:51 PM, Andi Kleen wrote:
> >Andrew Vagin <[email protected]> writes:
> >>
> >>My code saves sched_switch event for each process and when it meets
> >>stat_iowait, it reports the sched_switch event, because this event
> >>contains a correct callchain. By another words it replaces all
> >>stat_iowait events on proper sched_switch events.
> >
> >Is there any way to generalize this to arbitary events? I found myself
> >writing scripts to compute lengths between various events. But I
>
> You mean delta-time between events? I have toyed around with patches to
> perf-script for this -- dt between same event on a cpu, dt between
> consecutive events on a cpu, dt between events for a thread, etc.

I did the same. But I then realized perf script is the wrong approach.
perf script is a dead end as far as perf is concerned.
You want a filter that generates another perf.data that you then can
view in the browser. So perf inject is the better model.

-Andi

2012-08-29 08:28:22

by Andrew Vagin

[permalink] [raw]
Subject: Re: [PATCH 3/4] perf: teach perf inject to merge sched_stat_* and sched_switch events (v2)

On Tue, Aug 28, 2012 at 12:51:02AM +0400, Andi Kleen wrote:
> Andrew Vagin <[email protected]> writes:
> >
> > My code saves sched_switch event for each process and when it meets
> > stat_iowait, it reports the sched_switch event, because this event
> > contains a correct callchain. By another words it replaces all
> > stat_iowait events on proper sched_switch events.
>
> Is there any way to generalize this to arbitary events? I found myself
> writing scripts to compute lengths between various events. But I
> presume this could be done in a more general way. The advantage
> of perf-inject over my script is that the output can be viewed
> in the normal browser.

I don't know what you mean by generalizing this to arbitrary events.
I think a generalization will become obvious once we have a few
examples.

Currently we can modify events in a simple way.
What would I like to have? A way to create a new event type and new
events in "perf inject".

>
> -Andi
>
> --
> [email protected] -- Speaking for myself only

2012-10-26 15:09:45

by Andrei Vagin

[permalink] [raw]
Subject: [tip:perf/core] perf inject: Work with files

Commit-ID: e558a5bd8b74aff4690a8c55b08a1dc91ef50d7c
Gitweb: http://git.kernel.org/tip/e558a5bd8b74aff4690a8c55b08a1dc91ef50d7c
Author: Andrew Vagin <[email protected]>
AuthorDate: Tue, 7 Aug 2012 16:56:02 +0400
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Fri, 26 Oct 2012 11:22:24 -0200

perf inject: Work with files

Before this patch, "perf inject" could only handle data from a pipe.

I want to use "perf inject" for reworking events; see my following patch.

v2: add information about new options in tools/perf/Documentation/

Signed-off-by: Andrew Vagin <[email protected]>
Acked-by: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
[ committer note: fixed it up to cope with 5852a44, 5ded57a, 002439e & f62d3f0 ]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/Documentation/perf-inject.txt | 6 ++++
tools/perf/builtin-inject.c | 38 +++++++++++++++++++++++++++--
2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index 025630d..673ef97 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -29,6 +29,12 @@ OPTIONS
-v::
--verbose::
Be more verbose.
+-i::
+--input=::
+ Input file name. (default: stdin)
+-o::
+--output=::
+ Output file name. (default: stdout)

SEE ALSO
--------
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 386a5c0..a706ed5 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -17,24 +17,30 @@
struct perf_inject {
struct perf_tool tool;
bool build_ids;
+ const char *input_name;
+ int pipe_output,
+ output;
+ u64 bytes_written;
};

-static int perf_event__repipe_synth(struct perf_tool *tool __maybe_unused,
+static int perf_event__repipe_synth(struct perf_tool *tool,
union perf_event *event,
struct machine *machine __maybe_unused)
{
+ struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
uint32_t size;
void *buf = event;

size = event->header.size;

while (size) {
- int ret = write(STDOUT_FILENO, buf, size);
+ int ret = write(inject->output, buf, size);
if (ret < 0)
return -errno;

size -= ret;
buf += ret;
+ inject->bytes_written += ret;
}

return 0;
@@ -231,12 +237,20 @@ static int __cmd_inject(struct perf_inject *inject)
inject->tool.tracing_data = perf_event__repipe_tracing_data;
}

- session = perf_session__new("-", O_RDONLY, false, true, &inject->tool);
+ session = perf_session__new(inject->input_name, O_RDONLY, false, true, &inject->tool);
if (session == NULL)
return -ENOMEM;

+ if (!inject->pipe_output)
+ lseek(inject->output, session->header.data_offset, SEEK_SET);
+
ret = perf_session__process_events(session, &inject->tool);

+ if (!inject->pipe_output) {
+ session->header.data_size = inject->bytes_written;
+ perf_session__write_header(session, session->evlist, inject->output, true);
+ }
+
perf_session__delete(session);

return ret;
@@ -260,10 +274,16 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
.tracing_data = perf_event__repipe_tracing_data_synth,
.build_id = perf_event__repipe_op2_synth,
},
+ .input_name = "-",
};
+ const char *output_name = "-";
const struct option options[] = {
OPT_BOOLEAN('b', "build-ids", &inject.build_ids,
"Inject build-ids into the output stream"),
+ OPT_STRING('i', "input", &inject.input_name, "file",
+ "input file name"),
+ OPT_STRING('o', "output", &output_name, "file",
+ "output file name"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show build ids, etc)"),
OPT_END()
@@ -281,6 +301,18 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
if (argc)
usage_with_options(inject_usage, options);

+ if (!strcmp(output_name, "-")) {
+ inject.pipe_output = 1;
+ inject.output = STDOUT_FILENO;
+ } else {
+ inject.output = open(output_name, O_CREAT | O_WRONLY | O_TRUNC,
+ S_IRUSR | S_IWUSR);
+ if (inject.output < 0) {
+ perror("failed to create output file");
+ return -1;
+ }
+ }
+
if (symbol__init() < 0)
return -1;

2012-10-26 15:10:43

by Andrei Vagin

[permalink] [raw]
Subject: [tip:perf/core] perf inject: Merge sched_stat_* and sched_switch events

Commit-ID: 26a031e136f4f8dc82c64df48cca0eb3b5d3eb4f
Gitweb: http://git.kernel.org/tip/26a031e136f4f8dc82c64df48cca0eb3b5d3eb4f
Author: Andrew Vagin <[email protected]>
AuthorDate: Tue, 7 Aug 2012 16:56:04 +0400
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Fri, 26 Oct 2012 11:22:25 -0200

perf inject: Merge sched_stat_* and sched_switch events

You may want to know where and for how long a task is sleeping. The
callchain can be found in sched_switch and the time slice in stat_iowait,
so I add a handler to perf inject for merging these events.

My code saves the sched_switch event for each process and, when it meets
stat_iowait, it reports the saved sched_switch event, because that event
contains the correct callchain. In other words, it replaces all
stat_iowait events with the corresponding sched_switch events.

I use the next sequence of commands for testing:

perf record -e sched:sched_stat_sleep -e sched:sched_switch \
-e sched:sched_process_exit -g -o ~/perf.data.raw \
~/test-program
perf inject -v -s -i ~/perf.data.raw -o ~/perf.data
perf report --stdio -i ~/perf.data
100.00% foo [kernel.kallsyms] [k] __schedule
|
--- __schedule
schedule
|
|--79.75%-- schedule_hrtimeout_range_clock
| schedule_hrtimeout_range
| poll_schedule_timeout
| do_select
| core_sys_select
| sys_select
| system_call_fastpath
| __select
| __libc_start_main
|
--20.25%-- do_nanosleep
hrtimer_nanosleep
sys_nanosleep
system_call_fastpath
__GI___libc_nanosleep
__libc_start_main

And here is test-program.c:

#include<unistd.h>
#include<time.h>
#include<sys/select.h>

int main()
{
struct timespec ts1;
struct timeval tv1;
int i;
long s;

for (i = 0; i < 10; i++) {
ts1.tv_sec = 0;
ts1.tv_nsec = 10000000;
nanosleep(&ts1, NULL);

tv1.tv_sec = 0;
tv1.tv_usec = 40000;
select(0, NULL, NULL, NULL,&tv1);
}
return 1;
}

Signed-off-by: Andrew Vagin <[email protected]>
Acked-by: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
[ committer note: Made it use evsel->handler ]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/Documentation/perf-inject.txt | 5 +
tools/perf/builtin-inject.c | 142 +++++++++++++++++++++++++++++-
2 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index 673ef97..a00a342 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -35,6 +35,11 @@ OPTIONS
-o::
--output=::
Output file name. (default: stdout)
+-s::
+--sched-stat::
+ Merge sched_stat and sched_switch for getting events where and how long
+ tasks slept. sched_switch contains a callchain where a task slept and
+ sched_stat contains a timeslice how long a task slept.

SEE ALSO
--------
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index a706ed5..a4a3072 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -8,19 +8,32 @@
#include "builtin.h"

#include "perf.h"
+#include "util/color.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/debug.h"

#include "util/parse-options.h"

+#include <linux/list.h>
+
struct perf_inject {
struct perf_tool tool;
bool build_ids;
+ bool sched_stat;
const char *input_name;
int pipe_output,
output;
u64 bytes_written;
+ struct list_head samples;
+};
+
+struct event_entry {
+ struct list_head node;
+ u32 tid;
+ union perf_event event[0];
};

static int perf_event__repipe_synth(struct perf_tool *tool,
@@ -86,12 +99,23 @@ static int perf_event__repipe(struct perf_tool *tool,
return perf_event__repipe_synth(tool, event, machine);
}

+typedef int (*inject_handler)(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine);
+
static int perf_event__repipe_sample(struct perf_tool *tool,
union perf_event *event,
- struct perf_sample *sample __maybe_unused,
- struct perf_evsel *evsel __maybe_unused,
- struct machine *machine)
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
{
+ if (evsel->handler.func) {
+ inject_handler f = evsel->handler.func;
+ return f(tool, event, sample, evsel, machine);
+ }
+
return perf_event__repipe_synth(tool, event, machine);
}

@@ -216,6 +240,79 @@ repipe:
return 0;
}

+static int perf_inject__sched_process_exit(struct perf_tool *tool,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel __maybe_unused,
+ struct machine *machine __maybe_unused)
+{
+ struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+ struct event_entry *ent;
+
+ list_for_each_entry(ent, &inject->samples, node) {
+ if (sample->tid == ent->tid) {
+ list_del_init(&ent->node);
+ free(ent);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int perf_inject__sched_switch(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+ struct event_entry *ent;
+
+ perf_inject__sched_process_exit(tool, event, sample, evsel, machine);
+
+ ent = malloc(event->header.size + sizeof(struct event_entry));
+ if (ent == NULL) {
+ color_fprintf(stderr, PERF_COLOR_RED,
+ "Not enough memory to process sched switch event!");
+ return -1;
+ }
+
+ ent->tid = sample->tid;
+ memcpy(&ent->event, event, event->header.size);
+ list_add(&ent->node, &inject->samples);
+ return 0;
+}
+
+static int perf_inject__sched_stat(struct perf_tool *tool,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct event_entry *ent;
+ union perf_event *event_sw;
+ struct perf_sample sample_sw;
+ struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+ u32 pid = perf_evsel__intval(evsel, sample, "pid");
+
+ list_for_each_entry(ent, &inject->samples, node) {
+ if (pid == ent->tid)
+ goto found;
+ }
+
+ return 0;
+found:
+ event_sw = &ent->event[0];
+ perf_evsel__parse_sample(evsel, event_sw, &sample_sw);
+
+ sample_sw.period = sample->period;
+ sample_sw.time = sample->time;
+ perf_event__synthesize_sample(event_sw, evsel->attr.sample_type,
+ &sample_sw, false);
+ return perf_event__repipe(tool, event_sw, &sample_sw, machine);
+}
+
extern volatile int session_done;

static void sig_handler(int sig __maybe_unused)
@@ -223,6 +320,21 @@ static void sig_handler(int sig __maybe_unused)
session_done = 1;
}

+static int perf_evsel__check_stype(struct perf_evsel *evsel,
+ u64 sample_type, const char *sample_msg)
+{
+ struct perf_event_attr *attr = &evsel->attr;
+ const char *name = perf_evsel__name(evsel);
+
+ if (!(attr->sample_type & sample_type)) {
+ pr_err("Samples for %s event do not have %s attribute set.",
+ name, sample_msg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int __cmd_inject(struct perf_inject *inject)
{
struct perf_session *session;
@@ -241,6 +353,26 @@ static int __cmd_inject(struct perf_inject *inject)
if (session == NULL)
return -ENOMEM;

+ if (inject->sched_stat) {
+ struct perf_evsel *evsel;
+
+ inject->tool.ordered_samples = true;
+
+ list_for_each_entry(evsel, &session->evlist->entries, node) {
+ const char *name = perf_evsel__name(evsel);
+
+ if (!strcmp(name, "sched:sched_switch")) {
+ if (perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID"))
+ return -EINVAL;
+
+ evsel->handler.func = perf_inject__sched_switch;
+ } else if (!strcmp(name, "sched:sched_process_exit"))
+ evsel->handler.func = perf_inject__sched_process_exit;
+ else if (!strncmp(name, "sched:sched_stat_", 17))
+ evsel->handler.func = perf_inject__sched_stat;
+ }
+ }
+
if (!inject->pipe_output)
lseek(inject->output, session->header.data_offset, SEEK_SET);

@@ -275,6 +407,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
.build_id = perf_event__repipe_op2_synth,
},
.input_name = "-",
+ .samples = LIST_HEAD_INIT(inject.samples),
};
const char *output_name = "-";
const struct option options[] = {
@@ -284,6 +417,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
"input file name"),
OPT_STRING('o', "output", &output_name, "file",
"output file name"),
+ OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat,
+ "Merge sched-stat and sched-switch for getting events "
+ "where and how long tasks slept"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show build ids, etc)"),
OPT_END()

2012-10-26 15:11:39

by Andrei Vagin

[permalink] [raw]
Subject: [tip:perf/core] perf inject: Mark a dso if it's used

Commit-ID: 54a3cf59b53b3f01989a28344ecf4cb68217a6f6
Gitweb: http://git.kernel.org/tip/54a3cf59b53b3f01989a28344ecf4cb68217a6f6
Author: Andrew Vagin <[email protected]>
AuthorDate: Tue, 7 Aug 2012 16:56:05 +0400
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Fri, 26 Oct 2012 11:22:25 -0200

perf inject: Mark a dso if it's used

Otherwise they will not be written to the output file.

Signed-off-by: Andrew Vagin <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
[ committer note: Fixed up wrt changes made in the immediate previous patches ]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/builtin-inject.c | 11 ++++++++---
tools/perf/util/build-id.c | 10 +++++-----
tools/perf/util/build-id.h | 4 ++++
3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index a4a3072..84ad6ab 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -14,6 +14,7 @@
#include "util/session.h"
#include "util/tool.h"
#include "util/debug.h"
+#include "util/build-id.h"

#include "util/parse-options.h"

@@ -116,6 +117,8 @@ static int perf_event__repipe_sample(struct perf_tool *tool,
return f(tool, event, sample, evsel, machine);
}

+ build_id__mark_dso_hit(tool, event, sample, evsel, machine);
+
return perf_event__repipe_synth(tool, event, machine);
}

@@ -310,6 +313,7 @@ found:
sample_sw.time = sample->time;
perf_event__synthesize_sample(event_sw, evsel->attr.sample_type,
&sample_sw, false);
+ build_id__mark_dso_hit(tool, event_sw, &sample_sw, evsel, machine);
return perf_event__repipe(tool, event_sw, &sample_sw, machine);
}

@@ -342,8 +346,7 @@ static int __cmd_inject(struct perf_inject *inject)

signal(SIGINT, sig_handler);

- if (inject->build_ids) {
- inject->tool.sample = perf_event__inject_buildid;
+ if (inject->build_ids || inject->sched_stat) {
inject->tool.mmap = perf_event__repipe_mmap;
inject->tool.fork = perf_event__repipe_fork;
inject->tool.tracing_data = perf_event__repipe_tracing_data;
@@ -353,7 +356,9 @@ static int __cmd_inject(struct perf_inject *inject)
if (session == NULL)
return -ENOMEM;

- if (inject->sched_stat) {
+ if (inject->build_ids) {
+ inject->tool.sample = perf_event__inject_buildid;
+ } else if (inject->sched_stat) {
struct perf_evsel *evsel;

inject->tool.ordered_samples = true;
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 6a63999..94ca117 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -16,11 +16,11 @@
#include "session.h"
#include "tool.h"

-static int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused,
- union perf_event *event,
- struct perf_sample *sample __maybe_unused,
- struct perf_evsel *evsel __maybe_unused,
- struct machine *machine)
+int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused,
+ union perf_event *event,
+ struct perf_sample *sample __maybe_unused,
+ struct perf_evsel *evsel __maybe_unused,
+ struct machine *machine)
{
struct addr_location al;
u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h
index a993ba8..45c500b 100644
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -7,4 +7,8 @@ extern struct perf_tool build_id__mark_dso_hit_ops;

char *dso__build_id_filename(struct dso *self, char *bf, size_t size);

+int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event,
+ struct perf_sample *sample, struct perf_evsel *evsel,
+ struct machine *machine);
+
#endif