2015-05-08 19:29:15

by Josef Bacik

[permalink] [raw]
Subject: [PATCH 0/5] trace-cmd profile enhancements

I had to use trace-cmd profile to debug a regression with hhvm recently. These
are the patches I needed to make trace-cmd profile useful to me. There are a
few changes in here

1) Adding --merge to trace-cmd profile. This merges events for like comms
together. HHVM is a huge app that spawns worker threads to handle requests, I
needed all of this data in one place to see how the app as a whole was behaving,
--merge provides this.

2) Fixup the output formatting. I also wanted to see which syscalls were taking
the longest but this was hard because we were just spitting out the syscall
number and it wasn't in any order, so now all the events are ordered based on
time taken and the syscall names are printed instead of the number.

3) Random trace-cmd hist fix, because it didn't work right with instances.

I've been using these in production for a couple of weeks now and they seem to
work well. Thanks,

Josef


2015-05-08 19:29:23

by Josef Bacik

[permalink] [raw]
Subject: [PATCH 1/5] trace-cmd: add option to group like comms for profile

When profiling large applications that have multiple children we really want to
see the profile of the app all together. Add a --merge option to accomplish
this for profile. This matches events based on comm instead of PID, this way we
can easilly see what a giant app is doing, such as HHVM. Thanks,

Signed-off-by: Josef Bacik <[email protected]>
---
trace-hash-local.h | 7 ++
trace-local.h | 1 +
trace-profile.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++----
trace-record.c | 5 ++
trace-usage.c | 1 +
5 files changed, 208 insertions(+), 13 deletions(-)

diff --git a/trace-hash-local.h b/trace-hash-local.h
index 997b11c..5bae3f3 100644
--- a/trace-hash-local.h
+++ b/trace-hash-local.h
@@ -48,4 +48,11 @@ static inline unsigned int trace_hash(int val)
return hash;
}

+static inline unsigned int trace_hash_str(char *str)
+{
+ int val = 0;
+
+ while (*str) val += (int)*(str++);
+ return trace_hash(val);
+}
#endif /* _TRACE_HASH_LOCAL_H */
diff --git a/trace-local.h b/trace-local.h
index ab2bd34..50f36ba 100644
--- a/trace-local.h
+++ b/trace-local.h
@@ -81,6 +81,7 @@ int trace_profile_record(struct tracecmd_input *handle,
void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hooks,
int global);
int trace_profile(void);
+void trace_profile_merge(void);

struct tracecmd_input *
trace_stream_init(struct buffer_instance *instance, int cpu, int fd, int cpus,
diff --git a/trace-profile.c b/trace-profile.c
index f71a7c6..bd775e8 100644
--- a/trace-profile.c
+++ b/trace-profile.c
@@ -34,6 +34,7 @@
#define start_from_item(item) container_of(item, struct start_data, hash)
#define event_from_item(item) container_of(item, struct event_hash, hash)
#define stack_from_item(item) container_of(item, struct stack_data, hash)
+#define group_from_item(item) container_of(item, struct group_data, hash)
#define event_data_from_item(item) container_of(item, struct event_data, hash)

static unsigned long long nsecs_per_sec(unsigned long long ts)
@@ -141,6 +142,12 @@ struct event_hash {
struct trace_hash stacks;
};

+struct group_data {
+ struct trace_hash_item hash;
+ char *comm;
+ struct trace_hash event_hash;
+};
+
struct task_data {
struct trace_hash_item hash;
int pid;
@@ -156,6 +163,7 @@ struct task_data {
struct event_hash *last_event;
struct pevent_record *last_stack;
struct handle_data *handle;
+ struct group_data *group;
};

struct cpu_info {
@@ -173,6 +181,7 @@ struct handle_data {
struct pevent *pevent;

struct trace_hash events;
+ struct trace_hash group_hash;

struct cpu_info **cpu_data;

@@ -196,6 +205,12 @@ struct handle_data {

static struct handle_data *handles;
static struct event_data *stacktrace_event;
+static bool merge_like_comms = false;
+
+void trace_profile_merge(void)
+{
+ merge_like_comms = true;
+}

static struct start_data *
add_start(struct task_data *task,
@@ -250,6 +265,7 @@ static struct event_hash *
find_event_hash(struct task_data *task, struct event_data_match *edata)
{
struct event_hash *event_hash;
+ struct trace_hash *hash;
struct trace_hash_item *item;
unsigned long long key;

@@ -257,7 +273,11 @@ find_event_hash(struct task_data *task, struct event_data_match *edata)
(unsigned long)edata->search_val +
(unsigned long)edata->val;
key = trace_hash(key);
- item = trace_hash_find(&task->event_hash, key, match_event, edata);
+ if (task->group)
+ hash = &task->group->event_hash;
+ else
+ hash = &task->event_hash;
+ item = trace_hash_find(hash, key, match_event, edata);
if (item)
return event_from_item(item);

@@ -270,7 +290,7 @@ find_event_hash(struct task_data *task, struct event_data_match *edata)
event_hash->hash.key = key;
trace_hash_init(&event_hash->stacks, 32);

- trace_hash_add(&task->event_hash, &event_hash->hash);
+ trace_hash_add(hash, &event_hash->hash);

return event_hash;
}
@@ -496,9 +516,59 @@ find_task(struct handle_data *h, int pid)
return last_task;
}

+static int match_group(struct trace_hash_item *item, void *data)
+{
+ struct group_data *group = group_from_item(item);
+
+ return strcmp(group->comm, (char *)data) == 0;
+}
+
+static void add_group(struct handle_data *h, struct task_data *task)
+{
+ unsigned long long key;
+ struct trace_hash_item *item;
+ void *data = task->comm;
+
+ if (!merge_like_comms)
+ return;
+ if (task->group || !task->comm)
+ return;
+
+ key = trace_hash_str(task->comm);
+
+ item = trace_hash_find(&h->group_hash, key, match_group, data);
+ if (item) {
+ task->group = group_from_item(item);
+ } else {
+ struct group_data *grp;
+ struct trace_hash_item **bucket;
+
+ grp = malloc_or_die(sizeof(*grp));
+ memset(grp, 0, sizeof(*grp));
+
+ grp->comm = strdup(task->comm);
+ if (!grp->comm)
+ die("strdup");
+ grp->hash.key = key;
+ trace_hash_add(&h->group_hash, &grp->hash);
+ task->group = grp;
+ trace_hash_init(&grp->event_hash, 32);
+
+ trace_hash_for_each_bucket(bucket, &task->event_hash) {
+ trace_hash_while_item(item, bucket) {
+ struct event_hash *event_hash;
+
+ event_hash = event_from_item(item);
+ trace_hash_del(&event_hash->hash);
+ trace_hash_add(&grp->event_hash, &event_hash->hash);
+ }
+ }
+ }
+}
+
static void
-add_task_comm(struct task_data *task, struct format_field *field,
- struct pevent_record *record)
+add_task_comm(struct handle_data *h, struct task_data *task,
+ struct format_field *field, struct pevent_record *record)
{
const char *comm;

@@ -506,6 +576,7 @@ add_task_comm(struct task_data *task, struct format_field *field,
comm = record->data + field->offset;
memcpy(task->comm, comm, field->size);
task->comm[field->size] = 0;
+ add_group(h, task);
}

/* Account for tasks that don't have starts */
@@ -932,7 +1003,7 @@ static int handle_sched_switch_event(struct handle_data *h,

task = find_task(h, prev_pid);
if (!task->comm)
- add_task_comm(task, h->switch_prev_comm, record);
+ add_task_comm(h, task, h->switch_prev_comm, record);

if (prev_state)
task->sleeping = 1;
@@ -940,13 +1011,13 @@ static int handle_sched_switch_event(struct handle_data *h,
task->sleeping = 0;

/* task is being scheduled out. prev_state tells why */
- start = add_start(task, event_data, record, prev_pid, prev_state);
+ start = add_start(task, event_data, record, prev_state, prev_state);
task->last_start = start;
task->last_event = NULL;

task = find_task(h, next_pid);
if (!task->comm)
- add_task_comm(task, h->switch_next_comm, record);
+ add_task_comm(h, task, h->switch_next_comm, record);

/*
* If the next task was blocked, it required a wakeup to
@@ -1133,7 +1204,7 @@ static int handle_sched_wakeup_event(struct handle_data *h,

task = find_task(h, pid);
if (!task->comm)
- add_task_comm(task, h->wakeup_comm, record);
+ add_task_comm(h, task, h->wakeup_comm, record);

/* if the task isn't sleeping, then ignore the wake up */
if (!task->sleeping) {
@@ -1196,6 +1267,7 @@ void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hook,

trace_hash_init(&h->task_hash, 1024);
trace_hash_init(&h->events, 1024);
+ trace_hash_init(&h->group_hash, 512);

h->handle = handle;
h->pevent = pevent;
@@ -1874,6 +1946,9 @@ static void output_task(struct handle_data *h, struct task_data *task)
int nr_events = 0;
int i;

+ if (trace_hash_empty(&task->event_hash))
+ return;
+
if (task->comm)
comm = task->comm;
else
@@ -1907,6 +1982,39 @@ static void output_task(struct handle_data *h, struct task_data *task)
free(events);
}

+static void output_group(struct handle_data *h, struct group_data *group)
+{
+ struct trace_hash_item **bucket;
+ struct trace_hash_item *item;
+ struct event_hash **events;
+ int nr_events = 0;
+ int i;
+
+ printf("\ngroup: %s\n", group->comm);
+
+ trace_hash_for_each_bucket(bucket, &group->event_hash) {
+ trace_hash_for_each_item(item, bucket) {
+ nr_events++;
+ }
+ }
+
+ events = malloc_or_die(sizeof(*events) * nr_events);
+
+ i = 0;
+ trace_hash_for_each_bucket(bucket, &group->event_hash) {
+ trace_hash_for_each_item(item, bucket) {
+ events[i++] = event_from_item(item);
+ }
+ }
+
+ qsort(events, nr_events, sizeof(*events), compare_events);
+
+ for (i = 0; i < nr_events; i++)
+ output_event(events[i]);
+
+ free(events);
+}
+
static int compare_tasks(const void *a, const void *b)
{
struct task_data * const *A = a;
@@ -1919,6 +2027,14 @@ static int compare_tasks(const void *a, const void *b)
return 0;
}

+static int compare_groups(const void *a, const void *b)
+{
+ const char *A = a;
+ const char *B = b;
+
+ return strcmp(A, B);
+}
+
static void free_event_hash(struct event_hash *event_hash)
{
struct trace_hash_item **bucket;
@@ -1976,6 +2092,25 @@ static void free_task(struct task_data *task)
free(task);
}

+static void free_group(struct group_data *group)
+{
+ struct trace_hash_item **bucket;
+ struct trace_hash_item *item;
+ struct event_hash *event_hash;
+
+ free(group->comm);
+
+ trace_hash_for_each_bucket(bucket, &group->event_hash) {
+ trace_hash_while_item(item, bucket) {
+ event_hash = event_from_item(item);
+ trace_hash_del(item);
+ free_event_hash(event_hash);
+ }
+ }
+ trace_hash_free(&group->event_hash);
+ free(group);
+}
+
static void show_global_task(struct handle_data *h,
struct task_data *task)
{
@@ -1985,7 +2120,7 @@ static void show_global_task(struct handle_data *h,
output_task(h, task);
}

-static void output_handle(struct handle_data *h)
+static void output_tasks(struct handle_data *h)
{
struct trace_hash_item **bucket;
struct trace_hash_item *item;
@@ -1993,10 +2128,6 @@ static void output_handle(struct handle_data *h)
int nr_tasks = 0;
int i;

- show_global_task(h, h->global_task);
- for (i = 0; i < h->cpus; i++)
- show_global_task(h, &h->global_percpu_tasks[i]);
-
trace_hash_for_each_bucket(bucket, &h->task_hash) {
trace_hash_for_each_item(item, bucket) {
nr_tasks++;
@@ -2024,6 +2155,56 @@ static void output_handle(struct handle_data *h)
free(tasks);
}

+static void output_groups(struct handle_data *h)
+{
+ struct trace_hash_item **bucket;
+ struct trace_hash_item *item;
+ struct group_data **groups;
+ int nr_groups = 0;
+ int i;
+
+ trace_hash_for_each_bucket(bucket, &h->group_hash) {
+ trace_hash_for_each_item(item, bucket) {
+ nr_groups++;
+ }
+ }
+
+ if (nr_groups == 0)
+ return;
+
+ groups = malloc_or_die(sizeof(*groups) * nr_groups);
+
+ nr_groups = 0;
+
+ trace_hash_for_each_bucket(bucket, &h->group_hash) {
+ trace_hash_while_item(item, bucket) {
+ groups[nr_groups++] = group_from_item(item);
+ trace_hash_del(item);
+ }
+ }
+
+ qsort(groups, nr_groups, sizeof(*groups), compare_groups);
+
+ for (i = 0; i < nr_groups; i++) {
+ output_group(h, groups[i]);
+ free_group(groups[i]);
+ }
+
+ free(groups);
+}
+
+static void output_handle(struct handle_data *h)
+{
+ int i;
+
+ show_global_task(h, h->global_task);
+ for (i = 0; i < h->cpus; i++)
+ show_global_task(h, &h->global_percpu_tasks[i]);
+
+ output_groups(h);
+ output_tasks(h);
+}
+
int trace_profile(void)
{
struct handle_data *h;
diff --git a/trace-record.c b/trace-record.c
index 9874a12..7b61cde 100644
--- a/trace-record.c
+++ b/trace-record.c
@@ -3703,6 +3703,7 @@ static void add_hook(struct buffer_instance *instance, const char *arg)
}

enum {
+ OPT_merge = 250,
OPT_stderr = 251,
OPT_profile = 252,
OPT_nosplice = 253,
@@ -3871,6 +3872,7 @@ void trace_record (int argc, char **argv)
{"nosplice", no_argument, NULL, OPT_nosplice},
{"profile", no_argument, NULL, OPT_profile},
{"stderr", no_argument, NULL, OPT_stderr},
+ {"merge", no_argument, NULL, OPT_merge},
{"help", no_argument, NULL, '?'},
{NULL, 0, NULL, 0}
};
@@ -4102,6 +4104,9 @@ void trace_record (int argc, char **argv)
close(1);
dup2(2, 1);
break;
+ case OPT_merge:
+ trace_profile_merge();
+ break;
default:
usage(argv);
}
diff --git a/trace-usage.c b/trace-usage.c
index 520b14b..190990b 100644
--- a/trace-usage.c
+++ b/trace-usage.c
@@ -48,6 +48,7 @@ static struct usage_help usage_help[] = {
" -B create sub buffer and folling events will be enabled here\n"
" -k do not reset the buffers after tracing.\n"
" -i do not fail if an event is not found\n"
+ " --merge used with --profile, merge events for related comms\n"
" --profile enable tracing options needed for report --profile\n"
" --func-stack perform a stack trace for function tracer\n"
" (use with caution)\n"
--
2.1.0

2015-05-08 19:29:21

by Josef Bacik

[permalink] [raw]
Subject: [PATCH 2/5] trace-cmd: group schedule stacktraces together with --merge

With --merge we want to be able to merge the scheduling information as well so
we can see how the whole app spent its time. This is a little tricky because we
have to sort based on comm instead of pid. It works out well enough because we
store the intermediate information in the per task hash, but we want to group
the final information together.

Signed-off-by: Josef Bacik <[email protected]>
---
trace-profile.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/trace-profile.c b/trace-profile.c
index bd775e8..eee026e 100644
--- a/trace-profile.c
+++ b/trace-profile.c
@@ -985,6 +985,7 @@ static int handle_sched_switch_event(struct handle_data *h,
unsigned long long prev_pid;
unsigned long long prev_state;
unsigned long long next_pid;
+ unsigned long long val = 0;
struct start_data *start;

/* pid_field holds prev_pid, data_field holds prev_state */
@@ -1019,6 +1020,12 @@ static int handle_sched_switch_event(struct handle_data *h,
if (!task->comm)
add_task_comm(h, task, h->switch_next_comm, record);

+ if (task->comm && task->group) {
+ char *str = task->comm;
+ while (*str) val += (int)*(str++);
+ } else
+ val = next_pid;
+
/*
* If the next task was blocked, it required a wakeup to
* restart, and there should be one.
@@ -1028,10 +1035,10 @@ static int handle_sched_switch_event(struct handle_data *h,
*
* event_data->start holds the sched_wakeup event data.
*/
- find_and_update_start(task, event_data->start, record->ts, next_pid);
+ find_and_update_start(task, event_data->start, record->ts, val);

/* Look for this task if it was preempted (no wakeup found). */
- find_and_update_start(task, event_data, record->ts, next_pid);
+ find_and_update_start(task, event_data, record->ts, val);

return 0;
}
@@ -1186,6 +1193,7 @@ static int handle_sched_wakeup_event(struct handle_data *h,
struct task_data *task = NULL;
struct start_data *start;
unsigned long long success;
+ unsigned long long val = 0;

proxy = find_task(h, pid);

@@ -1224,11 +1232,17 @@ static int handle_sched_wakeup_event(struct handle_data *h,
*/
proxy->proxy = task;

+ if (task->comm && task->group) {
+ char *str = task->comm;
+ while (*str) val += (int)*(str++);
+ } else
+ val = pid;
+
/* There should be a blocked schedule out of this task */
- find_and_update_start(task, event_data->start, record->ts, pid);
+ find_and_update_start(task, event_data->start, record->ts, val);

/* Set this up for timing how long the wakeup takes */
- start = add_start(task, event_data, record, pid, pid);
+ start = add_start(task, event_data, record, val, val);
task->last_event = NULL;
task->last_start = start;

--
2.1.0

2015-05-08 19:29:27

by Josef Bacik

[permalink] [raw]
Subject: [PATCH 3/5] trace-cmd: lookup syscall names in profile

When we profile syscall times we're just given the syscall number, not the name.
This isn't helpful for human beings, so lookup the syscall names when we are
printing out the information. This uses the audit library to get this
information, I'm not married to it, but it seems that any app that does this
mapping has to scrape /usr/include/asm/unistd*.h to pull out the names, so might
as well just use somebody elses work.

Signed-off-by: Josef Bacik <[email protected]>
---
Makefile | 2 +-
trace-profile.c | 19 ++++++++++++++++---
2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 63f7e79..402f711 100644
--- a/Makefile
+++ b/Makefile
@@ -160,7 +160,7 @@ bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
python_dir_SQ = $(subst ','\'',$(python_dir))

-LIBS = -L. -ltracecmd -ldl
+LIBS = -L. -ltracecmd -ldl -laudit
LIB_FILE = libtracecmd.a

PACKAGES= gtk+-2.0 libxml-2.0 gthread-2.0
diff --git a/trace-profile.c b/trace-profile.c
index eee026e..0b84c11 100644
--- a/trace-profile.c
+++ b/trace-profile.c
@@ -23,6 +23,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <libaudit.h>

#include "trace-local.h"
#include "trace-hash.h"
@@ -916,8 +917,20 @@ static void func_print(struct trace_seq *s, struct event_hash *event_hash)
trace_seq_printf(s, "func: 0x%llx", event_hash->val);
}

-static void print_int(struct trace_seq *s, struct event_hash *event_hash)
+static void syscall_print(struct trace_seq *s, struct event_hash *event_hash)
{
+ const char *name = NULL;
+ int machine;
+
+ machine = audit_detect_machine();
+ if (machine < 0)
+ goto fail;
+ name = audit_syscall_to_name(event_hash->val, machine);
+ if (!name)
+ goto fail;
+ trace_seq_printf(s, "syscall:%s", name);
+ return;
+fail:
trace_seq_printf(s, "%s:%d", event_hash->event_data->event->name,
(int)event_hash->val);
}
@@ -1425,8 +1438,8 @@ void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hook,

if (syscall_enter && syscall_exit) {
mate_events(h, syscall_enter, NULL, "id", syscall_exit, "id", 1, 0);
- syscall_enter->print_func = print_int;
- syscall_exit->print_func = print_int;
+ syscall_enter->print_func = syscall_print;
+ syscall_exit->print_func = syscall_print;
}

events = pevent_list_events(pevent, EVENT_SORT_ID);
--
2.1.0

2015-05-08 19:29:30

by Josef Bacik

[permalink] [raw]
Subject: [PATCH 4/5] trace-cmd: sort the events in the profile output

Currently we just spit out the events as they appear in the hash, which makes it
hard to tell where we are spending our time. Instead order the output by type
and total time spent. So you'll get something like this

syscall 1: 100000ms
syscall 2: 80000ms
syscall 3: 4ms
func 1: 10ms
func 2: 1ms

instead of them all jumbled up.

Signed-off-by: Josef Bacik <[email protected]>
---
trace-profile.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/trace-profile.c b/trace-profile.c
index 0b84c11..3a9a9f3 100644
--- a/trace-profile.c
+++ b/trace-profile.c
@@ -1961,6 +1961,10 @@ static int compare_events(const void *a, const void *b)
return 1;
if (event_data_a->id < event_data_b->id)
return -1;
+ if ((*A)->time_total > (*B)->time_total)
+ return -1;
+ if ((*A)->time_total < (*B)->time_total)
+ return 1;
return 0;
}

--
2.1.0

2015-05-08 19:29:48

by Josef Bacik

[permalink] [raw]
Subject: [PATCH 5/5] trace-cmd: make trace-hist work with instances

If you recorded information completely in an instance then trace-hist wouldn't
think your trace.dat file had any events because it didn't know about instances.
Fix this.

Signed-off-by: Josef Bacik <[email protected]>
---
trace-hist.c | 33 +++++++++++++++++++++++++--------
1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/trace-hist.c b/trace-hist.c
index 574549e..2a2e780 100644
--- a/trace-hist.c
+++ b/trace-hist.c
@@ -954,13 +954,6 @@ static void do_trace_hist(struct tracecmd_input *handle)
int cpu;
int ret;

- ret = tracecmd_init_data(handle);
- if (ret < 0)
- die("failed to init data");
-
- if (ret > 0)
- die("trace-cmd hist does not work with latency traces\n");
-
cpus = tracecmd_cpus(handle);

/* Need to get any event */
@@ -1025,6 +1018,7 @@ void trace_hist(int argc, char **argv)
{
struct tracecmd_input *handle;
const char *input_file = NULL;
+ int instances;
int ret;

for (;;) {
@@ -1067,7 +1061,30 @@ void trace_hist(int argc, char **argv)
if (ret)
return;

- do_trace_hist(handle);
+ ret = tracecmd_init_data(handle);
+ if (ret < 0)
+ die("failed to init data");
+
+ if (ret > 0)
+ die("trace-cmd hist does not work with latency traces\n");
+
+ instances = tracecmd_buffer_instances(handle);
+ if (instances) {
+ struct tracecmd_input *new_handle;
+ int i;
+
+ for (i = 0; i < instances; i++) {
+ new_handle = tracecmd_buffer_instance_handle(handle, i);
+ if (!new_handle) {
+ warning("could not retrieve handle %d", i);
+ continue;
+ }
+ do_trace_hist(new_handle);
+ tracecmd_close(new_handle);
+ }
+ } else {
+ do_trace_hist(handle);
+ }

tracecmd_close(handle);
}
--
2.1.0

2015-05-08 20:13:16

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 0/5] trace-cmd profile enhancements

On Fri, 8 May 2015 15:28:59 -0400
Josef Bacik <[email protected]> wrote:

> I had to use trace-cmd profile to debug a regression with hhvm recently. These
> are the patches I needed to make trace-cmd profile useful to me. There are a
> few changes in here
>
> 1) Adding --merge to trace-cmd profile. This merges events for like comms
> together. HHVM is a huge app that spawns worker threads to handle requests, I
> needed all of this data in one place to see how the app as a whole was behaving,
> --merge provides this.
>
> 2) Fixup the output formatting. I also wanted to see which syscalls were taking
> the longest but this was hard because we were just spitting out the syscall
> number and it wasn't in any order, so now all the events are ordered based on
> time taken and the syscall names are printed instead of the number.
>
> 3) Random trace-cmd hist fix, because it didn't work right with instances.
>
> I've been using these in production for a couple of weeks now and they seem to
> work well. Thanks,
>

Josef,

Thanks a lot! I really appreciate this.

I'll take a look at it next week as I have other things on my plate for
today. Ping me if I somehow forget to do so.

Thanks again,

-- Steve

2015-05-20 17:31:38

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 1/5] trace-cmd: add option to group like comms for profile

Finally have some time to look at this.


On Fri, 8 May 2015 15:29:00 -0400
Josef Bacik <[email protected]> wrote:

> When profiling large applications that have multiple children we really want to
> see the profile of the app all together. Add a --merge option to accomplish
> this for profile. This matches events based on comm instead of PID, this way we
> can easilly see what a giant app is doing, such as HHVM. Thanks,
>
> Signed-off-by: Josef Bacik <[email protected]>
> ---
> trace-hash-local.h | 7 ++
> trace-local.h | 1 +
> trace-profile.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++----
> trace-record.c | 5 ++

Shouldn't we add this option to trace-cmd report --profile ?

Also, I think we need a man page.

Also, I'm not so sure I like the "--merge" name. It's not very
descriptive to what it does. What about "--by-comm"? Or maybe "--comm"?

> trace-usage.c | 1 +
> 5 files changed, 208 insertions(+), 13 deletions(-)
>
> diff --git a/trace-hash-local.h b/trace-hash-local.h
> index 997b11c..5bae3f3 100644
> --- a/trace-hash-local.h
> +++ b/trace-hash-local.h
> @@ -48,4 +48,11 @@ static inline unsigned int trace_hash(int val)
> return hash;
> }
>
> +static inline unsigned int trace_hash_str(char *str)
> +{
> + int val = 0;
> +
> + while (*str) val += (int)*(str++);
> + return trace_hash(val);
> +}

Hmm, the above doesn't account for order of characters. That is "bash"
is the same as "shab". What about this:

int val = 0;
int i;

for (i = 0; str[i]; i++)
val += ((int)str[i]) << (i & 0xffffff);
return trace_hash(val);



> #endif /* _TRACE_HASH_LOCAL_H */
> diff --git a/trace-local.h b/trace-local.h
> index ab2bd34..50f36ba 100644
> --- a/trace-local.h
> +++ b/trace-local.h
> @@ -81,6 +81,7 @@ int trace_profile_record(struct tracecmd_input *handle,
> void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hooks,
> int global);
> int trace_profile(void);
> +void trace_profile_merge(void);
>
> struct tracecmd_input *
> trace_stream_init(struct buffer_instance *instance, int cpu, int fd, int cpus,
> diff --git a/trace-profile.c b/trace-profile.c
> index f71a7c6..bd775e8 100644
> --- a/trace-profile.c
> +++ b/trace-profile.c
> @@ -34,6 +34,7 @@
> #define start_from_item(item) container_of(item, struct start_data, hash)
> #define event_from_item(item) container_of(item, struct event_hash, hash)
> #define stack_from_item(item) container_of(item, struct stack_data, hash)
> +#define group_from_item(item) container_of(item, struct group_data, hash)
> #define event_data_from_item(item) container_of(item, struct event_data, hash)
>
> static unsigned long long nsecs_per_sec(unsigned long long ts)
> @@ -141,6 +142,12 @@ struct event_hash {
> struct trace_hash stacks;
> };
>
> +struct group_data {
> + struct trace_hash_item hash;
> + char *comm;
> + struct trace_hash event_hash;
> +};
> +
> struct task_data {
> struct trace_hash_item hash;
> int pid;
> @@ -156,6 +163,7 @@ struct task_data {
> struct event_hash *last_event;
> struct pevent_record *last_stack;
> struct handle_data *handle;
> + struct group_data *group;
> };
>
> struct cpu_info {
> @@ -173,6 +181,7 @@ struct handle_data {
> struct pevent *pevent;
>
> struct trace_hash events;
> + struct trace_hash group_hash;
>
> struct cpu_info **cpu_data;
>
> @@ -196,6 +205,12 @@ struct handle_data {
>
> static struct handle_data *handles;
> static struct event_data *stacktrace_event;
> +static bool merge_like_comms = false;
> +
> +void trace_profile_merge(void)
> +{
> + merge_like_comms = true;
> +}
>
> static struct start_data *
> add_start(struct task_data *task,
> @@ -250,6 +265,7 @@ static struct event_hash *
> find_event_hash(struct task_data *task, struct event_data_match *edata)
> {
> struct event_hash *event_hash;
> + struct trace_hash *hash;
> struct trace_hash_item *item;
> unsigned long long key;
>
> @@ -257,7 +273,11 @@ find_event_hash(struct task_data *task, struct event_data_match *edata)
> (unsigned long)edata->search_val +
> (unsigned long)edata->val;
> key = trace_hash(key);
> - item = trace_hash_find(&task->event_hash, key, match_event, edata);
> + if (task->group)
> + hash = &task->group->event_hash;
> + else
> + hash = &task->event_hash;
> + item = trace_hash_find(hash, key, match_event, edata);
> if (item)
> return event_from_item(item);
>
> @@ -270,7 +290,7 @@ find_event_hash(struct task_data *task, struct event_data_match *edata)
> event_hash->hash.key = key;
> trace_hash_init(&event_hash->stacks, 32);
>
> - trace_hash_add(&task->event_hash, &event_hash->hash);
> + trace_hash_add(hash, &event_hash->hash);
>
> return event_hash;
> }
> @@ -496,9 +516,59 @@ find_task(struct handle_data *h, int pid)
> return last_task;
> }
>
> +static int match_group(struct trace_hash_item *item, void *data)
> +{
> + struct group_data *group = group_from_item(item);
> +
> + return strcmp(group->comm, (char *)data) == 0;
> +}
> +
> +static void add_group(struct handle_data *h, struct task_data *task)
> +{
> + unsigned long long key;
> + struct trace_hash_item *item;
> + void *data = task->comm;
> +
> + if (!merge_like_comms)
> + return;
> + if (task->group || !task->comm)
> + return;
> +
> + key = trace_hash_str(task->comm);
> +
> + item = trace_hash_find(&h->group_hash, key, match_group, data);
> + if (item) {
> + task->group = group_from_item(item);
> + } else {
> + struct group_data *grp;
> + struct trace_hash_item **bucket;
> +
> + grp = malloc_or_die(sizeof(*grp));
> + memset(grp, 0, sizeof(*grp));
> +
> + grp->comm = strdup(task->comm);
> + if (!grp->comm)
> + die("strdup");
> + grp->hash.key = key;
> + trace_hash_add(&h->group_hash, &grp->hash);
> + task->group = grp;
> + trace_hash_init(&grp->event_hash, 32);
> +
> + trace_hash_for_each_bucket(bucket, &task->event_hash) {
> + trace_hash_while_item(item, bucket) {
> + struct event_hash *event_hash;
> +
> + event_hash = event_from_item(item);
> + trace_hash_del(&event_hash->hash);
> + trace_hash_add(&grp->event_hash, &event_hash->hash);
> + }
> + }
> + }
> +}
> +
> static void
> -add_task_comm(struct task_data *task, struct format_field *field,
> - struct pevent_record *record)
> +add_task_comm(struct handle_data *h, struct task_data *task,
> + struct format_field *field, struct pevent_record *record)
> {
> const char *comm;
>
> @@ -506,6 +576,7 @@ add_task_comm(struct task_data *task, struct format_field *field,
> comm = record->data + field->offset;
> memcpy(task->comm, comm, field->size);
> task->comm[field->size] = 0;
> + add_group(h, task);
> }
>
> /* Account for tasks that don't have starts */
> @@ -932,7 +1003,7 @@ static int handle_sched_switch_event(struct handle_data *h,
>
> task = find_task(h, prev_pid);
> if (!task->comm)
> - add_task_comm(task, h->switch_prev_comm, record);
> + add_task_comm(h, task, h->switch_prev_comm, record);
>
> if (prev_state)
> task->sleeping = 1;
> @@ -940,13 +1011,13 @@ static int handle_sched_switch_event(struct handle_data *h,
> task->sleeping = 0;
>
> /* task is being scheduled out. prev_state tells why */
> - start = add_start(task, event_data, record, prev_pid, prev_state);
> + start = add_start(task, event_data, record, prev_state, prev_state);
> task->last_start = start;
> task->last_event = NULL;
>
> task = find_task(h, next_pid);
> if (!task->comm)
> - add_task_comm(task, h->switch_next_comm, record);
> + add_task_comm(h, task, h->switch_next_comm, record);
>
> /*
> * If the next task was blocked, it required a wakeup to
> @@ -1133,7 +1204,7 @@ static int handle_sched_wakeup_event(struct handle_data *h,
>
> task = find_task(h, pid);
> if (!task->comm)
> - add_task_comm(task, h->wakeup_comm, record);
> + add_task_comm(h, task, h->wakeup_comm, record);
>
> /* if the task isn't sleeping, then ignore the wake up */
> if (!task->sleeping) {
> @@ -1196,6 +1267,7 @@ void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hook,
>
> trace_hash_init(&h->task_hash, 1024);
> trace_hash_init(&h->events, 1024);
> + trace_hash_init(&h->group_hash, 512);
>
> h->handle = handle;
> h->pevent = pevent;
> @@ -1874,6 +1946,9 @@ static void output_task(struct handle_data *h, struct task_data *task)
> int nr_events = 0;
> int i;
>
> + if (trace_hash_empty(&task->event_hash))
> + return;
> +

Was this needed for this patch, or is it for groups? Even if the task
doesn't have any events, I do sometimes want to see if it was recorder.

If you added this to not show tasks if they belong to a group, why not
add:

if (task->group)
return;
?

> if (task->comm)
> comm = task->comm;
> else
> @@ -1907,6 +1982,39 @@ static void output_task(struct handle_data *h, struct task_data *task)
> free(events);
> }
>
> +static void output_group(struct handle_data *h, struct group_data *group)
> +{
> + struct trace_hash_item **bucket;
> + struct trace_hash_item *item;
> + struct event_hash **events;
> + int nr_events = 0;
> + int i;
> +
> + printf("\ngroup: %s\n", group->comm);
> +
> + trace_hash_for_each_bucket(bucket, &group->event_hash) {
> + trace_hash_for_each_item(item, bucket) {
> + nr_events++;
> + }
> + }
> +
> + events = malloc_or_die(sizeof(*events) * nr_events);
> +
> + i = 0;
> + trace_hash_for_each_bucket(bucket, &group->event_hash) {
> + trace_hash_for_each_item(item, bucket) {
> + events[i++] = event_from_item(item);
> + }
> + }
> +
> + qsort(events, nr_events, sizeof(*events), compare_events);
> +
> + for (i = 0; i < nr_events; i++)
> + output_event(events[i]);
> +
> + free(events);
> +}
> +
> static int compare_tasks(const void *a, const void *b)
> {
> struct task_data * const *A = a;
> @@ -1919,6 +2027,14 @@ static int compare_tasks(const void *a, const void *b)
> return 0;
> }
>
> +static int compare_groups(const void *a, const void *b)
> +{
> + const char *A = a;
> + const char *B = b;
> +
> + return strcmp(A, B);
> +}
> +
> static void free_event_hash(struct event_hash *event_hash)
> {
> struct trace_hash_item **bucket;
> @@ -1976,6 +2092,25 @@ static void free_task(struct task_data *task)
> free(task);
> }
>
> +static void free_group(struct group_data *group)
> +{
> + struct trace_hash_item **bucket;
> + struct trace_hash_item *item;
> + struct event_hash *event_hash;
> +
> + free(group->comm);
> +
> + trace_hash_for_each_bucket(bucket, &group->event_hash) {
> + trace_hash_while_item(item, bucket) {
> + event_hash = event_from_item(item);
> + trace_hash_del(item);
> + free_event_hash(event_hash);
> + }
> + }
> + trace_hash_free(&group->event_hash);
> + free(group);
> +}
> +
> static void show_global_task(struct handle_data *h,
> struct task_data *task)
> {
> @@ -1985,7 +2120,7 @@ static void show_global_task(struct handle_data *h,
> output_task(h, task);
> }
>
> -static void output_handle(struct handle_data *h)
> +static void output_tasks(struct handle_data *h)
> {
> struct trace_hash_item **bucket;
> struct trace_hash_item *item;
> @@ -1993,10 +2128,6 @@ static void output_handle(struct handle_data *h)
> int nr_tasks = 0;
> int i;
>
> - show_global_task(h, h->global_task);
> - for (i = 0; i < h->cpus; i++)
> - show_global_task(h, &h->global_percpu_tasks[i]);
> -
> trace_hash_for_each_bucket(bucket, &h->task_hash) {
> trace_hash_for_each_item(item, bucket) {
> nr_tasks++;
> @@ -2024,6 +2155,56 @@ static void output_handle(struct handle_data *h)
> free(tasks);
> }
>
> +static void output_groups(struct handle_data *h)
> +{
> + struct trace_hash_item **bucket;
> + struct trace_hash_item *item;
> + struct group_data **groups;
> + int nr_groups = 0;
> + int i;
> +
> + trace_hash_for_each_bucket(bucket, &h->group_hash) {
> + trace_hash_for_each_item(item, bucket) {
> + nr_groups++;
> + }
> + }
> +
> + if (nr_groups == 0)
> + return;
> +
> + groups = malloc_or_die(sizeof(*groups) * nr_groups);
> +
> + nr_groups = 0;
> +
> + trace_hash_for_each_bucket(bucket, &h->group_hash) {
> + trace_hash_while_item(item, bucket) {
> + groups[nr_groups++] = group_from_item(item);
> + trace_hash_del(item);
> + }
> + }
> +
> + qsort(groups, nr_groups, sizeof(*groups), compare_groups);
> +
> + for (i = 0; i < nr_groups; i++) {
> + output_group(h, groups[i]);
> + free_group(groups[i]);
> + }
> +
> + free(groups);
> +}
> +
> +static void output_handle(struct handle_data *h)
> +{
> + int i;
> +
> + show_global_task(h, h->global_task);
> + for (i = 0; i < h->cpus; i++)
> + show_global_task(h, &h->global_percpu_tasks[i]);
> +
> + output_groups(h);
> + output_tasks(h);
> +}
> +


Hmm, I wonder if the combination should be done on output. Because how
do you handle threads where an event is per task? Like scheduling one
out and the other one in? What about wake ups? If a bunch of threads
get woken up, how do we match the wakeups to the tasks?

Would it be easier to just match all the tasks by comm after the
recording was done? I would think matching pids would be quicker than
matching strings.

-- Steve

> int trace_profile(void)
> {
> struct handle_data *h;
> diff --git a/trace-record.c b/trace-record.c
> index 9874a12..7b61cde 100644
> --- a/trace-record.c
> +++ b/trace-record.c
> @@ -3703,6 +3703,7 @@ static void add_hook(struct buffer_instance *instance, const char *arg)
> }
>
> enum {
> + OPT_merge = 250,
> OPT_stderr = 251,
> OPT_profile = 252,
> OPT_nosplice = 253,
> @@ -3871,6 +3872,7 @@ void trace_record (int argc, char **argv)
> {"nosplice", no_argument, NULL, OPT_nosplice},
> {"profile", no_argument, NULL, OPT_profile},
> {"stderr", no_argument, NULL, OPT_stderr},
> + {"merge", no_argument, NULL, OPT_merge},
> {"help", no_argument, NULL, '?'},
> {NULL, 0, NULL, 0}
> };
> @@ -4102,6 +4104,9 @@ void trace_record (int argc, char **argv)
> close(1);
> dup2(2, 1);
> break;
> + case OPT_merge:
> + trace_profile_merge();
> + break;
> default:
> usage(argv);
> }
> diff --git a/trace-usage.c b/trace-usage.c
> index 520b14b..190990b 100644
> --- a/trace-usage.c
> +++ b/trace-usage.c
> @@ -48,6 +48,7 @@ static struct usage_help usage_help[] = {
> " -B create sub buffer and folling events will be enabled here\n"
> " -k do not reset the buffers after tracing.\n"
> " -i do not fail if an event is not found\n"
> + " --merge used with --profile, merge events for related comms\n"
> " --profile enable tracing options needed for report --profile\n"
> " --func-stack perform a stack trace for function tracer\n"
> " (use with caution)\n"

2015-05-20 17:36:18

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 2/5] trace-cmd: group schedule stacktraces together with --merge

On Fri, 8 May 2015 15:29:01 -0400
Josef Bacik <[email protected]> wrote:

> With --merge we want to be able to merge the scheduling information as well so
> we can see how the whole app spent its time. This is a little tricky because we
> have to sort based on comm instead of pid. It works out well enough because we
> store the intermediate information in the per task hash, but we want to group
> the final information together.

If we do the merge at the end, we wont need to worry about this.

-- Steve

>
> Signed-off-by: Josef Bacik <[email protected]>
>

2015-05-20 17:52:03

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH 1/5] trace-cmd: add option to group like comms for profile

On 05/20/2015 01:31 PM, Steven Rostedt wrote:
> Finally have some time to look at this.
>
>
> On Fri, 8 May 2015 15:29:00 -0400
> Josef Bacik <[email protected]> wrote:
>
>> When profiling large applications that have multiple children we really want to
>> see the profile of the app all together. Add a --merge option to accomplish
>> this for profile. This matches events based on comm instead of PID, this way we
>> can easilly see what a giant app is doing, such as HHVM. Thanks,
>>
>> Signed-off-by: Josef Bacik <[email protected]>
>> ---
>> trace-hash-local.h | 7 ++
>> trace-local.h | 1 +
>> trace-profile.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++----
>> trace-record.c | 5 ++
>
> Shouldn't we add this option to trace-cmd report --profile ?

Ah yeah I missed that part of it, I'll add that bit.

>
> Also, I think we need a man page.
>

No complaints here, but I'm not volunteering either ;).

> Also, I'm not so sure I like the "--merge" name. It's not very
> descriptive to what it does. What about "--by-comm"? Or maybe "--comm"?
>

I like --by-comm, I'll change it.

>> trace-usage.c | 1 +
>> 5 files changed, 208 insertions(+), 13 deletions(-)
>>
>> diff --git a/trace-hash-local.h b/trace-hash-local.h
>> index 997b11c..5bae3f3 100644
>> --- a/trace-hash-local.h
>> +++ b/trace-hash-local.h
>> @@ -48,4 +48,11 @@ static inline unsigned int trace_hash(int val)
>> return hash;
>> }
>>
>> +static inline unsigned int trace_hash_str(char *str)
>> +{
>> + int val = 0;
>> +
>> + while (*str) val += (int)*(str++);
>> + return trace_hash(val);
>> +}
>
> Hmm, the above doesn't account for order of characters. That is "bash"
> is the same as "shab". What about this:
>
> int val = 0;
> int i;
>
> for (i = 0; str[i]; i++)
> val += ((int)str[i]) << (i & 0xffffff);
> return trace_hash(val);
>

Good point, I meant to look up a real hashing thing later but obviously
forgot, I'll switch to yours.

>
>
>> #endif /* _TRACE_HASH_LOCAL_H */
>> diff --git a/trace-local.h b/trace-local.h
>> index ab2bd34..50f36ba 100644
>> --- a/trace-local.h
>> +++ b/trace-local.h
>> @@ -81,6 +81,7 @@ int trace_profile_record(struct tracecmd_input *handle,
>> void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hooks,
>> int global);
>> int trace_profile(void);
>> +void trace_profile_merge(void);
>>
>> struct tracecmd_input *
>> trace_stream_init(struct buffer_instance *instance, int cpu, int fd, int cpus,
>> diff --git a/trace-profile.c b/trace-profile.c
>> index f71a7c6..bd775e8 100644
>> --- a/trace-profile.c
>> +++ b/trace-profile.c
>> @@ -34,6 +34,7 @@
>> #define start_from_item(item) container_of(item, struct start_data, hash)
>> #define event_from_item(item) container_of(item, struct event_hash, hash)
>> #define stack_from_item(item) container_of(item, struct stack_data, hash)
>> +#define group_from_item(item) container_of(item, struct group_data, hash)
>> #define event_data_from_item(item) container_of(item, struct event_data, hash)
>>
>> static unsigned long long nsecs_per_sec(unsigned long long ts)
>> @@ -141,6 +142,12 @@ struct event_hash {
>> struct trace_hash stacks;
>> };
>>
>> +struct group_data {
>> + struct trace_hash_item hash;
>> + char *comm;
>> + struct trace_hash event_hash;
>> +};
>> +
>> struct task_data {
>> struct trace_hash_item hash;
>> int pid;
>> @@ -156,6 +163,7 @@ struct task_data {
>> struct event_hash *last_event;
>> struct pevent_record *last_stack;
>> struct handle_data *handle;
>> + struct group_data *group;
>> };
>>
>> struct cpu_info {
>> @@ -173,6 +181,7 @@ struct handle_data {
>> struct pevent *pevent;
>>
>> struct trace_hash events;
>> + struct trace_hash group_hash;
>>
>> struct cpu_info **cpu_data;
>>
>> @@ -196,6 +205,12 @@ struct handle_data {
>>
>> static struct handle_data *handles;
>> static struct event_data *stacktrace_event;
>> +static bool merge_like_comms = false;
>> +
>> +void trace_profile_merge(void)
>> +{
>> + merge_like_comms = true;
>> +}
>>
>> static struct start_data *
>> add_start(struct task_data *task,
>> @@ -250,6 +265,7 @@ static struct event_hash *
>> find_event_hash(struct task_data *task, struct event_data_match *edata)
>> {
>> struct event_hash *event_hash;
>> + struct trace_hash *hash;
>> struct trace_hash_item *item;
>> unsigned long long key;
>>
>> @@ -257,7 +273,11 @@ find_event_hash(struct task_data *task, struct event_data_match *edata)
>> (unsigned long)edata->search_val +
>> (unsigned long)edata->val;
>> key = trace_hash(key);
>> - item = trace_hash_find(&task->event_hash, key, match_event, edata);
>> + if (task->group)
>> + hash = &task->group->event_hash;
>> + else
>> + hash = &task->event_hash;
>> + item = trace_hash_find(hash, key, match_event, edata);
>> if (item)
>> return event_from_item(item);
>>
>> @@ -270,7 +290,7 @@ find_event_hash(struct task_data *task, struct event_data_match *edata)
>> event_hash->hash.key = key;
>> trace_hash_init(&event_hash->stacks, 32);
>>
>> - trace_hash_add(&task->event_hash, &event_hash->hash);
>> + trace_hash_add(hash, &event_hash->hash);
>>
>> return event_hash;
>> }
>> @@ -496,9 +516,59 @@ find_task(struct handle_data *h, int pid)
>> return last_task;
>> }
>>
>> +static int match_group(struct trace_hash_item *item, void *data)
>> +{
>> + struct group_data *group = group_from_item(item);
>> +
>> + return strcmp(group->comm, (char *)data) == 0;
>> +}
>> +
>> +static void add_group(struct handle_data *h, struct task_data *task)
>> +{
>> + unsigned long long key;
>> + struct trace_hash_item *item;
>> + void *data = task->comm;
>> +
>> + if (!merge_like_comms)
>> + return;
>> + if (task->group || !task->comm)
>> + return;
>> +
>> + key = trace_hash_str(task->comm);
>> +
>> + item = trace_hash_find(&h->group_hash, key, match_group, data);
>> + if (item) {
>> + task->group = group_from_item(item);
>> + } else {
>> + struct group_data *grp;
>> + struct trace_hash_item **bucket;
>> +
>> + grp = malloc_or_die(sizeof(*grp));
>> + memset(grp, 0, sizeof(*grp));
>> +
>> + grp->comm = strdup(task->comm);
>> + if (!grp->comm)
>> + die("strdup");
>> + grp->hash.key = key;
>> + trace_hash_add(&h->group_hash, &grp->hash);
>> + task->group = grp;
>> + trace_hash_init(&grp->event_hash, 32);
>> +
>> + trace_hash_for_each_bucket(bucket, &task->event_hash) {
>> + trace_hash_while_item(item, bucket) {
>> + struct event_hash *event_hash;
>> +
>> + event_hash = event_from_item(item);
>> + trace_hash_del(&event_hash->hash);
>> + trace_hash_add(&grp->event_hash, &event_hash->hash);
>> + }
>> + }
>> + }
>> +}
>> +
>> static void
>> -add_task_comm(struct task_data *task, struct format_field *field,
>> - struct pevent_record *record)
>> +add_task_comm(struct handle_data *h, struct task_data *task,
>> + struct format_field *field, struct pevent_record *record)
>> {
>> const char *comm;
>>
>> @@ -506,6 +576,7 @@ add_task_comm(struct task_data *task, struct format_field *field,
>> comm = record->data + field->offset;
>> memcpy(task->comm, comm, field->size);
>> task->comm[field->size] = 0;
>> + add_group(h, task);
>> }
>>
>> /* Account for tasks that don't have starts */
>> @@ -932,7 +1003,7 @@ static int handle_sched_switch_event(struct handle_data *h,
>>
>> task = find_task(h, prev_pid);
>> if (!task->comm)
>> - add_task_comm(task, h->switch_prev_comm, record);
>> + add_task_comm(h, task, h->switch_prev_comm, record);
>>
>> if (prev_state)
>> task->sleeping = 1;
>> @@ -940,13 +1011,13 @@ static int handle_sched_switch_event(struct handle_data *h,
>> task->sleeping = 0;
>>
>> /* task is being scheduled out. prev_state tells why */
>> - start = add_start(task, event_data, record, prev_pid, prev_state);
>> + start = add_start(task, event_data, record, prev_state, prev_state);
>> task->last_start = start;
>> task->last_event = NULL;
>>
>> task = find_task(h, next_pid);
>> if (!task->comm)
>> - add_task_comm(task, h->switch_next_comm, record);
>> + add_task_comm(h, task, h->switch_next_comm, record);
>>
>> /*
>> * If the next task was blocked, it required a wakeup to
>> @@ -1133,7 +1204,7 @@ static int handle_sched_wakeup_event(struct handle_data *h,
>>
>> task = find_task(h, pid);
>> if (!task->comm)
>> - add_task_comm(task, h->wakeup_comm, record);
>> + add_task_comm(h, task, h->wakeup_comm, record);
>>
>> /* if the task isn't sleeping, then ignore the wake up */
>> if (!task->sleeping) {
>> @@ -1196,6 +1267,7 @@ void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hook,
>>
>> trace_hash_init(&h->task_hash, 1024);
>> trace_hash_init(&h->events, 1024);
>> + trace_hash_init(&h->group_hash, 512);
>>
>> h->handle = handle;
>> h->pevent = pevent;
>> @@ -1874,6 +1946,9 @@ static void output_task(struct handle_data *h, struct task_data *task)
>> int nr_events = 0;
>> int i;
>>
>> + if (trace_hash_empty(&task->event_hash))
>> + return;
>> +
>
> Was this needed for this patch, or is it for groups? Even if the task
> doesn't have any events, I do sometimes want to see if it was recorder.
>
> If you added this to not show tasks if they belong to a group, why not
> add:
>
> if (task->group)
> return;
> ?

Yup I can do that too, I added it because we got a bunch of "task: blah"
output at the end using groups and it seemed ugly, so this check made it
only output tasks that weren't grouped. I'll make your change instead.

>
>> if (task->comm)
>> comm = task->comm;
>> else
>> @@ -1907,6 +1982,39 @@ static void output_task(struct handle_data *h, struct task_data *task)
>> free(events);
>> }
>>
>> +static void output_group(struct handle_data *h, struct group_data *group)
>> +{
>> + struct trace_hash_item **bucket;
>> + struct trace_hash_item *item;
>> + struct event_hash **events;
>> + int nr_events = 0;
>> + int i;
>> +
>> + printf("\ngroup: %s\n", group->comm);
>> +
>> + trace_hash_for_each_bucket(bucket, &group->event_hash) {
>> + trace_hash_for_each_item(item, bucket) {
>> + nr_events++;
>> + }
>> + }
>> +
>> + events = malloc_or_die(sizeof(*events) * nr_events);
>> +
>> + i = 0;
>> + trace_hash_for_each_bucket(bucket, &group->event_hash) {
>> + trace_hash_for_each_item(item, bucket) {
>> + events[i++] = event_from_item(item);
>> + }
>> + }
>> +
>> + qsort(events, nr_events, sizeof(*events), compare_events);
>> +
>> + for (i = 0; i < nr_events; i++)
>> + output_event(events[i]);
>> +
>> + free(events);
>> +}
>> +
>> static int compare_tasks(const void *a, const void *b)
>> {
>> struct task_data * const *A = a;
>> @@ -1919,6 +2027,14 @@ static int compare_tasks(const void *a, const void *b)
>> return 0;
>> }
>>
>> +static int compare_groups(const void *a, const void *b)
>> +{
>> + const char *A = a;
>> + const char *B = b;
>> +
>> + return strcmp(A, B);
>> +}
>> +
>> static void free_event_hash(struct event_hash *event_hash)
>> {
>> struct trace_hash_item **bucket;
>> @@ -1976,6 +2092,25 @@ static void free_task(struct task_data *task)
>> free(task);
>> }
>>
>> +static void free_group(struct group_data *group)
>> +{
>> + struct trace_hash_item **bucket;
>> + struct trace_hash_item *item;
>> + struct event_hash *event_hash;
>> +
>> + free(group->comm);
>> +
>> + trace_hash_for_each_bucket(bucket, &group->event_hash) {
>> + trace_hash_while_item(item, bucket) {
>> + event_hash = event_from_item(item);
>> + trace_hash_del(item);
>> + free_event_hash(event_hash);
>> + }
>> + }
>> + trace_hash_free(&group->event_hash);
>> + free(group);
>> +}
>> +
>> static void show_global_task(struct handle_data *h,
>> struct task_data *task)
>> {
>> @@ -1985,7 +2120,7 @@ static void show_global_task(struct handle_data *h,
>> output_task(h, task);
>> }
>>
>> -static void output_handle(struct handle_data *h)
>> +static void output_tasks(struct handle_data *h)
>> {
>> struct trace_hash_item **bucket;
>> struct trace_hash_item *item;
>> @@ -1993,10 +2128,6 @@ static void output_handle(struct handle_data *h)
>> int nr_tasks = 0;
>> int i;
>>
>> - show_global_task(h, h->global_task);
>> - for (i = 0; i < h->cpus; i++)
>> - show_global_task(h, &h->global_percpu_tasks[i]);
>> -
>> trace_hash_for_each_bucket(bucket, &h->task_hash) {
>> trace_hash_for_each_item(item, bucket) {
>> nr_tasks++;
>> @@ -2024,6 +2155,56 @@ static void output_handle(struct handle_data *h)
>> free(tasks);
>> }
>>
>> +static void output_groups(struct handle_data *h)
>> +{
>> + struct trace_hash_item **bucket;
>> + struct trace_hash_item *item;
>> + struct group_data **groups;
>> + int nr_groups = 0;
>> + int i;
>> +
>> + trace_hash_for_each_bucket(bucket, &h->group_hash) {
>> + trace_hash_for_each_item(item, bucket) {
>> + nr_groups++;
>> + }
>> + }
>> +
>> + if (nr_groups == 0)
>> + return;
>> +
>> + groups = malloc_or_die(sizeof(*groups) * nr_groups);
>> +
>> + nr_groups = 0;
>> +
>> + trace_hash_for_each_bucket(bucket, &h->group_hash) {
>> + trace_hash_while_item(item, bucket) {
>> + groups[nr_groups++] = group_from_item(item);
>> + trace_hash_del(item);
>> + }
>> + }
>> +
>> + qsort(groups, nr_groups, sizeof(*groups), compare_groups);
>> +
>> + for (i = 0; i < nr_groups; i++) {
>> + output_group(h, groups[i]);
>> + free_group(groups[i]);
>> + }
>> +
>> + free(groups);
>> +}
>> +
>> +static void output_handle(struct handle_data *h)
>> +{
>> + int i;
>> +
>> + show_global_task(h, h->global_task);
>> + for (i = 0; i < h->cpus; i++)
>> + show_global_task(h, &h->global_percpu_tasks[i]);
>> +
>> + output_groups(h);
>> + output_tasks(h);
>> +}
>> +
>
>
> Hmm, I wonder if the combination should be done on output. Because how
> do you handle threads where an event is per task? Like scheduling one
> out and the other one in? What about wake ups? If a bunch of threads
> get woken up, how do we match the wakeups to the tasks?

So the scheduling things are all tracked per task. I'm not actually
killing the tasks, I'm adding a grouping with all of the per task
information collated.

So for example, the schedule wakeup stuff gets handled the same way in
the per task hash bucket, and then when it completes and we add it to
the event list we go ahead and move it to the group stuff, after we've
done all of our accounting.

>
> Would it be easier to just match all the tasks by comm after the
> recording was done? I would think matching pids would be quicker than
> matching strings.
>

We could move it to the end I guess, I did it this way so there wasn't a
bunch of lag between stopping the profile and then outputting stuff
because we have to walk all of the tasks and collate its info at that
point. Do you feel strongly either way? I'll change it if you do, but
otherwise this seems to work fine and doesn't mess up the accounting
since the per-pid stuff is tracked in the task itself. Thanks,

Josef

2015-05-20 17:54:40

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH 2/5] trace-cmd: group schedule stacktraces together with --merge

On 05/20/2015 01:36 PM, Steven Rostedt wrote:
> On Fri, 8 May 2015 15:29:01 -0400
> Josef Bacik <[email protected]> wrote:
>
>> With --merge we want to be able to merge the scheduling information as well so
>> we can see how the whole app spent its time. This is a little tricky because we
>> have to sort based on comm instead of pid. It works out well enough because we
>> store the intermediate information in the per task hash, but we want to group
>> the final information together.
>
> If we do the merge at the end, we wont need to worry about this.

Yeah that's true, I guess this is probably the best argument for doing
it this way. Ignore my last email, I'll go ahead and rework stuff to do
the grouping at the end. Thanks,

Josef

2015-05-20 18:00:36

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/5] trace-cmd: lookup syscall names in profile

On Fri, 8 May 2015 15:29:02 -0400
Josef Bacik <[email protected]> wrote:

> When we profile syscall times we're just given the syscall number, not the name.
> This isn't helpful for human beings, so lookup the syscall names when we are
> printing out the information. This uses the audit library to get this
> information, I'm not married to it, but it seems that any app that does this
> mapping has to scrape /usr/include/asm/unistd*.h to pull out the names, so might
> as well just use somebody elses work.
>
> Signed-off-by: Josef Bacik <[email protected]>
> ---
> Makefile | 2 +-
> trace-profile.c | 19 ++++++++++++++++---
> 2 files changed, 17 insertions(+), 4 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index 63f7e79..402f711 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -160,7 +160,7 @@ bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
> plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
> python_dir_SQ = $(subst ','\'',$(python_dir))
>
> -LIBS = -L. -ltracecmd -ldl
> +LIBS = -L. -ltracecmd -ldl -laudit

This fails to compile for me because I don't have a audit-libs-devel
installed.

I'll keep this as is, but I'm going to add code to check for this
library, and go back to the default raw numbers if it doesn't exist.

-- Steve

2015-05-20 18:22:41

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH 3/5] trace-cmd: lookup syscall names in profile

On 05/20/2015 02:00 PM, Steven Rostedt wrote:
> On Fri, 8 May 2015 15:29:02 -0400
> Josef Bacik <[email protected]> wrote:
>
>> When we profile syscall times we're just given the syscall number, not the name.
>> This isn't helpful for human beings, so lookup the syscall names when we are
>> printing out the information. This uses the audit library to get this
>> information, I'm not married to it, but it seems that any app that does this
>> mapping has to scrape /usr/include/asm/unistd*.h to pull out the names, so might
>> as well just use somebody elses work.
>>
>> Signed-off-by: Josef Bacik <[email protected]>
>> ---
>> Makefile | 2 +-
>> trace-profile.c | 19 ++++++++++++++++---
>> 2 files changed, 17 insertions(+), 4 deletions(-)
>>
>> diff --git a/Makefile b/Makefile
>> index 63f7e79..402f711 100644
>> --- a/Makefile
>> +++ b/Makefile
>> @@ -160,7 +160,7 @@ bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
>> plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
>> python_dir_SQ = $(subst ','\'',$(python_dir))
>>
>> -LIBS = -L. -ltracecmd -ldl
>> +LIBS = -L. -ltracecmd -ldl -laudit
>
> This fails to compile for me because I don't have a audit-libs-devel
> installed.
>
> I'll keep this as is, but I'm going to add code to check for this
> library, and go back to the default raw numbers if it doesn't exist.
>

Maybe we should think about autoconf at some point? Thanks,

Josef

2015-05-20 18:56:24

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/5] trace-cmd: lookup syscall names in profile

On Wed, 20 May 2015 14:22:31 -0400
Josef Bacik <[email protected]> wrote:

> > I'll keep this as is, but I'm going to add code to check for this
> > library, and go back to the default raw numbers if it doesn't exist.
> >
>
> Maybe we should think about autoconf at some point? Thanks,

Egad no! I hate autoconf. And so do many others!

A simple test works fine. I don't want to require libaudit. Just warn
users that compile without it.

I like to install trace-cmd on minimal setups (like embedded devices).
autoconf will kill that.

-- Steve

2015-05-20 19:01:38

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/5] trace-cmd: lookup syscall names in profile

On Wed, 20 May 2015 14:56:17 -0400
Steven Rostedt <[email protected]> wrote:

> On Wed, 20 May 2015 14:22:31 -0400
> Josef Bacik <[email protected]> wrote:
>
> > > I'll keep this as is, but I'm going to add code to check for this
> > > library, and go back to the default raw numbers if it doesn't exist.
> > >
> >
> > Maybe we should think about autoconf at some point? Thanks,
>
> Egad no! I hate autoconf. And so do many others!
>
> A simple test works fine. I don't want to require libaudit. Just warn
> users that compile without it.
>
> I like to install trace-cmd on minimal setups (like embedded devices).
> autoconf will kill that.
>

Here's what I did to fix it:

-- Steve

diff --git a/Makefile b/Makefile
index 402f7114ab42..a8e63ad0c223 100644
--- a/Makefile
+++ b/Makefile
@@ -160,7 +160,7 @@ bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
python_dir_SQ = $(subst ','\'',$(python_dir))

-LIBS = -L. -ltracecmd -ldl -laudit
+LIBS = -L. -ltracecmd -ldl
LIB_FILE = libtracecmd.a

PACKAGES= gtk+-2.0 libxml-2.0 gthread-2.0
@@ -231,12 +231,25 @@ override CFLAGS += -D_GNU_SOURCE
ifndef NO_PTRACE
ifneq ($(call try-cc,$(SOURCE_PTRACE),),y)
NO_PTRACE = 1
- CFLAGS += -DWARN_NO_PTRACE
+ override CFLAGS += -DWARN_NO_PTRACE
endif
endif

ifdef NO_PTRACE
-CFLAGS += -DNO_PTRACE
+override CFLAGS += -DNO_PTRACE
+endif
+
+ifndef NO_AUDIT
+ifneq ($(call try-cc,$(SOURCE_AUDIT),-laudit),y)
+ NO_AUDIT = 1
+ override CFLAGS += -DWARN_NO_AUDIT
+endif
+endif
+
+ifdef NO_AUDIT
+override CFLAGS += -DNO_AUDIT
+else
+LIBS += -laudit
endif

# Append required CFLAGS
diff --git a/features.mk b/features.mk
index cb2e8bd897be..9c9d63c13a23 100644
--- a/features.mk
+++ b/features.mk
@@ -34,3 +34,21 @@ int main (void)
return ret;
}
endef
+
+define SOURCE_AUDIT
+#include <stdio.h>
+#include <libaudit.h>
+
+int main (void)
+{
+ char *name;
+ int ret;
+ ret = audit_detect_machine();
+ if (ret < 0)
+ return ret;
+ name = audit_syscall_to_name(1, ret);
+ if (!name)
+ return -1;
+ return ret;
+}
+endef
diff --git a/trace-profile.c b/trace-profile.c
index 24f989715f05..640dca822d46 100644
--- a/trace-profile.c
+++ b/trace-profile.c
@@ -23,11 +23,17 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#ifndef NO_AUDIT
#include <libaudit.h>
-
+#endif
#include "trace-local.h"
#include "trace-hash.h"

+#ifdef WARN_NO_AUDIT
+# warning "lib audit not found, using raw syscalls " \
+ "(install libaudit-devel and try again)"
+#endif
+
#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
#define TASK_STATE_MAX 1024

@@ -848,6 +854,7 @@ static void func_print(struct trace_seq *s, struct event_hash *event_hash)

static void syscall_print(struct trace_seq *s, struct event_hash *event_hash)
{
+#ifndef NO_AUDIT
const char *name = NULL;
int machine;

@@ -860,6 +867,7 @@ static void syscall_print(struct trace_seq *s, struct event_hash *event_hash)
trace_seq_printf(s, "syscall:%s", name);
return;
fail:
+#endif
trace_seq_printf(s, "%s:%d", event_hash->event_data->event->name,
(int)event_hash->val);
}

2015-05-20 19:07:11

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH 3/5] trace-cmd: lookup syscall names in profile

On 05/20/2015 03:01 PM, Steven Rostedt wrote:
> On Wed, 20 May 2015 14:56:17 -0400
> Steven Rostedt <[email protected]> wrote:
>
>> On Wed, 20 May 2015 14:22:31 -0400
>> Josef Bacik <[email protected]> wrote:
>>
>>>> I'll keep this as is, but I'm going to add code to check for this
>>>> library, and go back to the default raw numbers if it doesn't exist.
>>>>
>>>
>>> Maybe we should think about autoconf at some point? Thanks,
>>
>> Egad no! I hate autoconf. And so do many others!
>>
>> A simple test works fine. I don't want to require libaudit. Just warn
>> users that compile without it.
>>
>> I like to install trace-cmd on minimal setups (like embedded devices).
>> autoconf will kill that.
>>
>
> Here's what I did to fix it:
>
> -- Steve
>
> diff --git a/Makefile b/Makefile
> index 402f7114ab42..a8e63ad0c223 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -160,7 +160,7 @@ bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
> plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
> python_dir_SQ = $(subst ','\'',$(python_dir))
>
> -LIBS = -L. -ltracecmd -ldl -laudit
> +LIBS = -L. -ltracecmd -ldl
> LIB_FILE = libtracecmd.a
>
> PACKAGES= gtk+-2.0 libxml-2.0 gthread-2.0
> @@ -231,12 +231,25 @@ override CFLAGS += -D_GNU_SOURCE
> ifndef NO_PTRACE
> ifneq ($(call try-cc,$(SOURCE_PTRACE),),y)
> NO_PTRACE = 1
> - CFLAGS += -DWARN_NO_PTRACE
> + override CFLAGS += -DWARN_NO_PTRACE
> endif
> endif
>
> ifdef NO_PTRACE
> -CFLAGS += -DNO_PTRACE
> +override CFLAGS += -DNO_PTRACE
> +endif
> +
> +ifndef NO_AUDIT
> +ifneq ($(call try-cc,$(SOURCE_AUDIT),-laudit),y)
> + NO_AUDIT = 1
> + override CFLAGS += -DWARN_NO_AUDIT
> +endif
> +endif
> +
> +ifdef NO_AUDIT
> +override CFLAGS += -DNO_AUDIT
> +else
> +LIBS += -laudit
> endif
>
> # Append required CFLAGS
> diff --git a/features.mk b/features.mk
> index cb2e8bd897be..9c9d63c13a23 100644
> --- a/features.mk
> +++ b/features.mk
> @@ -34,3 +34,21 @@ int main (void)
> return ret;
> }
> endef
> +
> +define SOURCE_AUDIT
> +#include <stdio.h>
> +#include <libaudit.h>
> +
> +int main (void)
> +{
> + char *name;
> + int ret;
> + ret = audit_detect_machine();
> + if (ret < 0)
> + return ret;
> + name = audit_syscall_to_name(1, ret);
> + if (!name)
> + return -1;
> + return ret;
> +}
> +endef
> diff --git a/trace-profile.c b/trace-profile.c
> index 24f989715f05..640dca822d46 100644
> --- a/trace-profile.c
> +++ b/trace-profile.c
> @@ -23,11 +23,17 @@
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> +#ifndef NO_AUDIT
> #include <libaudit.h>
> -
> +#endif
> #include "trace-local.h"
> #include "trace-hash.h"
>
> +#ifdef WARN_NO_AUDIT
> +# warning "lib audit not found, using raw syscalls " \
> + "(install libaudit-devel and try again)"
> +#endif
> +
> #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
> #define TASK_STATE_MAX 1024
>
> @@ -848,6 +854,7 @@ static void func_print(struct trace_seq *s, struct event_hash *event_hash)
>
> static void syscall_print(struct trace_seq *s, struct event_hash *event_hash)
> {
> +#ifndef NO_AUDIT
> const char *name = NULL;
> int machine;
>
> @@ -860,6 +867,7 @@ static void syscall_print(struct trace_seq *s, struct event_hash *event_hash)
> trace_seq_printf(s, "syscall:%s", name);
> return;
> fail:
> +#endif
> trace_seq_printf(s, "%s:%d", event_hash->event_data->event->name,
> (int)event_hash->val);
> }
>

Looks good to me, thanks,

Josef