Date: Wed, 20 May 2015 13:31:30 -0400
From: Steven Rostedt <rostedt@goodmis.org>
To: Josef Bacik
Subject: Re: [PATCH 1/5] trace-cmd: add option to group like comms for profile
Message-ID: <20150520133130.5c170df1@gandalf.local.home>
In-Reply-To: <1431113344-22579-2-git-send-email-jbacik@fb.com>
References: <1431113344-22579-1-git-send-email-jbacik@fb.com>
	<1431113344-22579-2-git-send-email-jbacik@fb.com>
X-Mailing-List: linux-kernel@vger.kernel.org

Finally have some time to look at this.

On Fri, 8 May 2015 15:29:00 -0400
Josef Bacik wrote:

> When profiling large applications that have multiple children we really want to
> see the profile of the app all together. Add a --merge option to accomplish
> this for profile. This matches events based on comm instead of PID, this way we
> can easilly see what a giant app is doing, such as HHVM. Thanks,
>
> Signed-off-by: Josef Bacik
> ---
>  trace-hash-local.h |   7 ++
>  trace-local.h      |   1 +
>  trace-profile.c    | 207 +++++++++++++++++++++++++++++++++++++++++++++++++----
>  trace-record.c     |   5 ++

Shouldn't we add this option to trace-cmd report --profile ?

Also, I think we need a man page.

Also, I'm not so sure I like the "--merge" name. It's not very
descriptive of what it does. What about "--by-comm"? Or maybe "--comm"?

>  trace-usage.c      |   1 +
>  5 files changed, 208 insertions(+), 13 deletions(-)
>
> diff --git a/trace-hash-local.h b/trace-hash-local.h
> index 997b11c..5bae3f3 100644
> --- a/trace-hash-local.h
> +++ b/trace-hash-local.h
> @@ -48,4 +48,11 @@ static inline unsigned int trace_hash(int val)
>  	return hash;
>  }
>
> +static inline unsigned int trace_hash_str(char *str)
> +{
> +	int val = 0;
> +
> +	while (*str) val += (int)*(str++);
> +	return trace_hash(val);
> +}

Hmm, the above doesn't account for order of characters. That is,
"bash" is the same as "shab".
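
A minimal standalone check of that collision, for illustration. trace_hash()
is not quoted in full above, so a stub stands in for it, and the helper names
below are made up for this sketch; any mixing of the plain byte sum collides
the same way, since both strings add up to 414:

	#include <assert.h>
	#include <stdio.h>

	/* Stand-in for trace_hash(); the real mixing step does not matter here,
	 * because both inputs already produce the same summed value. */
	static unsigned int trace_hash_stub(int val)
	{
		return (unsigned int)val;
	}

	/* Copy of the proposed trace_hash_str(): an order-insensitive byte sum */
	static unsigned int trace_hash_str_sum(const char *str)
	{
		int val = 0;

		while (*str)
			val += (int)*(str++);
		return trace_hash_stub(val);
	}

	int main(void)
	{
		/* 'b' + 'a' + 's' + 'h' == 's' + 'h' + 'a' + 'b' == 414 */
		printf("bash: %u\n", trace_hash_str_sum("bash"));
		printf("shab: %u\n", trace_hash_str_sum("shab"));
		assert(trace_hash_str_sum("bash") == trace_hash_str_sum("shab"));
		return 0;
	}
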
What about this:

	int val = 0;
	int i;

	for (i = 0; str[i]; i++)
		val += ((int)str[i]) << (i & 0xffffff);

	return trace_hash(val);

>  #endif /* _TRACE_HASH_LOCAL_H */
> diff --git a/trace-local.h b/trace-local.h
> index ab2bd34..50f36ba 100644
> --- a/trace-local.h
> +++ b/trace-local.h
> @@ -81,6 +81,7 @@ int trace_profile_record(struct tracecmd_input *handle,
>  void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hooks,
>  			int global);
>  int trace_profile(void);
> +void trace_profile_merge(void);
>
>  struct tracecmd_input *
>  trace_stream_init(struct buffer_instance *instance, int cpu, int fd, int cpus,
> diff --git a/trace-profile.c b/trace-profile.c
> index f71a7c6..bd775e8 100644
> --- a/trace-profile.c
> +++ b/trace-profile.c
> @@ -34,6 +34,7 @@
>  #define start_from_item(item)	container_of(item, struct start_data, hash)
>  #define event_from_item(item)	container_of(item, struct event_hash, hash)
>  #define stack_from_item(item)	container_of(item, struct stack_data, hash)
> +#define group_from_item(item)	container_of(item, struct group_data, hash)
>  #define event_data_from_item(item)	container_of(item, struct event_data, hash)
>
>  static unsigned long long nsecs_per_sec(unsigned long long ts)
> @@ -141,6 +142,12 @@ struct event_hash {
>  	struct trace_hash	stacks;
>  };
>
> +struct group_data {
> +	struct trace_hash_item	hash;
> +	char			*comm;
> +	struct trace_hash	event_hash;
> +};
> +
>  struct task_data {
>  	struct trace_hash_item	hash;
>  	int			pid;
> @@ -156,6 +163,7 @@ struct task_data {
>  	struct event_hash	*last_event;
>  	struct pevent_record	*last_stack;
>  	struct handle_data	*handle;
> +	struct group_data	*group;
>  };
>
>  struct cpu_info {
> @@ -173,6 +181,7 @@ struct handle_data {
>  	struct pevent		*pevent;
>
>  	struct trace_hash	events;
> +	struct trace_hash	group_hash;
>
>  	struct cpu_info		**cpu_data;
>
> @@ -196,6 +205,12 @@ struct handle_data {
>
>  static struct handle_data *handles;
>  static struct event_data *stacktrace_event;
> +static bool merge_like_comms = false;
> +
> +void trace_profile_merge(void)
> +{
> +	merge_like_comms = true;
> +}
>
>  static struct start_data *
>  add_start(struct task_data *task,
> @@ -250,6 +265,7 @@ static struct event_hash *
>  find_event_hash(struct task_data *task, struct event_data_match *edata)
>  {
>  	struct event_hash *event_hash;
> +	struct trace_hash *hash;
>  	struct trace_hash_item *item;
>  	unsigned long long key;
>
> @@ -257,7 +273,11 @@ find_event_hash(struct task_data *task, struct event_data_match *edata)
>  		(unsigned long)edata->search_val +
>  		(unsigned long)edata->val;
>  	key = trace_hash(key);
> -	item = trace_hash_find(&task->event_hash, key, match_event, edata);
> +	if (task->group)
> +		hash = &task->group->event_hash;
> +	else
> +		hash = &task->event_hash;
> +	item = trace_hash_find(hash, key, match_event, edata);
>  	if (item)
>  		return event_from_item(item);
>
> @@ -270,7 +290,7 @@ find_event_hash(struct task_data *task, struct event_data_match *edata)
>  	event_hash->hash.key = key;
>  	trace_hash_init(&event_hash->stacks, 32);
>
> -	trace_hash_add(&task->event_hash, &event_hash->hash);
> +	trace_hash_add(hash, &event_hash->hash);
>
>  	return event_hash;
>  }
> @@ -496,9 +516,59 @@ find_task(struct handle_data *h, int pid)
>  	return last_task;
>  }
>
> +static int match_group(struct trace_hash_item *item, void *data)
> +{
> +	struct group_data *group = group_from_item(item);
> +
> +	return strcmp(group->comm, (char *)data) == 0;
> +}
> +
> +static void add_group(struct handle_data *h, struct task_data *task)
> +{
> +	unsigned long long key;
> +	struct trace_hash_item *item;
> +	void *data = task->comm;
> +
> +	if (!merge_like_comms)
> +		return;
> +	if (task->group || !task->comm)
> +		return;
> +
> +	key = trace_hash_str(task->comm);
> +
> +	item = trace_hash_find(&h->group_hash, key, match_group, data);
> +	if (item) {
> +		task->group = group_from_item(item);
> +	} else {
> +		struct group_data *grp;
> +		struct trace_hash_item **bucket;
> +
> +		grp = malloc_or_die(sizeof(*grp));
> +		memset(grp, 0, sizeof(*grp));
> +
> +		grp->comm = strdup(task->comm);
> +		if (!grp->comm)
> +			die("strdup");
> +		grp->hash.key = key;
> +		trace_hash_add(&h->group_hash, &grp->hash);
> +		task->group = grp;
> +		trace_hash_init(&grp->event_hash, 32);
> +
> +		trace_hash_for_each_bucket(bucket, &task->event_hash) {
> +			trace_hash_while_item(item, bucket) {
> +				struct event_hash *event_hash;
> +
> +				event_hash = event_from_item(item);
> +				trace_hash_del(&event_hash->hash);
> +				trace_hash_add(&grp->event_hash, &event_hash->hash);
> +			}
> +		}
> +	}
> +}
> +
>  static void
> -add_task_comm(struct task_data *task, struct format_field *field,
> -	      struct pevent_record *record)
> +add_task_comm(struct handle_data *h, struct task_data *task,
> +	      struct format_field *field, struct pevent_record *record)
>  {
>  	const char *comm;
>
> @@ -506,6 +576,7 @@ add_task_comm(struct task_data *task, struct format_field *field,
>  	comm = record->data + field->offset;
>  	memcpy(task->comm, comm, field->size);
>  	task->comm[field->size] = 0;
> +	add_group(h, task);
>  }
>
>  /* Account for tasks that don't have starts */
> @@ -932,7 +1003,7 @@ static int handle_sched_switch_event(struct handle_data *h,
>
>  	task = find_task(h, prev_pid);
>  	if (!task->comm)
> -		add_task_comm(task, h->switch_prev_comm, record);
> +		add_task_comm(h, task, h->switch_prev_comm, record);
>
>  	if (prev_state)
>  		task->sleeping = 1;
> @@ -940,13 +1011,13 @@ static int handle_sched_switch_event(struct handle_data *h,
>  		task->sleeping = 0;
>
>  	/* task is being scheduled out. prev_state tells why */
> -	start = add_start(task, event_data, record, prev_pid, prev_state);
> +	start = add_start(task, event_data, record, prev_state, prev_state);
>  	task->last_start = start;
>  	task->last_event = NULL;
>
>  	task = find_task(h, next_pid);
>  	if (!task->comm)
> -		add_task_comm(task, h->switch_next_comm, record);
> +		add_task_comm(h, task, h->switch_next_comm, record);
>
>  	/*
>  	 * If the next task was blocked, it required a wakeup to
> @@ -1133,7 +1204,7 @@ static int handle_sched_wakeup_event(struct handle_data *h,
>
>  	task = find_task(h, pid);
>  	if (!task->comm)
> -		add_task_comm(task, h->wakeup_comm, record);
> +		add_task_comm(h, task, h->wakeup_comm, record);
>
>  	/* if the task isn't sleeping, then ignore the wake up */
>  	if (!task->sleeping) {
> @@ -1196,6 +1267,7 @@ void trace_init_profile(struct tracecmd_input *handle, struct hook_list *hook,
>
>  	trace_hash_init(&h->task_hash, 1024);
>  	trace_hash_init(&h->events, 1024);
> +	trace_hash_init(&h->group_hash, 512);
>
>  	h->handle = handle;
>  	h->pevent = pevent;
> @@ -1874,6 +1946,9 @@ static void output_task(struct handle_data *h, struct task_data *task)
>  	int nr_events = 0;
>  	int i;
>
> +	if (trace_hash_empty(&task->event_hash))
> +		return;
> +

Was this needed for this patch, or is it for groups? Even if the task
doesn't have any events, I do sometimes want to see if it was recorded.

If you added this to not show tasks if they belong to a group, why not
add:

	if (task->group)
		return;

?
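
To make the difference between the two skip conditions concrete, here is a
small self-contained sketch. It uses simplified stand-in types invented for
this sketch, not the real trace-cmd structures: the empty-hash check also
hides a task that was recorded but produced no profiled events, while a
group check hides only tasks folded into a comm group:

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified stand-ins for the real structures, only for this sketch. */
	struct fake_task {
		const char *comm;
		int         nr_events;  /* stands in for the task's event hash */
		bool        in_group;   /* stands in for task->group being set */
	};

	/* Skip rule in the patch: hide any task whose event hash is empty. */
	static bool skip_empty_hash(const struct fake_task *t)
	{
		return t->nr_events == 0;
	}

	/* Suggested alternative: hide only tasks folded into a comm group. */
	static bool skip_grouped(const struct fake_task *t)
	{
		return t->in_group;
	}

	int main(void)
	{
		/* Recorded task with no profiled events: hidden by the first
		 * rule, still printed by the second. */
		struct fake_task idle  = { "some-daemon", 0, false };
		/* Task whose events were moved into its comm group: hidden
		 * by both rules. */
		struct fake_task child = { "hhvm", 0, true };

		printf("%s: empty-hash skip=%d, group skip=%d\n", idle.comm,
		       skip_empty_hash(&idle), skip_grouped(&idle));
		printf("%s: empty-hash skip=%d, group skip=%d\n", child.comm,
		       skip_empty_hash(&child), skip_grouped(&child));
		return 0;
	}
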
>  	if (task->comm)
>  		comm = task->comm;
>  	else
> @@ -1907,6 +1982,39 @@ static void output_task(struct handle_data *h, struct task_data *task)
>  	free(events);
>  }
>
> +static void output_group(struct handle_data *h, struct group_data *group)
> +{
> +	struct trace_hash_item **bucket;
> +	struct trace_hash_item *item;
> +	struct event_hash **events;
> +	int nr_events = 0;
> +	int i;
> +
> +	printf("\ngroup: %s\n", group->comm);
> +
> +	trace_hash_for_each_bucket(bucket, &group->event_hash) {
> +		trace_hash_for_each_item(item, bucket) {
> +			nr_events++;
> +		}
> +	}
> +
> +	events = malloc_or_die(sizeof(*events) * nr_events);
> +
> +	i = 0;
> +	trace_hash_for_each_bucket(bucket, &group->event_hash) {
> +		trace_hash_for_each_item(item, bucket) {
> +			events[i++] = event_from_item(item);
> +		}
> +	}
> +
> +	qsort(events, nr_events, sizeof(*events), compare_events);
> +
> +	for (i = 0; i < nr_events; i++)
> +		output_event(events[i]);
> +
> +	free(events);
> +}
> +
>  static int compare_tasks(const void *a, const void *b)
>  {
>  	struct task_data * const *A = a;
> @@ -1919,6 +2027,14 @@ static int compare_tasks(const void *a, const void *b)
>  	return 0;
>  }
>
> +static int compare_groups(const void *a, const void *b)
> +{
> +	const char *A = a;
> +	const char *B = b;
> +
> +	return strcmp(A, B);
> +}
> +
>  static void free_event_hash(struct event_hash *event_hash)
>  {
>  	struct trace_hash_item **bucket;
> @@ -1976,6 +2092,25 @@ static void free_task(struct task_data *task)
>  	free(task);
>  }
>
> +static void free_group(struct group_data *group)
> +{
> +	struct trace_hash_item **bucket;
> +	struct trace_hash_item *item;
> +	struct event_hash *event_hash;
> +
> +	free(group->comm);
> +
> +	trace_hash_for_each_bucket(bucket, &group->event_hash) {
> +		trace_hash_while_item(item, bucket) {
> +			event_hash = event_from_item(item);
> +			trace_hash_del(item);
> +			free_event_hash(event_hash);
> +		}
> +	}
> +	trace_hash_free(&group->event_hash);
> +	free(group);
> +}
> +
>  static void show_global_task(struct handle_data *h,
>  			     struct task_data *task)
>  {
> @@ -1985,7 +2120,7 @@ static void show_global_task(struct handle_data *h,
>  	output_task(h, task);
>  }
>
> -static void output_handle(struct handle_data *h)
> +static void output_tasks(struct handle_data *h)
>  {
>  	struct trace_hash_item **bucket;
>  	struct trace_hash_item *item;
> @@ -1993,10 +2128,6 @@ static void output_handle(struct handle_data *h)
>  	int nr_tasks = 0;
>  	int i;
>
> -	show_global_task(h, h->global_task);
> -	for (i = 0; i < h->cpus; i++)
> -		show_global_task(h, &h->global_percpu_tasks[i]);
> -
>  	trace_hash_for_each_bucket(bucket, &h->task_hash) {
>  		trace_hash_for_each_item(item, bucket) {
>  			nr_tasks++;
> @@ -2024,6 +2155,56 @@ static void output_handle(struct handle_data *h)
>  	free(tasks);
>  }
>
> +static void output_groups(struct handle_data *h)
> +{
> +	struct trace_hash_item **bucket;
> +	struct trace_hash_item *item;
> +	struct group_data **groups;
> +	int nr_groups = 0;
> +	int i;
> +
> +	trace_hash_for_each_bucket(bucket, &h->group_hash) {
> +		trace_hash_for_each_item(item, bucket) {
> +			nr_groups++;
> +		}
> +	}
> +
> +	if (nr_groups == 0)
> +		return;
> +
> +	groups = malloc_or_die(sizeof(*groups) * nr_groups);
> +
> +	nr_groups = 0;
> +
> +	trace_hash_for_each_bucket(bucket, &h->group_hash) {
> +		trace_hash_while_item(item, bucket) {
> +			groups[nr_groups++] = group_from_item(item);
> +			trace_hash_del(item);
> +		}
> +	}
> +
> +	qsort(groups, nr_groups, sizeof(*groups), compare_groups);
> +
> +	for (i = 0; i < nr_groups; i++) {
> +		output_group(h, groups[i]);
> +		free_group(groups[i]);
> +	}
> +
> +	free(groups);
> +}
> +
> +static void output_handle(struct handle_data *h)
> +{
> +	int i;
> +
> +	show_global_task(h, h->global_task);
> +	for (i = 0; i < h->cpus; i++)
> +		show_global_task(h, &h->global_percpu_tasks[i]);
> +
> +	output_groups(h);
> +	output_tasks(h);
> +}
> +

Hmm, I wonder if the combination should be done on output. Because how
do you handle threads where an event is per task? Like scheduling one
out and the other one in? What about wake ups? If a bunch of threads
get woken up, how do we match the wakeups to the tasks?

Would it be easier to just match all the tasks by comm after the
recording was done? I would think matching pids would be quicker than
matching strings.

-- Steve

>  int trace_profile(void)
>  {
>  	struct handle_data *h;
> diff --git a/trace-record.c b/trace-record.c
> index 9874a12..7b61cde 100644
> --- a/trace-record.c
> +++ b/trace-record.c
> @@ -3703,6 +3703,7 @@ static void add_hook(struct buffer_instance *instance, const char *arg)
>  }
>
>  enum {
> +	OPT_merge = 250,
>  	OPT_stderr = 251,
>  	OPT_profile = 252,
>  	OPT_nosplice = 253,
> @@ -3871,6 +3872,7 @@ void trace_record (int argc, char **argv)
>  		{"nosplice", no_argument, NULL, OPT_nosplice},
>  		{"profile", no_argument, NULL, OPT_profile},
>  		{"stderr", no_argument, NULL, OPT_stderr},
> +		{"merge", no_argument, NULL, OPT_merge},
>  		{"help", no_argument, NULL, '?'},
>  		{NULL, 0, NULL, 0}
>  	};
> @@ -4102,6 +4104,9 @@ void trace_record (int argc, char **argv)
>  			close(1);
>  			dup2(2, 1);
>  			break;
> +		case OPT_merge:
> +			trace_profile_merge();
> +			break;
>  		default:
>  			usage(argv);
>  		}
> diff --git a/trace-usage.c b/trace-usage.c
> index 520b14b..190990b 100644
> --- a/trace-usage.c
> +++ b/trace-usage.c
> @@ -48,6 +48,7 @@ static struct usage_help usage_help[] = {
>  		"          -B create sub buffer and folling events will be enabled here\n"
>  		"          -k do not reset the buffers after tracing.\n"
>  		"          -i do not fail if an event is not found\n"
> +		"          --merge used with --profile, merge events for related comms\n"
>  		"          --profile enable tracing options needed for report --profile\n"
>  		"          --func-stack perform a stack trace for function tracer\n"
>  		"                       (use with caution)\n"
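
Returning to the output-time question raised above: a rough sketch, using
simplified stand-in types invented for this sketch rather than the real
trace-cmd structures, of matching per-pid tasks by comm only when the report
is printed. Sort the per-pid task list by comm, then emit one group per run
of equal comms:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Simplified stand-in for a per-pid task record; only the comm and an
	 * event count are needed for this sketch. */
	struct simple_task {
		int         pid;
		const char *comm;
		int         nr_events;
	};

	static int cmp_task_comm(const void *a, const void *b)
	{
		const struct simple_task *ta = a, *tb = b;

		return strcmp(ta->comm, tb->comm);
	}

	/* Group per-pid tasks by comm at output time: sort by comm, then emit
	 * one summary line per run of identical comms. */
	static void output_by_comm(struct simple_task *tasks, int nr)
	{
		int i, j;

		qsort(tasks, nr, sizeof(*tasks), cmp_task_comm);

		for (i = 0; i < nr; i = j) {
			int events = 0;

			for (j = i; j < nr && !strcmp(tasks[j].comm, tasks[i].comm); j++)
				events += tasks[j].nr_events;

			printf("group: %s  tasks: %d  events: %d\n",
			       tasks[i].comm, j - i, events);
		}
	}

	int main(void)
	{
		struct simple_task tasks[] = {
			{ 100, "hhvm", 40 },
			{ 101, "hhvm", 25 },
			{ 102, "bash",  3 },
			{ 103, "hhvm", 10 },
		};

		output_by_comm(tasks, (int)(sizeof(tasks) / sizeof(tasks[0])));
		return 0;
	}

The idea in the question above is that the per-event bookkeeping would stay
keyed by pid while the trace is processed, and only the final report would
fold tasks with the same comm together; this sketch only illustrates that
final folding step.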