2014-02-06 18:11:15

by Steven Rostedt

[permalink] [raw]
Subject: [RFC][PATCH 4/4] perf/events: Use helper functions in event assignment to shrink macro size

From: Steven Rostedt <[email protected]>

The functions that assign the contents for the perf software events are
defined by the TRACE_EVENT() macros. Each event has its own unique
way to assign data to its buffer. When you have over 500 events,
that means there's 500 functions assigning data uniquely for each
event.

By making helper functions in the core kernel to do the work
instead, we can shrink the size of the kernel down a bit.

With a kernel configured with 707 events, the change in size was:

text data bss dec hex filename
12959102 1913504 9785344 24657950 178401e /tmp/vmlinux
12917629 1913568 9785344 24616541 1779e5d /tmp/vmlinux.patched

That's a total of 41473 bytes, which comes down to 82 bytes per event.

Note, most of the savings comes from moving the setup and final submit
into helper functions, where the setup does the work and stores the
data into a structure, and that structure is passed to the submit function,
moving the setup of the parameters of perf_trace_buf_submit().

Link: http://lkml.kernel.org/r/[email protected]

Cc: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/linux/ftrace_event.h | 17 ++++++++++++++
include/trace/ftrace.h | 33 ++++++++++----------------
kernel/trace/trace_event_perf.c | 51 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 80 insertions(+), 21 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4cc6852..f33162e 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -450,6 +450,23 @@ struct perf_event;

DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);

+struct perf_trace_event {
+ struct pt_regs regs;
+ struct hlist_head __percpu *head;
+ struct task_struct *task;
+ struct ftrace_event_call *event_call;
+ void *entry;
+ u64 addr;
+ u64 count;
+ int entry_size;
+ int rctx;
+ int constant;
+};
+
+extern void *perf_trace_event_setup(struct ftrace_event_call *event_call,
+ struct perf_trace_event *pe);
+extern void perf_trace_event_submit(struct perf_trace_event *pe);
+
extern int perf_trace_init(struct perf_event *event);
extern void perf_trace_destroy(struct perf_event *event);
extern int perf_trace_add(struct perf_event *event, int flags);
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index dc883a3..ba9173a 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -629,13 +629,13 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
#define __get_str(field) (char *)__get_dynamic_array(field)

#undef __perf_addr
-#define __perf_addr(a) (__addr = (a))
+#define __perf_addr(a) (__pe.addr = (a))

#undef __perf_count
-#define __perf_count(c) (__count = (c))
+#define __perf_count(c) (__pe.count = (c))

#undef __perf_task
-#define __perf_task(t) (__task = (t))
+#define __perf_task(t) (__pe.task = (t))

#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
@@ -645,28 +645,20 @@ perf_trace_##call(void *__data, proto) \
struct ftrace_event_call *event_call = __data; \
struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
struct ftrace_raw_##call *entry; \
- struct pt_regs __regs; \
- u64 __addr = 0, __count = 1; \
- struct task_struct *__task = NULL; \
- struct hlist_head *head; \
- int __entry_size; \
+ struct perf_trace_event __pe; \
int __data_size; \
- int rctx; \
+ \
+ __pe.task = NULL; \
\
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
\
- head = this_cpu_ptr(event_call->perf_events); \
- if (__builtin_constant_p(!__task) && !__task && \
- hlist_empty(head)) \
- return; \
+ __pe.constant = __builtin_constant_p(!__pe.task) && !__pe.task; \
\
- __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
- sizeof(u64)); \
- __entry_size -= sizeof(u32); \
+ __pe.entry_size = __data_size + sizeof(*entry); \
+ __pe.addr = 0; \
+ __pe.count = 1; \
\
- perf_fetch_caller_regs(&__regs); \
- entry = perf_trace_buf_prepare(__entry_size, \
- event_call->event.type, &__regs, &rctx); \
+ entry = perf_trace_event_setup(event_call, &__pe); \
if (!entry) \
return; \
\
@@ -674,8 +666,7 @@ perf_trace_##call(void *__data, proto) \
\
{ assign; } \
\
- perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
- __count, &__regs, head, __task); \
+ perf_trace_event_submit(&__pe); \
}

/*
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f42..6b01559 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,6 +21,57 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
/* Count the events in use (per event id, not per instance) */
static int total_ref_count;

+/**
+ * perf_trace_event_setup - set up for a perf sw event
+ * @event_call: The sw event that is to be recorded
+ * @pe: The perf event structure to pass to the submit function
+ *
+ * This is a helper function to keep the work to set up a perf sw
+ * event out of the inlined trace code. Since the same work needs to
+ * be done for the sw events, having a separate function helps keep
+ * from duplicating that code all over the kernel.
+ *
+ * The use of the perf event structure (@pe) is to store and pass the
+ * data to the perf_trace_event_submit() call and keep the setting
+ * up of the parameters of perf_trace_buf_submit() out of the inlined
+ * trace code.
+ */
+void *perf_trace_event_setup(struct ftrace_event_call *event_call,
+ struct perf_trace_event *pe)
+{
+ pe->head = this_cpu_ptr(event_call->perf_events);
+ if (pe->constant && hlist_empty(pe->head))
+ return NULL;
+
+ pe->entry_size = ALIGN(pe->entry_size + sizeof(u32), sizeof(u64));
+ pe->entry_size -= sizeof(u32);
+ pe->event_call = event_call;
+
+ perf_fetch_caller_regs(&pe->regs);
+
+ pe->entry = perf_trace_buf_prepare(pe->entry_size,
+ event_call->event.type, &pe->regs, &pe->rctx);
+ return pe->entry;
+}
+EXPORT_SYMBOL_GPL(perf_trace_event_setup);
+
+/**
+ * perf_trace_event_submit - submit from perf sw event
+ * @pe: perf event structure that holds all the necessary data
+ *
+ * This is a helper function that removes a lot of the setting up of
+ * the function parameters to call perf_trace_buf_submit() from the
+ * inlined code. Using the perf event structure @pe to store the
+ * information passed from perf_trace_event_setup() keeps the overhead
+ * of building the function call parameters out of the inlined functions.
+ */
+void perf_trace_event_submit(struct perf_trace_event *pe)
+{
+ perf_trace_buf_submit(pe->entry, pe->entry_size, pe->rctx, pe->addr,
+ pe->count, &pe->regs, pe->head, pe->task);
+}
+EXPORT_SYMBOL_GPL(perf_trace_event_submit);
+
static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
--
1.8.4.3


2014-02-06 18:47:43

by Steven Rostedt

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/4] perf/events: Use helper functions in event assignment to shrink macro size

On Thu, 06 Feb 2014 12:39:14 -0500
Steven Rostedt <[email protected]> wrote:

> From: Steven Rostedt <[email protected]>
>
> The functions that assign the contents for the perf software events are
> defined by the TRACE_EVENT() macros. Each event has its own unique
> way to assign data to its buffer. When you have over 500 events,
> that means there's 500 functions assigning data uniquely for each
> event.
>
> By making helper functions in the core kernel to do the work
> instead, we can shrink the size of the kernel down a bit.
>
> With a kernel configured with 707 events, the change in size was:
>
> text data bss dec hex filename
> 12959102 1913504 9785344 24657950 178401e /tmp/vmlinux
> 12917629 1913568 9785344 24616541 1779e5d /tmp/vmlinux.patched
>
> That's a total of 41473 bytes, which comes down to 82 bytes per event.
>
> Note, most of the savings comes from moving the setup and final submit
> into helper functions, where the setup does the work and stores the
> data into a structure, and that structure is passed to the submit function,
> moving the setup of the parameters of perf_trace_buf_submit().
>
> Link: http://lkml.kernel.org/r/[email protected]
>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Frederic Weisbecker <[email protected]>

Peter, Frederic,

Can you give an ack to this. Peter, you pretty much gave your ack before
except for one nit:

http://marc.info/?l=linux-kernel&m=134484533217124&w=2

> Signed-off-by: Steven Rostedt <[email protected]>
> ---
> include/linux/ftrace_event.h | 17 ++++++++++++++
> include/trace/ftrace.h | 33 ++++++++++----------------
> kernel/trace/trace_event_perf.c | 51 +++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 80 insertions(+), 21 deletions(-)
>

> +
> +/**
> + * perf_trace_event_submit - submit from perf sw event
> + * @pe: perf event structure that holds all the necessary data
> + *
> + * This is a helper function that removes a lot of the setting up of
> + * the function parameters to call perf_trace_buf_submit() from the
> + * inlined code. Using the perf event structure @pe to store the
> + * information passed from perf_trace_event_setup() keeps the overhead
> + * of building the function call parameters out of the inlined functions.
> + */
> +void perf_trace_event_submit(struct perf_trace_event *pe)
> +{
> + perf_trace_buf_submit(pe->entry, pe->entry_size, pe->rctx, pe->addr,
> + pe->count, &pe->regs, pe->head, pe->task);
> +}
> +EXPORT_SYMBOL_GPL(perf_trace_event_submit);
> +

You wanted the perf_trace_buf_submit() to go away. Now I could do that,
but that would require all other users to use the new perf_trace_event
structure to pass in. The only reason I did that was because this
structure is set up in perf_trace_event_setup() which passes in only
the event_call and the pe structure. In the setup function, the pe
structure is assigned all the information required for
perf_trace_event_submit().

What this does is to remove the function parameter setup from the
inlined tracepoint callers, which is quite a lot!

This is what a perf tracepoint currently looks like:

0000000000000b44 <perf_trace_sched_pi_setprio>:
b44: 55 push %rbp
b45: 48 89 e5 mov %rsp,%rbp
b48: 41 56 push %r14
b4a: 41 89 d6 mov %edx,%r14d
b4d: 41 55 push %r13
b4f: 49 89 fd mov %rdi,%r13
b52: 41 54 push %r12
b54: 49 89 f4 mov %rsi,%r12
b57: 53 push %rbx
b58: 48 81 ec c0 00 00 00 sub $0xc0,%rsp
b5f: 48 8b 9f 80 00 00 00 mov 0x80(%rdi),%rbx
b66: e8 00 00 00 00 callq b6b <perf_trace_sched_pi_setprio+0x27>
b67: R_X86_64_PC32 debug_smp_processor_id-0x4
b6b: 89 c0 mov %eax,%eax
b6d: 48 03 1c c5 00 00 00 add 0x0(,%rax,8),%rbx
b74: 00
b71: R_X86_64_32S __per_cpu_offset
b75: 48 83 3b 00 cmpq $0x0,(%rbx)
b79: 0f 84 92 00 00 00 je c11 <perf_trace_sched_pi_setprio+0xcd>
b7f: 48 8d bd 38 ff ff ff lea -0xc8(%rbp),%rdi
b86: e8 ab fe ff ff callq a36 <perf_fetch_caller_regs>
b8b: 41 8b 75 40 mov 0x40(%r13),%esi
b8f: 48 8d 8d 34 ff ff ff lea -0xcc(%rbp),%rcx
b96: 48 8d 95 38 ff ff ff lea -0xc8(%rbp),%rdx
b9d: bf 24 00 00 00 mov $0x24,%edi
ba2: 81 e6 ff ff 00 00 and $0xffff,%esi
ba8: e8 00 00 00 00 callq bad <perf_trace_sched_pi_setprio+0x69>
ba9: R_X86_64_PC32 perf_trace_buf_prepare-0x4
bad: 48 85 c0 test %rax,%rax
bb0: 74 5f je c11 <perf_trace_sched_pi_setprio+0xcd>
bb2: 49 8b 94 24 b0 04 00 mov 0x4b0(%r12),%rdx
bb9: 00
bba: 4c 8d 85 38 ff ff ff lea -0xc8(%rbp),%r8
bc1: 49 89 d9 mov %rbx,%r9
bc4: b9 24 00 00 00 mov $0x24,%ecx
bc9: be 01 00 00 00 mov $0x1,%esi
bce: 31 ff xor %edi,%edi
bd0: 48 89 50 08 mov %rdx,0x8(%rax)
bd4: 49 8b 94 24 b8 04 00 mov 0x4b8(%r12),%rdx
bdb: 00
bdc: 48 89 50 10 mov %rdx,0x10(%rax)
be0: 41 8b 94 24 0c 03 00 mov 0x30c(%r12),%edx
be7: 00
be8: 89 50 18 mov %edx,0x18(%rax)
beb: 41 8b 54 24 50 mov 0x50(%r12),%edx
bf0: 44 89 70 20 mov %r14d,0x20(%rax)
bf4: 89 50 1c mov %edx,0x1c(%rax)
bf7: 8b 95 34 ff ff ff mov -0xcc(%rbp),%edx
bfd: 48 c7 44 24 08 00 00 movq $0x0,0x8(%rsp)
c04: 00 00
c06: 89 14 24 mov %edx,(%rsp)
c09: 48 89 c2 mov %rax,%rdx
c0c: e8 00 00 00 00 callq c11 <perf_trace_sched_pi_setprio+0xcd>
c0d: R_X86_64_PC32 perf_tp_event-0x4
c11: 48 81 c4 c0 00 00 00 add $0xc0,%rsp
c18: 5b pop %rbx
c19: 41 5c pop %r12
c1b: 41 5d pop %r13
c1d: 41 5e pop %r14
c1f: 5d pop %rbp
c20: c3 retq


This is what it looks like after this patch:

0000000000000ab1 <perf_trace_sched_pi_setprio>:
ab1: 55 push %rbp
ab2: 48 89 e5 mov %rsp,%rbp
ab5: 41 54 push %r12
ab7: 41 89 d4 mov %edx,%r12d
aba: 53 push %rbx
abb: 48 89 f3 mov %rsi,%rbx
abe: 48 8d b5 08 ff ff ff lea -0xf8(%rbp),%rsi
ac5: 48 81 ec f0 00 00 00 sub $0xf0,%rsp
acc: 48 c7 45 b8 00 00 00 movq $0x0,-0x48(%rbp)
ad3: 00
ad4: c7 45 e8 01 00 00 00 movl $0x1,-0x18(%rbp)
adb: c7 45 e0 24 00 00 00 movl $0x24,-0x20(%rbp)
ae2: 48 c7 45 d0 00 00 00 movq $0x0,-0x30(%rbp)
ae9: 00
aea: 48 c7 45 d8 01 00 00 movq $0x1,-0x28(%rbp)
af1: 00
af2: e8 00 00 00 00 callq af7 <perf_trace_sched_pi_setprio+0x46>
af3: R_X86_64_PC32 perf_trace_event_setup-0x4
af7: 48 85 c0 test %rax,%rax
afa: 74 35 je b31 <perf_trace_sched_pi_setprio+0x80>
afc: 48 8b 93 b0 04 00 00 mov 0x4b0(%rbx),%rdx
b03: 48 8d bd 08 ff ff ff lea -0xf8(%rbp),%rdi
b0a: 48 89 50 08 mov %rdx,0x8(%rax)
b0e: 48 8b 93 b8 04 00 00 mov 0x4b8(%rbx),%rdx
b15: 48 89 50 10 mov %rdx,0x10(%rax)
b19: 8b 93 0c 03 00 00 mov 0x30c(%rbx),%edx
b1f: 89 50 18 mov %edx,0x18(%rax)
b22: 8b 53 50 mov 0x50(%rbx),%edx
b25: 44 89 60 20 mov %r12d,0x20(%rax)
b29: 89 50 1c mov %edx,0x1c(%rax)
b2c: e8 00 00 00 00 callq b31 <perf_trace_sched_pi_setprio+0x80>
b2d: R_X86_64_PC32 perf_trace_event_submit-0x4
b31: 48 81 c4 f0 00 00 00 add $0xf0,%rsp
b38: 5b pop %rbx
b39: 41 5c pop %r12
b3b: 5d pop %rbp
b3c: c3 retq


Thus, it's not really just a wrapper function, but a function that is
paired with the tracepoint setup version.

-- Steve

2014-02-12 19:58:57

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/4] perf/events: Use helper functions in event assignment to shrink macro size

> +void *perf_trace_event_setup(struct ftrace_event_call *event_call,
> + struct perf_trace_event *pe)
> +{
> + pe->head = this_cpu_ptr(event_call->perf_events);
> + if (pe->constant && hlist_empty(pe->head))
> + return NULL;
> +
> + pe->entry_size = ALIGN(pe->entry_size + sizeof(u32), sizeof(u64));
> + pe->entry_size -= sizeof(u32);
> + pe->event_call = event_call;
> +
> + perf_fetch_caller_regs(&pe->regs);

I think this one is wrong, we're getting the wrong caller now.

> + pe->entry = perf_trace_buf_prepare(pe->entry_size,
> + event_call->event.type, &pe->regs, &pe->rctx);
> + return pe->entry;
> +}
> +EXPORT_SYMBOL_GPL(perf_trace_event_setup);


> +void perf_trace_event_submit(struct perf_trace_event *pe)
> +{
> + perf_trace_buf_submit(pe->entry, pe->entry_size, pe->rctx, pe->addr,
> + pe->count, &pe->regs, pe->head, pe->task);
> +}
> +EXPORT_SYMBOL_GPL(perf_trace_event_submit);

This is ridiculous, perf_trace_buf_submit() is already a pointless
wrapper, which you now wrap moar...

It would be nice if we could reduce the api clutter a little and keep
this and the [ku]probe/syscall sites similar.

2014-02-21 18:54:01

by Steven Rostedt

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/4] perf/events: Use helper functions in event assignment to shrink macro size

On Wed, 12 Feb 2014 20:58:27 +0100
Peter Zijlstra <[email protected]> wrote:

> > +void *perf_trace_event_setup(struct ftrace_event_call *event_call,
> > + struct perf_trace_event *pe)
> > +{
> > + pe->head = this_cpu_ptr(event_call->perf_events);
> > + if (pe->constant && hlist_empty(pe->head))
> > + return NULL;
> > +
> > + pe->entry_size = ALIGN(pe->entry_size + sizeof(u32), sizeof(u64));
> > + pe->entry_size -= sizeof(u32);
> > + pe->event_call = event_call;
> > +
> > + perf_fetch_caller_regs(&pe->regs);
>
> I think this one is wrong, we're getting the wrong caller now.

Ah, I didn't see the magic CALLER_ADDR() usage there. I guess I could
pass that into this function too. But I think you had a fix for this,
so I'll wait.


>
> > + pe->entry = perf_trace_buf_prepare(pe->entry_size,
> > + event_call->event.type, &pe->regs, &pe->rctx);
> > + return pe->entry;
> > +}
> > +EXPORT_SYMBOL_GPL(perf_trace_event_setup);
>
>
> > +void perf_trace_event_submit(struct perf_trace_event *pe)
> > +{
> > + perf_trace_buf_submit(pe->entry, pe->entry_size, pe->rctx, pe->addr,
> > + pe->count, &pe->regs, pe->head, pe->task);
> > +}
> > +EXPORT_SYMBOL_GPL(perf_trace_event_submit);
>
> This is ridiculous, perf_trace_buf_submit() is already a pointless
> wrapper, which you now wrap moar...

I did a couple of git greps and found this:

$ git grep perf_trace_buf_submit
include/linux/ftrace_event.h:perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
include/trace/ftrace.h: perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
kernel/trace/trace_event_perf.c: perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
kernel/trace/trace_kprobe.c: perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
kernel/trace/trace_kprobe.c: perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
kernel/trace/trace_syscalls.c: perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
kernel/trace/trace_syscalls.c: perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
kernel/trace/trace_uprobe.c: perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);


$ git grep perf_tp_event
include/linux/ftrace_event.h: perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
include/linux/perf_event.h:extern void perf_tp_event(u64 addr, u64 count, void *record,
kernel/events/core.c:static int perf_tp_event_match(struct perf_event *event,
kernel/events/core.c:void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
kernel/events/core.c: if (perf_tp_event_match(event, &data, regs))
kernel/events/core.c: if (perf_tp_event_match(event, &data, regs))
kernel/events/core.c:EXPORT_SYMBOL_GPL(perf_tp_event);
kernel/events/core.c:static int perf_tp_event_init(struct perf_event *event)
kernel/events/core.c: .event_init = perf_tp_event_init,


As perf_tp_event() is only used internally, should we just rename it to
perf_trace_buf_submit, and remove the static inline of it?



>
> It would be nice if we could reduce the api clutter a little and keep
> this and the [ku]probe/syscall sites similar.

Yeah, looks like this can have a bigger clean up.

I think I'll just put my first three patches in my 3.15 queue, and we
can work on fixing the perf code for 3.16.

-- Steve