2022-03-02 14:08:59

by Steven Rostedt

[permalink] [raw]
Subject: [PATCH 2/2] tracing: Add sample code for custom trace events

From: "Steven Rostedt (Google)" <[email protected]>

Add sample code to show how to create custom trace events in the tracefs
directory that can be enabled and modified like any event in tracefs
(including triggers, histograms, synthetic events and event probes).

The example is creating a custom sched_switch and a sched_waking to limit
what is recorded:

If the custom sched switch only records the prev_prio, next_prio and
next_pid, it can bring the size from 64 bytes per event, down to just 16
bytes!

If sched_waking only records the prio and pid of the woken event, it will
bring the size down from 36 bytes to 12 bytes per event.

This will allow for a much smaller footprint into the ring buffer and keep
more events from dropping.

Suggested-by: Joel Fernandes <[email protected]>
Signed-off-by: Steven Rostedt (Google) <[email protected]>
---
samples/Kconfig | 8 +-
samples/Makefile | 1 +
samples/trace_events/Makefile | 2 +
samples/trace_events/trace_custom_sched.c | 280 ++++++++++++++++++++++
4 files changed, 290 insertions(+), 1 deletion(-)
create mode 100644 samples/trace_events/trace_custom_sched.c

diff --git a/samples/Kconfig b/samples/Kconfig
index 22cc921ae291..10e021c72282 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -14,7 +14,13 @@ config SAMPLE_TRACE_EVENTS
tristate "Build trace_events examples -- loadable modules only"
depends on EVENT_TRACING && m
help
- This build trace event example modules.
+ This builds the trace event example module.
+
+config SAMPLE_TRACE_CUSTOM_EVENTS
+ tristate "Build custom trace event example -- loadable modules only"
+ depends on EVENT_TRACING && m
+ help
+ This builds the custom trace event example module.

config SAMPLE_TRACE_PRINTK
tristate "Build trace_printk module - tests various trace_printk formats"
diff --git a/samples/Makefile b/samples/Makefile
index 1ae4de99c983..448343e8faeb 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/
subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp
subdir-$(CONFIG_SAMPLE_TIMER) += timers
obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/
+obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_events/
obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/
obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/
obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace/
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
index b78344e7bbed..e98afc447fe1 100644
--- a/samples/trace_events/Makefile
+++ b/samples/trace_events/Makefile
@@ -13,3 +13,5 @@
CFLAGS_trace-events-sample.o := -I$(src)

obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
+
+obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_custom_sched.o
diff --git a/samples/trace_events/trace_custom_sched.c b/samples/trace_events/trace_custom_sched.c
new file mode 100644
index 000000000000..5271a567d99b
--- /dev/null
+++ b/samples/trace_events/trace_custom_sched.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * event tracer
+ *
+ * Copyright (C) 2022 Google Inc, Steven Rostedt <[email protected]>
+ */
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/trace_events.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <trace/events/sched.h>
+
+#define THIS_SYSTEM "custom_sched"
+
+#define SCHED_PRINT_FMT \
+ C("prev_prio=%d next_pid=%d next_prio=%d", REC->prev_prio, REC->next_pid, \
+ REC->next_prio)
+
+#define SCHED_WAKING_FMT \
+ C("pid=%d prio=%d", REC->pid, REC->prio)
+
+#undef C
+#define C(a, b...) a, b
+
+static struct trace_event_fields sched_switch_fields[] = {
+ {
+ .type = "unsigned short",
+ .name = "prev_prio",
+ .size = sizeof(short),
+ .align = __alignof__(short),
+ .is_signed = 0,
+ .filter_type = FILTER_OTHER,
+ },
+ {
+ .type = "unsigned short",
+ .name = "next_prio",
+ .size = sizeof(short),
+ .align = __alignof__(short),
+ .is_signed = 0,
+ .filter_type = FILTER_OTHER,
+ },
+ {
+ .type = "unsigned int",
+ .name = "next_prio",
+ .size = sizeof(int),
+ .align = __alignof__(int),
+ .is_signed = 0,
+ .filter_type = FILTER_OTHER,
+ },
+ {}
+};
+
+struct sched_event {
+ struct trace_entry ent;
+ unsigned short prev_prio;
+ unsigned short next_prio;
+ unsigned int next_pid;
+};
+
+static struct trace_event_fields sched_waking_fields[] = {
+ {
+ .type = "unsigned int",
+ .name = "pid",
+ .size = sizeof(int),
+ .align = __alignof__(int),
+ .is_signed = 0,
+ .filter_type = FILTER_OTHER,
+ },
+ {
+ .type = "unsigned short",
+ .name = "prio",
+ .size = sizeof(short),
+ .align = __alignof__(short),
+ .is_signed = 0,
+ .filter_type = FILTER_OTHER,
+ },
+ {}
+};
+
+struct wake_event {
+ struct trace_entry ent;
+ unsigned int pid;
+ unsigned short prio;
+};
+
+static void sched_switch_probe(void *data, bool preempt, struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct trace_event_file *trace_file = data;
+ struct trace_event_buffer fbuffer;
+ struct sched_event *entry;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry));
+
+ if (!entry)
+ return;
+
+ entry->prev_prio = prev->prio;
+ entry->next_prio = next->prio;
+ entry->next_pid = next->pid;
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+static struct trace_event_class sched_switch_class = {
+ .system = THIS_SYSTEM,
+ .reg = trace_event_reg,
+ .fields_array = sched_switch_fields,
+ .fields = LIST_HEAD_INIT(sched_switch_class.fields),
+ .probe = sched_switch_probe,
+};
+
+static void sched_waking_probe(void *data, struct task_struct *t)
+{
+ struct trace_event_file *trace_file = data;
+ struct trace_event_buffer fbuffer;
+ struct wake_event *entry;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry));
+
+ if (!entry)
+ return;
+
+ entry->prio = t->prio;
+ entry->pid = t->pid;
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+static struct trace_event_class sched_waking_class = {
+ .system = THIS_SYSTEM,
+ .reg = trace_event_reg,
+ .fields_array = sched_waking_fields,
+ .fields = LIST_HEAD_INIT(sched_waking_class.fields),
+ .probe = sched_waking_probe,
+};
+
+static enum print_line_t sched_switch_output(struct trace_iterator *iter, int flags,
+ struct trace_event *trace_event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct sched_event *REC = (struct sched_event *)iter->ent;
+ int ret;
+
+ ret = trace_raw_output_prep(iter, trace_event);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+
+ trace_seq_printf(s, SCHED_PRINT_FMT);
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions sched_switch_funcs = {
+ .trace = sched_switch_output,
+};
+
+static enum print_line_t sched_waking_output(struct trace_iterator *iter, int flags,
+ struct trace_event *trace_event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct wake_event *REC = (struct wake_event *)iter->ent;
+ int ret;
+
+ ret = trace_raw_output_prep(iter, trace_event);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+
+ trace_seq_printf(s, SCHED_WAKING_FMT);
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions sched_waking_funcs = {
+ .trace = sched_waking_output,
+};
+
+#undef C
+#define C(a, b...) #a "," __stringify(b)
+
+static struct trace_event_call sched_switch_call = {
+ .class = &sched_switch_class,
+ .event = {
+ .funcs = &sched_switch_funcs,
+ },
+ .print_fmt = SCHED_PRINT_FMT,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+ .module = THIS_MODULE,
+#else
+ .mod = THIS_MODULE,
+#endif
+ .flags = TRACE_EVENT_FL_TRACEPOINT,
+};
+
+static struct trace_event_call sched_waking_call = {
+ .class = &sched_waking_class,
+ .event = {
+ .funcs = &sched_waking_funcs,
+ },
+ .print_fmt = SCHED_WAKING_FMT,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+ .module = THIS_MODULE,
+#else
+ .mod = THIS_MODULE,
+#endif
+ .flags = TRACE_EVENT_FL_TRACEPOINT,
+};
+
+static void fct(struct tracepoint *tp, void *priv)
+{
+ if (tp->name && strcmp(tp->name, "sched_switch") == 0)
+ sched_switch_call.tp = tp;
+ else if (tp->name && strcmp(tp->name, "sched_waking") == 0)
+ sched_waking_call.tp = tp;
+}
+
+static int add_event(struct trace_event_call *call)
+{
+ int ret;
+
+ ret = register_trace_event(&call->event);
+ if (WARN_ON(!ret))
+ return -ENODEV;
+
+ ret = trace_add_event_call(call);
+ if (WARN_ON(ret))
+ unregister_trace_event(&call->event);
+
+ return ret;
+}
+
+static int __init trace_sched_init(void)
+{
+ int ret;
+
+ check_trace_callback_type_sched_switch(sched_switch_probe);
+ check_trace_callback_type_sched_waking(sched_waking_probe);
+
+ for_each_kernel_tracepoint(fct, NULL);
+
+ ret = add_event(&sched_switch_call);
+ if (ret)
+ return ret;
+
+ ret = add_event(&sched_waking_call);
+ if (ret)
+ trace_remove_event_call(&sched_switch_call);
+
+ return ret;
+}
+
+static void __exit trace_sched_exit(void)
+{
+ trace_set_clr_event(THIS_SYSTEM, "sched_switch", 0);
+ trace_set_clr_event(THIS_SYSTEM, "sched_waking", 0);
+
+ trace_remove_event_call(&sched_switch_call);
+ trace_remove_event_call(&sched_waking_call);
+}
+
+module_init(trace_sched_init);
+module_exit(trace_sched_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("Custom scheduling events");
+MODULE_LICENSE("GPL");
+
--
2.34.1


2022-03-03 01:40:20

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH 2/2] tracing: Add sample code for custom trace events

On Tue, 01 Mar 2022 22:24:16 -0500
Steven Rostedt <[email protected]> wrote:

> From: "Steven Rostedt (Google)" <[email protected]>
>
> Add sample code to show how to create custom trace events in the tracefs
> directory that can be enabled and modified like any event in tracefs
> (including triggers, histograms, synthetic events and event probes).
>
> The example is creating a custom sched_switch and a sched_waking to limit
> what is recorded:
>
> If the custom sched switch only records the prev_prio, next_prio and
> next_pid, it can bring the size from 64 bytes per event, down to just 16
> bytes!
>
> If sched_waking only records the prio and pid of the woken event, it will
> bring the size down from 36 bytes to 12 bytes per event.
>
> This will allow for a much smaller footprint into the ring buffer and keep
> more events from dropping.
>
> Suggested-by: Joel Fernandes <[email protected]>
> Signed-off-by: Steven Rostedt (Google) <[email protected]>
> ---
> samples/Kconfig | 8 +-
> samples/Makefile | 1 +
> samples/trace_events/Makefile | 2 +
> samples/trace_events/trace_custom_sched.c | 280 ++++++++++++++++++++++
> 4 files changed, 290 insertions(+), 1 deletion(-)
> create mode 100644 samples/trace_events/trace_custom_sched.c
>
> diff --git a/samples/Kconfig b/samples/Kconfig
> index 22cc921ae291..10e021c72282 100644
> --- a/samples/Kconfig
> +++ b/samples/Kconfig
> @@ -14,7 +14,13 @@ config SAMPLE_TRACE_EVENTS
> tristate "Build trace_events examples -- loadable modules only"
> depends on EVENT_TRACING && m
> help
> - This build trace event example modules.
> + This builds the trace event example module.
> +
> +config SAMPLE_TRACE_CUSTOM_EVENTS
> + tristate "Build custom trace event example -- loadable modules only"
> + depends on EVENT_TRACING && m
> + help
> + This builds the custom trace event example module.
>
> config SAMPLE_TRACE_PRINTK
> tristate "Build trace_printk module - tests various trace_printk formats"
> diff --git a/samples/Makefile b/samples/Makefile
> index 1ae4de99c983..448343e8faeb 100644
> --- a/samples/Makefile
> +++ b/samples/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/
> subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp
> subdir-$(CONFIG_SAMPLE_TIMER) += timers
> obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/
> +obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_events/
> obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/
> obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/
> obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace/
> diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
> index b78344e7bbed..e98afc447fe1 100644
> --- a/samples/trace_events/Makefile
> +++ b/samples/trace_events/Makefile
> @@ -13,3 +13,5 @@
> CFLAGS_trace-events-sample.o := -I$(src)
>
> obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
> +
> +obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_custom_sched.o
> diff --git a/samples/trace_events/trace_custom_sched.c b/samples/trace_events/trace_custom_sched.c
> new file mode 100644
> index 000000000000..5271a567d99b
> --- /dev/null
> +++ b/samples/trace_events/trace_custom_sched.c
> @@ -0,0 +1,280 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * event tracer
> + *
> + * Copyright (C) 2022 Google Inc, Steven Rostedt <[email protected]>
> + */
> +
> +#define pr_fmt(fmt) fmt
> +
> +#include <linux/trace_events.h>
> +#include <linux/version.h>
> +#include <linux/module.h>
> +#include <linux/sched.h>
> +#include <trace/events/sched.h>
> +
> +#define THIS_SYSTEM "custom_sched"
> +
> +#define SCHED_PRINT_FMT \
> + C("prev_prio=%d next_pid=%d next_prio=%d", REC->prev_prio, REC->next_pid, \
> + REC->next_prio)
> +
> +#define SCHED_WAKING_FMT \
> + C("pid=%d prio=%d", REC->pid, REC->prio)
> +
> +#undef C
> +#define C(a, b...) a, b
> +
> +static struct trace_event_fields sched_switch_fields[] = {
> + {
> + .type = "unsigned short",
> + .name = "prev_prio",
> + .size = sizeof(short),
> + .align = __alignof__(short),
> + .is_signed = 0,
> + .filter_type = FILTER_OTHER,
> + },
> + {
> + .type = "unsigned short",
> + .name = "next_prio",
> + .size = sizeof(short),
> + .align = __alignof__(short),
> + .is_signed = 0,
> + .filter_type = FILTER_OTHER,
> + },
> + {
> + .type = "unsigned int",
> + .name = "next_prio",
> + .size = sizeof(int),
> + .align = __alignof__(int),
> + .is_signed = 0,
> + .filter_type = FILTER_OTHER,
> + },
> + {}
> +};
> +
> +struct sched_event {
> + struct trace_entry ent;
> + unsigned short prev_prio;
> + unsigned short next_prio;
> + unsigned int next_pid;
> +};
> +
> +static struct trace_event_fields sched_waking_fields[] = {
> + {
> + .type = "unsigned int",
> + .name = "pid",
> + .size = sizeof(int),
> + .align = __alignof__(int),
> + .is_signed = 0,
> + .filter_type = FILTER_OTHER,
> + },
> + {
> + .type = "unsigned short",
> + .name = "prio",
> + .size = sizeof(short),
> + .align = __alignof__(short),
> + .is_signed = 0,
> + .filter_type = FILTER_OTHER,
> + },
> + {}
> +};
> +
> +struct wake_event {
> + struct trace_entry ent;
> + unsigned int pid;
> + unsigned short prio;
> +};
> +
> +static void sched_switch_probe(void *data, bool preempt, struct task_struct *prev,
> + struct task_struct *next)
> +{
> + struct trace_event_file *trace_file = data;
> + struct trace_event_buffer fbuffer;
> + struct sched_event *entry;
> +
> + if (trace_trigger_soft_disabled(trace_file))
> + return;
> +
> + entry = trace_event_buffer_reserve(&fbuffer, trace_file,
> + sizeof(*entry));
> +
> + if (!entry)
> + return;
> +
> + entry->prev_prio = prev->prio;
> + entry->next_prio = next->prio;
> + entry->next_pid = next->pid;
> +
> + trace_event_buffer_commit(&fbuffer);
> +}
> +
> +static struct trace_event_class sched_switch_class = {
> + .system = THIS_SYSTEM,
> + .reg = trace_event_reg,
> + .fields_array = sched_switch_fields,
> + .fields = LIST_HEAD_INIT(sched_switch_class.fields),
> + .probe = sched_switch_probe,
> +};
> +
> +static void sched_waking_probe(void *data, struct task_struct *t)
> +{
> + struct trace_event_file *trace_file = data;
> + struct trace_event_buffer fbuffer;
> + struct wake_event *entry;
> +
> + if (trace_trigger_soft_disabled(trace_file))
> + return;
> +
> + entry = trace_event_buffer_reserve(&fbuffer, trace_file,
> + sizeof(*entry));
> +
> + if (!entry)
> + return;
> +
> + entry->prio = t->prio;
> + entry->pid = t->pid;
> +
> + trace_event_buffer_commit(&fbuffer);
> +}
> +
> +static struct trace_event_class sched_waking_class = {
> + .system = THIS_SYSTEM,
> + .reg = trace_event_reg,
> + .fields_array = sched_waking_fields,
> + .fields = LIST_HEAD_INIT(sched_waking_class.fields),
> + .probe = sched_waking_probe,
> +};
> +
> +static enum print_line_t sched_switch_output(struct trace_iterator *iter, int flags,
> + struct trace_event *trace_event)
> +{
> + struct trace_seq *s = &iter->seq;
> + struct sched_event *REC = (struct sched_event *)iter->ent;
> + int ret;
> +
> + ret = trace_raw_output_prep(iter, trace_event);
> + if (ret != TRACE_TYPE_HANDLED)
> + return ret;
> +
> + trace_seq_printf(s, SCHED_PRINT_FMT);
> + trace_seq_putc(s, '\n');
> +
> + return trace_handle_return(s);
> +}
> +
> +static struct trace_event_functions sched_switch_funcs = {
> + .trace = sched_switch_output,
> +};
> +
> +static enum print_line_t sched_waking_output(struct trace_iterator *iter, int flags,
> + struct trace_event *trace_event)
> +{
> + struct trace_seq *s = &iter->seq;
> + struct wake_event *REC = (struct wake_event *)iter->ent;
> + int ret;
> +
> + ret = trace_raw_output_prep(iter, trace_event);
> + if (ret != TRACE_TYPE_HANDLED)
> + return ret;
> +
> + trace_seq_printf(s, SCHED_WAKING_FMT);
> + trace_seq_putc(s, '\n');
> +
> + return trace_handle_return(s);
> +}
> +
> +static struct trace_event_functions sched_waking_funcs = {
> + .trace = sched_waking_output,
> +};
> +
> +#undef C
> +#define C(a, b...) #a "," __stringify(b)
> +
> +static struct trace_event_call sched_switch_call = {
> + .class = &sched_switch_class,
> + .event = {
> + .funcs = &sched_switch_funcs,
> + },
> + .print_fmt = SCHED_PRINT_FMT,
> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
> + .module = THIS_MODULE,
> +#else
> + .mod = THIS_MODULE,
> +#endif

I think this example code doesn't need to care about older kernels.

Others looks good to me.

Reviewed-by: Masami Hiramatsu <[email protected]>

Hopefully, it is more helpful if there are more comments for other developers. :-)
(and usage comments too)

Thank you,

> + .flags = TRACE_EVENT_FL_TRACEPOINT,
> +};
> +
> +static struct trace_event_call sched_waking_call = {
> + .class = &sched_waking_class,
> + .event = {
> + .funcs = &sched_waking_funcs,
> + },
> + .print_fmt = SCHED_WAKING_FMT,
> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
> + .module = THIS_MODULE,
> +#else
> + .mod = THIS_MODULE,
> +#endif
> + .flags = TRACE_EVENT_FL_TRACEPOINT,
> +};
> +
> +static void fct(struct tracepoint *tp, void *priv)
> +{
> + if (tp->name && strcmp(tp->name, "sched_switch") == 0)
> + sched_switch_call.tp = tp;
> + else if (tp->name && strcmp(tp->name, "sched_waking") == 0)
> + sched_waking_call.tp = tp;
> +}
> +
> +static int add_event(struct trace_event_call *call)
> +{
> + int ret;
> +
> + ret = register_trace_event(&call->event);
> + if (WARN_ON(!ret))
> + return -ENODEV;
> +
> + ret = trace_add_event_call(call);
> + if (WARN_ON(ret))
> + unregister_trace_event(&call->event);
> +
> + return ret;
> +}
> +
> +static int __init trace_sched_init(void)
> +{
> + int ret;
> +
> + check_trace_callback_type_sched_switch(sched_switch_probe);
> + check_trace_callback_type_sched_waking(sched_waking_probe);
> +
> + for_each_kernel_tracepoint(fct, NULL);
> +
> + ret = add_event(&sched_switch_call);
> + if (ret)
> + return ret;
> +
> + ret = add_event(&sched_waking_call);
> + if (ret)
> + trace_remove_event_call(&sched_switch_call);
> +
> + return ret;
> +}
> +
> +static void __exit trace_sched_exit(void)
> +{
> + trace_set_clr_event(THIS_SYSTEM, "sched_switch", 0);
> + trace_set_clr_event(THIS_SYSTEM, "sched_waking", 0);
> +
> + trace_remove_event_call(&sched_switch_call);
> + trace_remove_event_call(&sched_waking_call);
> +}
> +
> +module_init(trace_sched_init);
> +module_exit(trace_sched_exit);
> +
> +MODULE_AUTHOR("Steven Rostedt");
> +MODULE_DESCRIPTION("Custom scheduling events");
> +MODULE_LICENSE("GPL");
> +
> --
> 2.34.1


--
Masami Hiramatsu <[email protected]>

2022-03-03 03:23:27

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH 2/2] tracing: Add sample code for custom trace events

Hi Steven,

On Tue, Mar 1, 2022 at 10:28 PM Steven Rostedt <[email protected]> wrote:
>
> From: "Steven Rostedt (Google)" <[email protected]>
>
> Add sample code to show how to create custom trace events in the tracefs
> directory that can be enabled and modified like any event in tracefs
> (including triggers, histograms, synthetic events and event probes).
>
> The example is creating a custom sched_switch and a sched_waking to limit
> what is recorded:
>
> If the custom sched switch only records the prev_prio, next_prio and
> next_pid, it can bring the size from 64 bytes per event, down to just 16
> bytes!
>
> If sched_waking only records the prio and pid of the woken event, it will
> bring the size down from 36 bytes to 12 bytes per event.
>
> This will allow for a much smaller footprint into the ring buffer and keep
> more events from dropping.
>
> Suggested-by: Joel Fernandes <[email protected]>
> Signed-off-by: Steven Rostedt (Google) <[email protected]>

Just 2 event fields related comments below, but other than that I
tested and it works quite well so:
Tested-By: Joel Fernandes <[email protected]>

> ---
[..]
> diff --git a/samples/trace_events/trace_custom_sched.c b/samples/trace_events/trace_custom_sched.c
> new file mode 100644
> index 000000000000..5271a567d99b
> --- /dev/null
> +++ b/samples/trace_events/trace_custom_sched.c
> @@ -0,0 +1,280 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * event tracer
> + *
> + * Copyright (C) 2022 Google Inc, Steven Rostedt <[email protected]>
> + */
> +
> +#define pr_fmt(fmt) fmt
> +
> +#include <linux/trace_events.h>
> +#include <linux/version.h>
> +#include <linux/module.h>
> +#include <linux/sched.h>
> +#include <trace/events/sched.h>
> +
> +#define THIS_SYSTEM "custom_sched"
> +
> +#define SCHED_PRINT_FMT \
> + C("prev_prio=%d next_pid=%d next_prio=%d", REC->prev_prio, REC->next_pid, \

Probably prev_pid should be included so we know what the previous task was?

Or are you expecting that a prior sched_switch would have that
information? If so, then prev_prio is also not needed as the previous
sched_switch's next_prio would have the prio. That would save even
more space too..

> + REC->next_prio)
> +
> +#define SCHED_WAKING_FMT \
> + C("pid=%d prio=%d", REC->pid, REC->prio)
> +

I think including the target_cpu of a wake up is also really important
to show where the task is going to be awakened, and maybe we can drop
prio since a subsequent sched_switch will have the priority in
next_prio.

[..]
> +static void __exit trace_sched_exit(void)
> +{
> + trace_set_clr_event(THIS_SYSTEM, "sched_switch", 0);
> + trace_set_clr_event(THIS_SYSTEM, "sched_waking", 0);
> +
> + trace_remove_event_call(&sched_switch_call);
> + trace_remove_event_call(&sched_waking_call);
> +}
> +
> +module_init(trace_sched_init);
> +module_exit(trace_sched_exit);
> +
> +MODULE_AUTHOR("Steven Rostedt");
> +MODULE_DESCRIPTION("Custom scheduling events");
> +MODULE_LICENSE("GPL");
> +

Remove extra lines from the end of the file?

Thanks,
Joel

2022-03-03 03:26:33

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 2/2] tracing: Add sample code for custom trace events

On Thu, 3 Mar 2022 10:40:06 +0900
Masami Hiramatsu <[email protected]> wrote:

> > +static struct trace_event_call sched_switch_call = {
> > + .class = &sched_switch_class,
> > + .event = {
> > + .funcs = &sched_switch_funcs,
> > + },
> > + .print_fmt = SCHED_PRINT_FMT,
> > +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
> > + .module = THIS_MODULE,
> > +#else
> > + .mod = THIS_MODULE,
> > +#endif
>
> I think this example code doesn't need to care about older kernels.

Oops, I thought I got rid of all the mod versions :-p I had this
working on a 5.4 kernel that we were going use, and there was a bunch
more of these in my original code. I'll nuke this on v2.

>
> Others looks good to me.
>
> Reviewed-by: Masami Hiramatsu <[email protected]>
>
> Hopefully, it is more helpful if there are more comments for other developers. :-)
> (and usage comments too)

Actually, I was thinking of coming up with a bunch of helper
functions/macros to make this less complex. But yeah, it needs a bunch
of comments before it can be accepted.

Thanks a lot for the review!

-- Steve

2022-03-03 03:45:50

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 2/2] tracing: Add sample code for custom trace events

On Wed, 2 Mar 2022 22:22:30 -0500
Joel Fernandes <[email protected]> wrote:

> > +#include <linux/trace_events.h>
> > +#include <linux/version.h>
> > +#include <linux/module.h>
> > +#include <linux/sched.h>
> > +#include <trace/events/sched.h>
> > +
> > +#define THIS_SYSTEM "custom_sched"
> > +
> > +#define SCHED_PRINT_FMT \
> > + C("prev_prio=%d next_pid=%d next_prio=%d", REC->prev_prio, REC->next_pid, \
>
> Probably prev_pid should be included so we know what the previous task was?
>
> Or are you expecting that a prior sched_switch would have that
> information? If so, then prev_prio is also not needed as the previous
> sched_switch's next_prio would have the prio. That would save even
> more space too..

No, I left it out because it's already recorded in the event:

kworker/u16:2-19213 [005] d..2. 24689.792052: sched_switch: prev_comm=kworker/u16:2 prev_pid=19213 prev_prio=120 prev_state=I ==> next_comm=swapper/5 next_pid=0 next_prio=120

The above is the normal sched_switch event. The prev pid is 19213 which
is equal to common_pid (the first 19213 in that string). As prios can
change, I would not want to drop that anyway.

>
> > + REC->next_prio)
> > +
> > +#define SCHED_WAKING_FMT \
> > + C("pid=%d prio=%d", REC->pid, REC->prio)
> > +
>
> I think including the target_cpu of a wake up is also really important
> to show where the task is going to be awakened, and maybe we can drop
> prio since a subsequent sched_switch will have the priority in
> next_prio.

True, we can do that.

>
> [..]
> > +static void __exit trace_sched_exit(void)
> > +{
> > + trace_set_clr_event(THIS_SYSTEM, "sched_switch", 0);
> > + trace_set_clr_event(THIS_SYSTEM, "sched_waking", 0);
> > +
> > + trace_remove_event_call(&sched_switch_call);
> > + trace_remove_event_call(&sched_waking_call);
> > +}
> > +
> > +module_init(trace_sched_init);
> > +module_exit(trace_sched_exit);
> > +
> > +MODULE_AUTHOR("Steven Rostedt");
> > +MODULE_DESCRIPTION("Custom scheduling events");
> > +MODULE_LICENSE("GPL");
> > +
>
> Remove extra lines from the end of the file?
>

Ah, yeah.

Thanks for reviewing.

-- Steve