From: "Steven Rostedt (Google)" <[email protected]>
Allow triggers to be enabled at kernel boot up. For example:
trace_trigger="sched_switch.stacktrace if prev_state == 2"
The above will enable the stacktrace trigger on top of the sched_switch
event and only trigger if its prev_state is 2 (TASK_UNINTERRUPTIBLE). Then
at boot up, a stacktrace will trigger and be recorded in the tracing ring
buffer every time the sched_switch happens where the previous state is
TASK_UNINTERRUPTIBLE.
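Once the system is up, the registered trigger can be confirmed through the
event's tracefs "trigger" file (assuming tracefs is mounted at
/sys/kernel/tracing; the exact output format shown here is approximate):

    # cat /sys/kernel/tracing/events/sched/sched_switch/trigger
    stacktrace:unlimited if prev_state == 2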
Another useful trigger would be "traceoff" which can stop tracing on an
event if a field of the event matches a certain value defined by the
filter ("if" statement).
Signed-off-by: Steven Rostedt (Google) <[email protected]>
---
Changes since v1: https://lore.kernel.org/all/[email protected]/
- Encapsulated trigger code better to get rid of warning of unused 'i'
variable (kernel-test-robot)
- Removed synchronize_rcu() avoidance, as Paul McKenney will be updating it to
not cause issues: https://lore.kernel.org/all/20221020222108.GZ5600@paulmck-ThinkPad-P17-Gen-1/
.../admin-guide/kernel-parameters.txt | 19 +++++
kernel/trace/trace_events.c | 72 ++++++++++++++++++-
2 files changed, 89 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a465d5242774..ccf91a4bf113 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6257,6 +6257,25 @@
See also Documentation/trace/ftrace.rst "trace options"
section.
+ trace_trigger=[trigger-list]
+ [FTRACE] Add an event trigger on specific events.
+ Set a trigger on top of a specific event, with an optional
+ filter.
+
+ The format is "trace_trigger=<event>.<trigger>[ if <filter>],..."
+ More than one trigger may be specified, separated by commas.
+
+ For example:
+
+ trace_trigger="sched_switch.stacktrace if prev_state == 2"
+
+ The above will enable the "stacktrace" trigger on the "sched_switch"
+ event but only trigger it if the "prev_state" of the "sched_switch"
+ event is "2" (TASK_UNINTERRUPTIBLE).
+
+ See also "Event triggers" in Documentation/trace/events.rst
+
+
traceoff_on_warning
[FTRACE] enable this option to disable tracing when a
warning is hit. This turns off "tracing_on". Tracing can
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0356cae0cf74..7372fffb8109 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2796,6 +2796,44 @@ trace_create_new_event(struct trace_event_call *call,
return file;
}
+#ifdef CONFIG_HIST_TRIGGERS
+#define MAX_BOOT_TRIGGERS 32
+
+static struct boot_triggers {
+ const char *event;
+ char *trigger;
+} bootup_triggers[MAX_BOOT_TRIGGERS];
+
+static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static int nr_boot_triggers;
+
+static __init int setup_trace_triggers(char *str)
+{
+ char *trigger;
+ char *buf;
+ int i;
+
+ strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+ ring_buffer_expanded = true;
+ disable_tracing_selftest("running event triggers");
+
+ buf = bootup_trigger_buf;
+ for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+ trigger = strsep(&buf, ",");
+ if (!trigger)
+ break;
+ bootup_triggers[i].event = strsep(&trigger, ".");
+ bootup_triggers[i].trigger = strsep(&trigger, ".");
+ if (!bootup_triggers[i].trigger)
+ break;
+ }
+
+ nr_boot_triggers = i;
+ return 1;
+}
+__setup("trace_trigger=", setup_trace_triggers);
+#endif
+
/* Add an event to a trace directory */
static int
__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
@@ -2812,6 +2850,28 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
return event_define_fields(call);
}
+#ifdef CONFIG_HIST_TRIGGERS
+static void trace_early_triggers(struct trace_event_file *file, const char *name)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < nr_boot_triggers; i++) {
+ if (strcmp(name, bootup_triggers[i].event))
+ continue;
+ mutex_lock(&event_mutex);
+ ret = trigger_process_regex(file, bootup_triggers[i].trigger);
+ mutex_unlock(&event_mutex);
+ if (ret)
+ pr_err("Failed to register trigger '%s' on event %s\n",
+ bootup_triggers[i].trigger,
+ bootup_triggers[i].event);
+ }
+}
+#else
+static inline void trace_early_triggers(struct trace_event_file *file, const char *name) { }
+#endif
+
/*
* Just create a descriptor for early init. A descriptor is required
* for enabling events at boot. We want to enable events before
@@ -2822,12 +2882,19 @@ __trace_early_add_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
struct trace_event_file *file;
+ int ret;
file = trace_create_new_event(call, tr);
if (!file)
return -ENOMEM;
- return event_define_fields(call);
+ ret = event_define_fields(call);
+ if (ret)
+ return ret;
+
+ trace_early_triggers(file, trace_event_name(call));
+
+ return 0;
}
struct ftrace_module_file_ops;
@@ -3726,6 +3793,8 @@ static __init int event_trace_enable(void)
list_add(&call->list, &ftrace_events);
}
+ register_trigger_cmds();
+
/*
* We need the top trace array to have a working set of trace
* points at early init, before the debug files and directories
@@ -3740,7 +3809,6 @@ static __init int event_trace_enable(void)
register_event_cmds();
- register_trigger_cmds();
return 0;
}
--
2.35.1
On Thu, Oct 20, 2022 at 7:00 PM Steven Rostedt <[email protected]> wrote:
>
> From: "Steven Rostedt (Google)" <[email protected]>
>
> Allow triggers to be enabled at kernel boot up. For example:
>
> trace_trigger="sched_switch.stacktrace if prev_state == 2"
>
> The above will enable the stacktrace trigger on top of the sched_switch
> event and only trigger if its prev_state is 2 (TASK_UNINTERRUPTIBLE). Then
> at boot up, a stacktrace will trigger and be recorded in the tracing ring
> buffer every time the sched_switch happens where the previous state is
> TASK_UNINTERRUPTIBLE.
>
> Another useful trigger would be "traceoff" which can stop tracing on an
> event if a field of the event matches a certain value defined by the
> filter ("if" statement).
>
> Signed-off-by: Steven Rostedt (Google) <[email protected]>
<>
> +#ifdef CONFIG_HIST_TRIGGERS
Can you help me understand why this is only available if
CONFIG_HIST_TRIGGERS is selected in the kernel config? AFAICT this
code doesn't depend on the histogram code, and the run-time selection
of triggers is usable without CONFIG_HIST_TRIGGERS.
Thanks!
> +static void trace_early_triggers(struct trace_event_file *file, const char *name)
> +{
> + int ret;
> + int i;
<>
On Thu, 8 Dec 2022 15:27:07 -0700
Ross Zwisler <[email protected]> wrote:
> > +#ifdef CONFIG_HIST_TRIGGERS
>
> Can you help me understand why this is only available if
> CONFIG_HIST_TRIGGERS is selected in the kernel config? AFAICT this
> code doesn't depend on the histogram code, and the run-time selection
> of triggers is usable without CONFIG_HIST_TRIGGERS.
Good catch!
I got confused, and only saw that as "CONFIG_TRIGGERS" and wasn't thinking
that config was just for histogram triggers :-p
Care to send a patch to fix it?
Thanks Ross,
-- Steve
On Thu, Dec 08, 2022 at 06:39:45PM -0500, Steven Rostedt wrote:
> On Thu, 8 Dec 2022 15:27:07 -0700
> Ross Zwisler <[email protected]> wrote:
>
> > > +#ifdef CONFIG_HIST_TRIGGERS
> >
> > Can you help me understand why this is only available if
> > CONFIG_HIST_TRIGGERS is selected in the kernel config? AFAICT this
> > code doesn't depend on the histogram code, and the run-time selection
> > of triggers is usable without CONFIG_HIST_TRIGGERS.
>
> Good catch!
>
> I got confused, and only saw that as "CONFIG_TRIGGERS" and wasn't thinking
> that config was just for histogram triggers :-p
>
> Care to send a patch to fix it?
Sure, happy to.
One more question: I was playing with this code using examples from
https://www.kernel.org/doc/html/latest/trace/events.html
and when I tried to create a command line trigger to gather a snapshot:
trace_trigger="sched_switch.snapshot:1 if prev_state == 2"
it hits an oops:
[ 0.178179] traps: PANIC: double fault, error_code: 0xffffffffa0a02040
[ 0.178187] BUG: unable to handle page fault for address: fffffe000000b000
[ 0.178188] #PF: supervisor read access in kernel mode
[ 0.178189] #PF: error_code(0x0000) - not-present page
[ 0.178191] PGD 23ffc3067 P4D 23ffc3067 PUD 23ffc1067 PMD 23ffc0067 PTE 0
[ 0.178193] Oops: 0000 [#1] PREEMPT SMP PTI
[ 0.178195] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.1.0-rc7+ #4
[ 0.178197] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014
[ 0.178198] RIP: 0010:__die_header+0x1e/0x7d
[ 0.178204] Code: 00 00 31 c0 89 43 50 e9 30 ff 49 ff 0f 1f 44 00 00 8b 05 bf 3c 64 01 49 89 fa 85 c0 75 0e 48 c7 c7 60 f0 bc a1 b9 2a 00 01
[ 0.178206] RSP: 0000:fffffe000000aec0 EFLAGS: 00010046
[ 0.178207] RAX: 0000000000000000 RBX: ffffffffa0f5aaa8 RCX: 0000000000000002
[ 0.178208] RDX: ffffffffa0a02040 RSI: fffffe000000b000 RDI: ffffffffa1bcf100
[ 0.178209] RBP: fffffe000000af60 R08: ffffffffa125b280 R09: 6666666666666666
[ 0.178209] R10: ffffffffa0f5aaa8 R11: 203a65646f635f72 R12: ffffffffa0a02040
[ 0.178210] R13: ffffffffa0a02040 R14: 0000000000000000 R15: 0000000000000000
[ 0.178212] FS: 0000000000000000(0000) GS:ffff93ef77c00000(0000) knlGS:0000000000000000
[ 0.178213] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.178214] CR2: fffffe000000b000 CR3: 000000004500a000 CR4: 00000000000006b0
[ 0.178217] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 0.178218] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 0.178219] Call Trace:
[ 0.178223] <#DF>
[ 0.178224] __die+0x17/0x29
[ 0.178227] die+0x2a/0x50
[ 0.178232] ? zen_untrain_ret+0x1/0x1
[ 0.178237] exc_double_fault+0x173/0x180
[ 0.178242] asm_exc_double_fault+0x1f/0x30
[ 0.178248] WARNING: stack recursion on stack type 5
[ 0.178249] WARNING: can't access registers at asm_exc_double_fault+0x1f/0x30
[ 0.178253] </#DF>
[ 0.178253] <TASK>
[ 0.178254] ? _raw_spin_unlock_irq+0x11/0x40
[ 0.178256] ? wait_for_completion+0x7e/0x160
[ 0.178258] ? ring_buffer_resize+0x320/0x450
[ 0.178262] ? resize_buffer_duplicate_size+0x38/0xe0
[ 0.178264] ? tracing_alloc_snapshot_instance+0x23/0x40
[ 0.178266] ? register_snapshot_trigger+0x16/0x40
[ 0.178269] ? event_trigger_parse+0x113/0x160
[ 0.178272] ? trigger_process_regex+0xb8/0x100
[ 0.178274] ? __trace_early_add_events+0xb8/0x140
[ 0.178275] ? trace_event_init+0xcc/0x2dd
[ 0.178278] ? start_kernel+0x4a9/0x713
[ 0.178281] ? secondary_startup_64_no_verify+0xce/0xdb
[ 0.178285] </TASK>
[ 0.178285] Modules linked in:
[ 0.178286] CR2: fffffe000000b000
[ 0.220408] ---[ end trace 0000000000000000 ]---
[ 0.220409] RIP: 0010:__die_header+0x1e/0x7d
[ 0.220411] Code: 00 00 31 c0 89 43 50 e9 30 ff 49 ff 0f 1f 44 00 00 8b 05 bf 3c 64 01 49 89 fa 85 c0 75 0e 48 c7 c7 60 f0 bc a1 b9 2a 00 01
[ 0.220412] RSP: 0000:fffffe000000aec0 EFLAGS: 00010046
[ 0.220414] RAX: 0000000000000000 RBX: ffffffffa0f5aaa8 RCX: 0000000000000002
[ 0.220414] RDX: ffffffffa0a02040 RSI: fffffe000000b000 RDI: ffffffffa1bcf100
[ 0.220415] RBP: fffffe000000af60 R08: ffffffffa125b280 R09: 6666666666666666
[ 0.220416] R10: ffffffffa0f5aaa8 R11: 203a65646f635f72 R12: ffffffffa0a02040
[ 0.220416] R13: ffffffffa0a02040 R14: 0000000000000000 R15: 0000000000000000
[ 0.220417] FS: 0000000000000000(0000) GS:ffff93ef77c00000(0000) knlGS:0000000000000000
[ 0.220418] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.220419] CR2: fffffe000000b000 CR3: 000000004500a000 CR4: 00000000000006b0
[ 0.220421] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 0.220422] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 0.220423] Kernel panic - not syncing: Fatal exception in interrupt
I think that this is because the code to allocate the snapshot buffer uses
workqueues (ring_buffer_resize() calls schedule_work_on() then
wait_for_completion()), but at this point during the init process the
workqueues are up enough that we can enqueue entries, but they are not yet
doing work.
start_kernel() {
...
/*
* Allow workqueue creation and work item queueing/cancelling
* early. Work item execution depends on kthreads and starts after
* workqueue_init().
*/
workqueue_init_early();
...
/* Trace events are available after this */
trace_init(); // here is where we try and allocate the snapshot
...
	arch_call_rest_init();
	  rest_init()
	    kernel_init()
	      kernel_init_freeable()
	        workqueue_init()
}
I'm guessing the best we can do here is just disallow snapshot triggers via
the command line option, so that others don't cut themselves on this sharp
corner? Other ideas?
Thanks,
- Ross
The trace_trigger command line option introduced by
commit a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option")
doesn't need to depend on the CONFIG_HIST_TRIGGERS kernel config option.
This code doesn't depend on the histogram code, and the run-time
selection of triggers is usable without CONFIG_HIST_TRIGGERS.
Fixes: a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option")
Signed-off-by: Ross Zwisler <[email protected]>
---
kernel/trace/trace_events.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3bfaf560ecc4..33e0b4f8ebe6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2796,7 +2796,6 @@ trace_create_new_event(struct trace_event_call *call,
return file;
}
-#ifdef CONFIG_HIST_TRIGGERS
#define MAX_BOOT_TRIGGERS 32
static struct boot_triggers {
@@ -2832,7 +2831,6 @@ static __init int setup_trace_triggers(char *str)
return 1;
}
__setup("trace_trigger=", setup_trace_triggers);
-#endif
/* Add an event to a trace directory */
static int
@@ -2850,7 +2848,6 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
return event_define_fields(call);
}
-#ifdef CONFIG_HIST_TRIGGERS
static void trace_early_triggers(struct trace_event_file *file, const char *name)
{
int ret;
@@ -2868,9 +2865,6 @@ static void trace_early_triggers(struct trace_event_file *file, const char *name
bootup_triggers[i].event);
}
}
-#else
-static inline void trace_early_triggers(struct trace_event_file *file, const char *name) { }
-#endif
/*
* Just create a descriptor for early init. A descriptor is required
--
2.39.0.rc1.256.g54fd8350bd-goog
On Thu, 8 Dec 2022 17:19:21 -0700
Ross Zwisler <[email protected]> wrote:
> I think that this is because the code to allocate the snapshot buffer uses
> workqueues (ring_buffer_resize() calls schedule_work_on() then
> wait_for_completion()), but at this point during the init process the
> workqueues are up enough that we can enqueue entries, but they are not yet
> doing work.
Thanks for the report.
>
> start_kernel() {
> ...
>
> /*
> * Allow workqueue creation and work item queueing/cancelling
> * early. Work item execution depends on kthreads and starts after
> * workqueue_init().
> */
> workqueue_init_early();
> ...
>
> /* Trace events are available after this */
> trace_init(); // here is where we try and allocate the snapshot
>
> ...
>
> 	arch_call_rest_init();
> 	  rest_init()
> 	    kernel_init()
> 	      kernel_init_freeable()
> 	        workqueue_init()
> }
>
> I'm guessing the best we can do here is just disallow snapshot triggers via
> the command line option, so that others don't cut themselves on this sharp
> corner? Other ideas?
After debugging it, the cause is that this code path expects to be called
with interrupts enabled, but this early in boot up interrupts have not yet
been enabled, and must not be.
> [ 0.178253] </#DF>
> [ 0.178253] <TASK>
> [ 0.178254] ? _raw_spin_unlock_irq+0x11/0x40
raw_spin_unlock_irq() enables interrupts unconditionally, regardless of
whether they were enabled before the lock was taken.
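
As a minimal sketch of the problem (illustrative only, not the actual ring
buffer code):

	raw_spin_lock_irq(&lock);		/* local_irq_disable() + acquire */
	...
	raw_spin_unlock_irq(&lock);		/* release + local_irq_enable():
						 * unconditionally turns interrupts
						 * on, even if they were off before
						 * the lock was taken, as they are
						 * at early boot */

	raw_spin_lock_irqsave(&lock, flags);	/* saves the previous IRQ state */
	...
	raw_spin_unlock_irqrestore(&lock, flags); /* restores it exactly */

which is exactly the substitution the patch below makes.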
> [ 0.178256] ? wait_for_completion+0x7e/0x160
> [ 0.178258] ? ring_buffer_resize+0x320/0x450
> [ 0.178262] ? resize_buffer_duplicate_size+0x38/0xe0
> [ 0.178264] ? tracing_alloc_snapshot_instance+0x23/0x40
> [ 0.178266] ? register_snapshot_trigger+0x16/0x40
> [ 0.178269] ? event_trigger_parse+0x113/0x160
> [ 0.178272] ? trigger_process_regex+0xb8/0x100
> [ 0.178274] ? __trace_early_add_events+0xb8/0x140
> [ 0.178275] ? trace_event_init+0xcc/0x2dd
> [ 0.178278] ? start_kernel+0x4a9/0x713
> [ 0.178281] ? secondary_startup_64_no_verify+0xce/0xdb
This appears to fix it.
-- Steve
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 843818ee4814..ef4da331ff61 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2062,8 +2062,10 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *pages = &cpu_buffer->new_pages;
int retries, success;
+ unsigned long flags;
- raw_spin_lock_irq(&cpu_buffer->reader_lock);
+ /* Can be called at early boot up, where interrupts have not been enabled */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/*
* We are holding the reader lock, so the reader page won't be swapped
* in the ring buffer. Now we are racing with the writer trying to
@@ -2120,7 +2122,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* tracing
*/
RB_WARN_ON(cpu_buffer, !success);
- raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
/* free pages if they weren't inserted */
if (!success) {
@@ -2171,6 +2173,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long nr_pages;
+ unsigned long flags;
int cpu, err;
/*
@@ -2248,8 +2251,19 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- schedule_work_on(cpu,
- &cpu_buffer->update_pages_work);
+ /*
+ * Run directly if possible.
+ * Save flags as this can be called at early boot up.
+ */
+ local_irq_save(flags);
+ if (cpu != smp_processor_id()) {
+ local_irq_restore(flags);
+ schedule_work_on(cpu,
+ &cpu_buffer->update_pages_work);
+ } else {
+ update_pages_handler(&cpu_buffer->update_pages_work);
+ local_irq_restore(flags);
+ }
}
}
@@ -2298,9 +2312,20 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- schedule_work_on(cpu_id,
- &cpu_buffer->update_pages_work);
- wait_for_completion(&cpu_buffer->update_done);
+ /*
+ * Run directly if possible.
+ * Save flags as this can be called at early boot up.
+ */
+ local_irq_save(flags);
+ if (cpu_id == smp_processor_id()) {
+ update_pages_handler(&cpu_buffer->update_pages_work);
+ local_irq_restore(flags);
+ } else {
+ local_irq_restore(flags);
+ schedule_work_on(cpu_id,
+ &cpu_buffer->update_pages_work);
+ wait_for_completion(&cpu_buffer->update_done);
+ }
}
cpu_buffer->nr_pages_to_update = 0;