Subject: [PATCH V3] tracing/timerlat: Hotplug support for the user-space interface

The osnoise/per_cpu/CPU$/timerlat_fd is create for each possible
CPU, but it might create confusion if the CPU is not online.

Create the file only for online CPUs, also follow hotplug by
creating and deleting as CPUs come and go.

Fixes: e88ed227f639 ("tracing/timerlat: Add user-space interface")
Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
---

Changes from V2:
- Better split the code into the generic (per_cpu/cpu$)
and timerlat (/timerlat_fd) specific function (Daniel)
- Fixed a cpus_read_lock/unlock() usage (kbuild test)
Link: https://lore.kernel.org/lkml/6b9a5f306e488bc77bf8521faeade420a0adf3e4.1695224204.git.bristot@kernel.org/

Changes from V1:
- Fix compilation issue when !HOTPLUG
- Fix init interface | hotplug race
Link: https://lore.kernel.org/lkml/b619d9fd08a3bb47018cf40afa95783844a3c1fd.1694789910.git.bristot@kernel.org/

kernel/trace/trace_osnoise.c | 149 ++++++++++++++++++++++++++++-------
1 file changed, 121 insertions(+), 28 deletions(-)

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index bd0d01d00fb9..f375fb1e721d 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -229,6 +229,19 @@ static inline struct osnoise_variables *this_cpu_osn_var(void)
}

#ifdef CONFIG_TIMERLAT_TRACER
+
+/*
+ * osnoise/per_cpu dir
+ */
+static struct dentry *osnoise_per_cpu_fd;
+
+struct osnoise_per_cpu_dir {
+ struct dentry *root;
+ struct dentry *timerlat_fd;
+};
+
+static DEFINE_PER_CPU(struct osnoise_per_cpu_dir, osnoise_per_cpu_dir);
+
/*
* Runtime information for the timer mode.
*/
@@ -2000,6 +2013,9 @@ static int start_kthread(unsigned int cpu)
char comm[24];

if (timerlat_enabled()) {
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options))
+ return 0;
+
snprintf(comm, 24, "timerlat/%d", cpu);
main = timerlat_main;
} else {
@@ -2065,19 +2081,99 @@ static int start_per_cpu_kthreads(void)
return retval;
}

+#ifdef CONFIG_TIMERLAT_TRACER
+static struct dentry *osnoise_get_per_cpu_dir(long cpu)
+{
+ struct dentry *cpu_dir_entry = per_cpu_ptr(&osnoise_per_cpu_dir, cpu)->root;
+ char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */
+
+ if (cpu_dir_entry)
+ return cpu_dir_entry;
+
+ if (!osnoise_per_cpu_fd)
+ return NULL;
+
+ snprintf(cpu_str, 30, "cpu%ld", cpu);
+ cpu_dir_entry = tracefs_create_dir(cpu_str, osnoise_per_cpu_fd);
+ if (!cpu_dir_entry)
+ return NULL;
+
+ per_cpu_ptr(&osnoise_per_cpu_dir, cpu)->root = cpu_dir_entry;
+
+ return cpu_dir_entry;
+}
+
+static void osnoise_put_per_cpu_dir(long cpu)
+{
+ struct dentry *cpu_dir_entry = per_cpu_ptr(&osnoise_per_cpu_dir, cpu)->root;
+
+ if (!cpu_dir_entry)
+ return;
+
+ tracefs_remove(cpu_dir_entry);
+
+ per_cpu_ptr(&osnoise_per_cpu_dir, cpu)->root = NULL;
+}
+
+static const struct file_operations timerlat_fd_fops;
+
+static int timerlat_add_per_cpu_interface(long cpu)
+{
+ struct dentry *cpu_dir_entry = osnoise_get_per_cpu_dir(cpu);
+ struct dentry *timerlat_fd;
+
+ lockdep_assert_cpus_held();
+
+ if (!cpu_dir_entry)
+ return -ENOMEM;
+
+ timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ, cpu_dir_entry, NULL,
+ &timerlat_fd_fops);
+ if (!timerlat_fd)
+ goto out_put;
+
+ per_cpu_ptr(&osnoise_per_cpu_dir, cpu)->timerlat_fd = timerlat_fd;
+
+ /* Record the CPU */
+ d_inode(timerlat_fd)->i_cdev = (void *)(cpu);
+
+ return 0;
+
+out_put:
+ osnoise_put_per_cpu_dir(cpu);
+ return -ENOMEM;
+}
+
+static void timerlat_rm_per_cpu_interface(long cpu)
+{
+ struct dentry *timerlat_fd = per_cpu_ptr(&osnoise_per_cpu_dir, cpu)->timerlat_fd;
+
+ if (timerlat_fd) {
+ tracefs_remove(timerlat_fd);
+ per_cpu_ptr(&osnoise_per_cpu_dir, cpu)->timerlat_fd = NULL;
+ }
+
+ osnoise_put_per_cpu_dir(cpu);
+}
+#elif defined(CONFIG_HOTPLUG_CPU)
+static int timerlat_add_per_cpu_interface(long cpu) { return 0; };
+static void timerlat_rm_per_cpu_interface(long cpu) {};
+#endif
+
#ifdef CONFIG_HOTPLUG_CPU
static void osnoise_hotplug_workfn(struct work_struct *dummy)
{
unsigned int cpu = smp_processor_id();

mutex_lock(&trace_types_lock);
-
- if (!osnoise_has_registered_instances())
- goto out_unlock_trace;
-
mutex_lock(&interface_lock);
cpus_read_lock();

+ timerlat_add_per_cpu_interface(cpu);
+
+ if (!osnoise_has_registered_instances())
+ goto out_unlock;
+
if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
goto out_unlock;

@@ -2086,7 +2182,6 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
out_unlock:
cpus_read_unlock();
mutex_unlock(&interface_lock);
-out_unlock_trace:
mutex_unlock(&trace_types_lock);
}

@@ -2106,6 +2201,7 @@ static int osnoise_cpu_init(unsigned int cpu)
*/
static int osnoise_cpu_die(unsigned int cpu)
{
+ timerlat_rm_per_cpu_interface(cpu);
stop_kthread(cpu);
return 0;
}
@@ -2708,10 +2804,7 @@ static int init_timerlat_stack_tracefs(struct dentry *top_dir)

static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir)
{
- struct dentry *timerlat_fd;
- struct dentry *per_cpu;
- struct dentry *cpu_dir;
- char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */
+ int retval;
long cpu;

/*
@@ -2720,29 +2813,28 @@ static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir)
* Because osnoise/timerlat have a single workload, having
* multiple files like these are wast of memory.
*/
- per_cpu = tracefs_create_dir("per_cpu", top_dir);
- if (!per_cpu)
+ osnoise_per_cpu_fd = tracefs_create_dir("per_cpu", top_dir);
+ if (!osnoise_per_cpu_fd)
return -ENOMEM;

- for_each_possible_cpu(cpu) {
- snprintf(cpu_str, 30, "cpu%ld", cpu);
- cpu_dir = tracefs_create_dir(cpu_str, per_cpu);
- if (!cpu_dir)
- goto out_clean;
-
- timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ,
- cpu_dir, NULL, &timerlat_fd_fops);
- if (!timerlat_fd)
- goto out_clean;
-
- /* Record the CPU */
- d_inode(timerlat_fd)->i_cdev = (void *)(cpu);
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ retval = timerlat_add_per_cpu_interface(cpu);
+ if (retval)
+ goto out_rm_cpus_locked;
}
+ cpus_read_unlock();

return 0;

-out_clean:
- tracefs_remove(per_cpu);
+out_rm_cpus_locked:
+ for_each_online_cpu(cpu)
+ timerlat_rm_per_cpu_interface(cpu);
+ cpus_read_lock();
+
+ tracefs_remove(osnoise_per_cpu_fd);
+ /* tracefs_remove() recursively deletes all the other files */
+ osnoise_per_cpu_fd = NULL;
return -ENOMEM;
}

@@ -3122,11 +3214,12 @@ __init static int init_osnoise_tracer(void)
return ret;
}

- osnoise_init_hotplug_support();
-
INIT_LIST_HEAD_RCU(&osnoise_instances);

+ mutex_lock(&trace_types_lock);
init_tracefs();
+ osnoise_init_hotplug_support();
+ mutex_unlock(&trace_types_lock);

return 0;
}
--
2.38.1


2023-10-04 01:02:17

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH V3] tracing/timerlat: Hotplug support for the user-space interface

On Fri, 29 Sep 2023 17:02:46 +0200
Daniel Bristot de Oliveira <[email protected]> wrote:

> The osnoise/per_cpu/CPU$/timerlat_fd is create for each possible
> CPU, but it might create confusion if the CPU is not online.
>
> Create the file only for online CPUs, also follow hotplug by
> creating and deleting as CPUs come and go.
>
> Fixes: e88ed227f639 ("tracing/timerlat: Add user-space interface")

Is this a fix that needs to go in now and Cc'd to stable? Or is this
something that can wait till the next merge window?

-- Steve


> Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
> ---
>
> Changes from V2:
> - Better split the code into the generic (per_cpu/cpu$)
> and timerlat (/timerlat_fd) specific function (Daniel)
> - Fixed a cpus_read_lock/unlock() usage (kbuild test)
> Link: https://lore.kernel.org/lkml/6b9a5f306e488bc77bf8521faeade420a0adf3e4.1695224204.git.bristot@kernel.org/
>
> Changes from V1:
> - Fix compilation issue when !HOTPLUG
> - Fix init interface | hotplug race
> Link: https://lore.kernel.org/lkml/b619d9fd08a3bb47018cf40afa95783844a3c1fd.1694789910.git.bristot@kernel.org/
>
> kernel/trace/trace_osnoise.c | 149 ++++++++++++++++++++++++++++-------
> 1 file changed, 121 insertions(+), 28 deletions(-)
>
>

Subject: Re: [PATCH V3] tracing/timerlat: Hotplug support for the user-space interface

On 10/4/23 03:03, Steven Rostedt wrote:
> On Fri, 29 Sep 2023 17:02:46 +0200
> Daniel Bristot de Oliveira <[email protected]> wrote:
>
>> The osnoise/per_cpu/CPU$/timerlat_fd is create for each possible
>> CPU, but it might create confusion if the CPU is not online.
>>
>> Create the file only for online CPUs, also follow hotplug by
>> creating and deleting as CPUs come and go.
>>
>> Fixes: e88ed227f639 ("tracing/timerlat: Add user-space interface")
>
> Is this a fix that needs to go in now and Cc'd to stable? Or is this
> something that can wait till the next merge window?

We can wait for the next merge window... it is a non-trivial fix.

-- Daniel
> -- Steve

2023-10-04 12:16:36

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH V3] tracing/timerlat: Hotplug support for the user-space interface

On Wed, 4 Oct 2023 12:02:52 +0200
Daniel Bristot de Oliveira <[email protected]> wrote:

> On 10/4/23 03:03, Steven Rostedt wrote:
> > On Fri, 29 Sep 2023 17:02:46 +0200
> > Daniel Bristot de Oliveira <[email protected]> wrote:
> >
> >> The osnoise/per_cpu/CPU$/timerlat_fd is create for each possible
> >> CPU, but it might create confusion if the CPU is not online.
> >>
> >> Create the file only for online CPUs, also follow hotplug by
> >> creating and deleting as CPUs come and go.
> >>
> >> Fixes: e88ed227f639 ("tracing/timerlat: Add user-space interface")
> >
> > Is this a fix that needs to go in now and Cc'd to stable? Or is this
> > something that can wait till the next merge window?
>
> We can wait for the next merge window... it is a non-trivial fix.
>

A requirement is if it's a fix, not really how "trivial" it is.

That said, I'm able to consistently triggered this:

BUG: kernel NULL pointer dereference, address: 00000000000000a0
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 0 P4D 0
Oops: 0002 [#1] PREEMPT SMP PTI
CPU: 1 PID: 20 Comm: cpuhp/1 Not tainted 6.6.0-rc4-test-00008-g2df8f295b0e2 #103
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:down_write+0x23/0x70
Code: 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 53 48 89 fb e8 2e bc ff ff bf 01 00 00 00 e8 24 14 31 ff 31 c0 ba 01 00 00 00 <f0> 48 0f b1 13 75 33 65 48 8b 04 25 00 36 03 00 48 89 43 08 bf 01
RSP: 0018:ffffb17f800e3d98 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 00000000000000a0 RCX: ffffff8100000000
RDX: 0000000000000001 RSI: 0000000000000064 RDI: ffffffffb6edd5cc
RBP: ffffb17f800e3df8 R08: ffff8c6237c61188 R09: 000000008020001b
R10: ffff8c6237c61160 R11: 0000000000000001 R12: 000000000002da30
R13: 0000000000000000 R14: ffffffffb6314080 R15: ffff8c6237c61188
FS: 0000000000000000(0000) GS:ffff8c6237c40000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000000a0 CR3: 0000000102412001 CR4: 0000000000170ee0
Call Trace:
<TASK>
? __die+0x23/0x70
? page_fault_oops+0x17d/0x4c0
? exc_page_fault+0x7f/0x180
? asm_exc_page_fault+0x26/0x30
? __pfx_osnoise_cpu_die+0x10/0x10
? down_write+0x1c/0x70
? down_write+0x23/0x70
? down_write+0x1c/0x70
simple_recursive_removal+0xef/0x280
? __pfx_remove_one+0x10/0x10
? __pfx_osnoise_cpu_die+0x10/0x10
tracefs_remove+0x44/0x70
timerlat_rm_per_cpu_interface+0x28/0x70
osnoise_cpu_die+0xf/0x20
cpuhp_invoke_callback+0xf8/0x460
? __pfx_smpboot_thread_fn+0x10/0x10
cpuhp_thread_fun+0xf3/0x190
smpboot_thread_fn+0x18c/0x230
kthread+0xf7/0x130
? __pfx_kthread+0x10/0x10
ret_from_fork+0x34/0x50
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1b/0x30
</TASK>
Modules linked in:
CR2: 00000000000000a0
---[ end trace 0000000000000000 ]---
RIP: 0010:down_write+0x23/0x70
Code: 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 53 48 89 fb e8 2e bc ff ff bf 01 00 00 00 e8 24 14 31 ff 31 c0 ba 01 00 00 00 <f0> 48 0f b1 13 75 33 65 48 8b 04 25 00 36 03 00 48 89 43 08 bf 01
RSP: 0018:ffffb17f800e3d98 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 00000000000000a0 RCX: ffffff8100000000
RDX: 0000000000000001 RSI: 0000000000000064 RDI: ffffffffb6edd5cc
RBP: ffffb17f800e3df8 R08: ffff8c6237c61188 R09: 000000008020001b
R10: ffff8c6237c61160 R11: 0000000000000001 R12: 000000000002da30
R13: 0000000000000000 R14: ffffffffb6314080 R15: ffff8c6237c61188
FS: 0000000000000000(0000) GS:ffff8c6237c40000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000000a0 CR3: 0000000102412001 CR4: 0000000000170ee0
note: cpuhp/1[20] exited with irqs disabled
note: cpuhp/1[20] exited with preempt_count 1


With running the attached script as:

# ./ftrace-test-tracers sleep 1

-- Steve


Attachments:
(No filename) (3.87 kB)
ftrace-test-tracers (799.00 B)
Download all attachments

2023-10-04 12:19:07

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH V3] tracing/timerlat: Hotplug support for the user-space interface

On Wed, 4 Oct 2023 08:17:31 -0400
Steven Rostedt <[email protected]> wrote:

> #!/bin/bash
>
> find_debugfs() {
> debugfs=`cat /proc/mounts | while read mount dir type opts a b; do
> if [ $mount == "debugfs" ]; then

I guess I should update this to look for tracefs. This script is actually
older than tracefs.

-- Steve


> echo $dir;
> break
> fi
> done`
> if [ -z "$debugfs" ]; then
> if ! mount -t debugfs nodev /sys/kernel/debug; then
> echo "FAILED to mount debugfs"
> exit -1
> fi
> echo "/sys/kernel/debug"
> else
> echo $debugfs
> fi
> }
>
> debugfs=`find_debugfs`
> tracedir="$debugfs/tracing"