2024-03-12 21:03:14

by Andrii Nakryiko

Subject: [PATCH bpf-next 0/3] uprobes: two common case speed ups

This patch set implements two speed ups for uprobe/uretprobe runtime execution
path for some common scenarios: BPF-only uprobes (patches #1 and #2) and
system-wide (non-PID-specific) uprobes (patch #3). Please see individual
patches for details.

Given I haven't worked with uprobe code before, I'm unfamiliar with
conventions in this subsystem, including which kernel tree patches should be
sent to. For now I based all the changes on top of bpf-next/master, which is
where I tested and benchmarked everything anyways. Please advise what should
I use as a base for subsequent revision. Thanks.

Andrii Nakryiko (3):
uprobes: encapsulate preparation of uprobe args buffer
uprobes: prepare uprobe args buffer lazily
uprobes: add speculative lockless system-wide uprobe filter check

kernel/trace/trace_uprobe.c | 103 ++++++++++++++++++++++--------------
1 file changed, 63 insertions(+), 40 deletions(-)

--
2.43.0



2024-03-12 21:03:23

by Andrii Nakryiko

Subject: [PATCH bpf-next 1/3] uprobes: encapsulate preparation of uprobe args buffer

Move the logic of fetching the temporary per-CPU uprobe buffer and storing
uprobe args into it to a new helper function. Store the data size as part
of this buffer, simplifying the interfaces a bit: now we only pass a single
uprobe_cpu_buffer reference around, instead of a pointer + dsize.

This logic was duplicated across uprobe_dispatcher and uretprobe_dispatcher,
and is now centralized. All this is also in preparation for making this
uprobe_cpu_buffer handling optional in the next patch.

Signed-off-by: Andrii Nakryiko <[email protected]>
---
kernel/trace/trace_uprobe.c | 75 ++++++++++++++++++++-----------------
1 file changed, 41 insertions(+), 34 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a84b85d8aac1..a0f60bb10158 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = {
struct uprobe_cpu_buffer {
struct mutex mutex;
void *buf;
+ int dsize;
};
static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
static int uprobe_buffer_refcnt;
@@ -943,9 +944,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
mutex_unlock(&ucb->mutex);
}

+static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
+ struct pt_regs *regs)
+{
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
+
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+ dsize = __get_data_size(&tu->tp, regs);
+
+ ucb = uprobe_buffer_get();
+ ucb->dsize = dsize;
+
+ store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+
+ return ucb;
+}
+
static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize,
+ struct uprobe_cpu_buffer *ucb,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
@@ -956,14 +974,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,

WARN_ON(call != trace_file->event_call);

- if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
+ if (WARN_ON_ONCE(tu->tp.size + ucb->dsize > PAGE_SIZE))
return;

if (trace_trigger_soft_disabled(trace_file))
return;

esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
- size = esize + tu->tp.size + dsize;
+ size = esize + tu->tp.size + ucb->dsize;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
return;
@@ -977,14 +995,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}

- memcpy(data, ucb->buf, tu->tp.size + dsize);
+ memcpy(data, ucb->buf, tu->tp.size + ucb->dsize);

trace_event_buffer_commit(&fbuffer);
}

/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer *ucb)
{
struct event_file_link *link;

@@ -993,7 +1011,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,

rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+ __uprobe_trace_func(tu, 0, regs, ucb, link->file);
rcu_read_unlock();

return 0;
@@ -1001,13 +1019,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,

static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer *ucb)
{
struct event_file_link *link;

rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+ __uprobe_trace_func(tu, func, regs, ucb, link->file);
rcu_read_unlock();
}

@@ -1335,7 +1353,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,

static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer *ucb)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
@@ -1356,7 +1374,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,

esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));

- size = esize + tu->tp.size + dsize;
+ size = esize + tu->tp.size + ucb->dsize;
size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return;
@@ -1379,10 +1397,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}

- memcpy(data, ucb->buf, tu->tp.size + dsize);
+ memcpy(data, ucb->buf, tu->tp.size + ucb->dsize);

- if (size - esize > tu->tp.size + dsize) {
- int len = tu->tp.size + dsize;
+ if (size - esize > tu->tp.size + ucb->dsize) {
+ int len = tu->tp.size + ucb->dsize;

memset(data + len, 0, size - esize - len);
}
@@ -1395,21 +1413,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,

/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer *ucb)
{
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;

if (!is_ret_probe(tu))
- __uprobe_perf_func(tu, 0, regs, ucb, dsize);
+ __uprobe_perf_func(tu, 0, regs, ucb);
return 0;
}

static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer *ucb)
{
- __uprobe_perf_func(tu, func, regs, ucb, dsize);
+ __uprobe_perf_func(tu, func, regs, ucb);
}

int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
@@ -1475,10 +1493,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
struct uprobe_cpu_buffer *ucb;
- int dsize, esize;
int ret = 0;

-
tu = container_of(con, struct trace_uprobe, consumer);
tu->nhit++;

@@ -1490,18 +1506,14 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;

- dsize = __get_data_size(&tu->tp, regs);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-
- ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+ ucb = prepare_uprobe_buffer(tu, regs);

if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- ret |= uprobe_trace_func(tu, regs, ucb, dsize);
+ ret |= uprobe_trace_func(tu, regs, ucb);

#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- ret |= uprobe_perf_func(tu, regs, ucb, dsize);
+ ret |= uprobe_perf_func(tu, regs, ucb);
#endif
uprobe_buffer_put(ucb);
return ret;
@@ -1513,7 +1525,6 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
struct uprobe_cpu_buffer *ucb;
- int dsize, esize;

tu = container_of(con, struct trace_uprobe, consumer);

@@ -1525,18 +1536,14 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;

- dsize = __get_data_size(&tu->tp, regs);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-
- ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+ ucb = prepare_uprobe_buffer(tu, regs);

if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- uretprobe_trace_func(tu, func, regs, ucb, dsize);
+ uretprobe_trace_func(tu, func, regs, ucb);

#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- uretprobe_perf_func(tu, func, regs, ucb, dsize);
+ uretprobe_perf_func(tu, func, regs, ucb);
#endif
uprobe_buffer_put(ucb);
return 0;
--
2.43.0


2024-03-12 21:03:51

by Andrii Nakryiko

Subject: [PATCH bpf-next 2/3] uprobes: prepare uprobe args buffer lazily

uprobe_cpu_buffer and corresponding logic to store uprobe args into it
are used for uprobes/uretprobes that are created through tracefs or
perf events.

BPF is yet another user of the uprobe/uretprobe infrastructure, but doesn't
need uprobe_cpu_buffer and the associated data. For BPF-only use cases this
buffer handling and preparation is pure overhead. At the same time, BPF-only
uprobe/uretprobe usage is very common in practice. Also, in a lot of cases
applications are very sensitive to performance overheads, as they might be
tracing very high-frequency functions like malloc()/free(), so every bit of
performance improvement matters.

All that is to say that this uprobe_cpu_buffer preparation is an
unnecessary overhead that each BPF user of uprobes/uretprobes has to pay.
This patch changes that by making uprobe_cpu_buffer preparation optional:
it will happen only if either a tracefs-based or a perf event-based
uprobe/uretprobe consumer is registered for the given uprobe/uretprobe.
For BPF-only use cases this step will be skipped.

We used the uprobe/uretprobe benchmark that is part of the BPF selftests
(see [0]) to estimate the improvements. We have 3 uprobe and 3 uretprobe
scenarios, which vary the instruction that is replaced by the uprobe: nop
(fastest uprobe case), `push rbp` (typical case), and a non-simulated
`ret` instruction (slowest case). The benchmark thread constantly calls a
user space function in a tight loop. The user space function has a BPF
uprobe or uretprobe program attached, doing nothing but atomic counter
increments to count the number of triggering calls. The benchmark emits
throughput in millions of executions per second.
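
Schematically, the setup looks like this (a sketch of the shape of the
benchmark, not the actual bench_trigger.c code; names are illustrative):

	/* BPF side (sketch; assumes the usual <linux/bpf.h> and
	 * <bpf/bpf_helpers.h> includes plus a GPL license string) */
	long hits = 0;

	SEC("uprobe")
	int bench_trigger(struct pt_regs *ctx)
	{
		__sync_fetch_and_add(&hits, 1);	/* count triggerings */
		return 0;
	}

	/* user-space side (sketch): the attach target, called in a tight
	 * loop from the benchmark thread */
	__attribute__((noinline)) void uprobe_target(void)
	{
		asm volatile ("");	/* keep the call from being optimized away */
	}

	/* ... in the benchmark thread: */
	for (;;)
		uprobe_target();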

BEFORE these changes
====================
uprobe-nop : 2.657 ± 0.024M/s
uprobe-push : 2.499 ± 0.018M/s
uprobe-ret : 1.100 ± 0.006M/s
uretprobe-nop : 1.356 ± 0.004M/s
uretprobe-push : 1.317 ± 0.019M/s
uretprobe-ret : 0.785 ± 0.007M/s

AFTER these changes
===================
uprobe-nop : 2.732 ± 0.022M/s (+2.8%)
uprobe-push : 2.621 ± 0.016M/s (+4.9%)
uprobe-ret : 1.105 ± 0.007M/s (+0.5%)
uretprobe-nop : 1.396 ± 0.007M/s (+2.9%)
uretprobe-push : 1.347 ± 0.008M/s (+2.3%)
uretprobe-ret : 0.800 ± 0.006M/s (+1.9%)

So the improvements on this particular machine seem to be between 2% and 5%.

[0] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c

Signed-off-by: Andrii Nakryiko <[email protected]>
---
kernel/trace/trace_uprobe.c | 56 ++++++++++++++++++++++---------------
1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a0f60bb10158..f2875349d124 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -963,15 +963,22 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,

static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb,
+ struct uprobe_cpu_buffer **ucbp,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
struct trace_event_buffer fbuffer;
+ struct uprobe_cpu_buffer *ucb;
void *data;
int size, esize;
struct trace_event_call *call = trace_probe_event_call(&tu->tp);

+ ucb = *ucbp;
+ if (!ucb) {
+ ucb = prepare_uprobe_buffer(tu, regs);
+ *ucbp = ucb;
+ }
+
WARN_ON(call != trace_file->event_call);

if (WARN_ON_ONCE(tu->tp.size + ucb->dsize > PAGE_SIZE))
@@ -1002,7 +1009,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,

/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb)
+ struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;

@@ -1011,7 +1018,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,

rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, 0, regs, ucb, link->file);
+ __uprobe_trace_func(tu, 0, regs, ucbp, link->file);
rcu_read_unlock();

return 0;
@@ -1019,13 +1026,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,

static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb)
+ struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;

rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, func, regs, ucb, link->file);
+ __uprobe_trace_func(tu, func, regs, ucbp, link->file);
rcu_read_unlock();
}

@@ -1353,10 +1360,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,

static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb)
+ struct uprobe_cpu_buffer **ucbp)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
+ struct uprobe_cpu_buffer *ucb;
struct hlist_head *head;
void *data;
int size, esize;
@@ -1372,6 +1380,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
#endif /* CONFIG_BPF_EVENTS */

+ ucb = *ucbp;
+ if (!ucb) {
+ ucb = prepare_uprobe_buffer(tu, regs);
+ *ucbp = ucb;
+ }
+
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));

size = esize + tu->tp.size + ucb->dsize;
@@ -1413,21 +1427,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,

/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb)
+ struct uprobe_cpu_buffer **ucbp)
{
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;

if (!is_ret_probe(tu))
- __uprobe_perf_func(tu, 0, regs, ucb);
+ __uprobe_perf_func(tu, 0, regs, ucbp);
return 0;
}

static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb)
+ struct uprobe_cpu_buffer **ucbp)
{
- __uprobe_perf_func(tu, func, regs, ucb);
+ __uprobe_perf_func(tu, func, regs, ucbp);
}

int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
@@ -1492,7 +1506,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
- struct uprobe_cpu_buffer *ucb;
+ struct uprobe_cpu_buffer *ucb = NULL;
int ret = 0;

tu = container_of(con, struct trace_uprobe, consumer);
@@ -1506,16 +1520,15 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;

- ucb = prepare_uprobe_buffer(tu, regs);
-
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- ret |= uprobe_trace_func(tu, regs, ucb);
+ ret |= uprobe_trace_func(tu, regs, &ucb);

#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- ret |= uprobe_perf_func(tu, regs, ucb);
+ ret |= uprobe_perf_func(tu, regs, &ucb);
#endif
- uprobe_buffer_put(ucb);
+ if (ucb)
+ uprobe_buffer_put(ucb);
return ret;
}

@@ -1524,7 +1537,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
- struct uprobe_cpu_buffer *ucb;
+ struct uprobe_cpu_buffer *ucb = NULL;

tu = container_of(con, struct trace_uprobe, consumer);

@@ -1536,16 +1549,15 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;

- ucb = prepare_uprobe_buffer(tu, regs);
-
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- uretprobe_trace_func(tu, func, regs, ucb);
+ uretprobe_trace_func(tu, func, regs, &ucb);

#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- uretprobe_perf_func(tu, func, regs, ucb);
+ uretprobe_perf_func(tu, func, regs, &ucb);
#endif
- uprobe_buffer_put(ucb);
+ if (ucb)
+ uprobe_buffer_put(ucb);
return 0;
}

--
2.43.0


2024-03-12 21:03:53

by Andrii Nakryiko

Subject: [PATCH bpf-next 3/3] uprobes: add speculative lockless system-wide uprobe filter check

It's very common with BPF-based uprobe/uretprobe use cases to have
system-wide (not PID-specific) probes. In this case the uprobe's
trace_uprobe_filter->nr_systemwide counter is bumped at registration time,
and the actual filtering is short-circuited at the time when the
uprobe/uretprobe is triggered.

This is a great optimization; the only issue with it is that even to check
this counter the uprobe subsystem takes the read side of
trace_uprobe_filter->rwlock. This is actually noticeable in profiles and is
just another point of contention when the uprobe is triggered on multiple
CPUs simultaneously.

This patch adds a speculative check before grabbing that rwlock. If
nr_systemwide is non-zero, lock is skipped and event is passed through.
From examining existing logic it looks correct and safe to do. If
nr_systemwide is being modified under rwlock in parallel, we have to
consider basically just one important race condition: the case when
nr_systemwide is dropped from one to zero (from
trace_uprobe_filter_remove()) under filter->rwlock, but
uprobe_perf_filter() raced and saw it as >0.

In this case, we'll proceed with uprobe/uretprobe execution, while
uprobe_perf_close() and uprobe_apply() will be blocked trying to grab
uprobe->register_rwsem as writers. They will be blocked because
uprobe_dispatcher() (and, similarly, uretprobe_dispatcher()) runs with
uprobe->register_rwsem taken as a reader. So there is no real race beyond
the uprobe/uretprobe possibly executing one last time before it's removed,
which is fine because from the user space perspective the uprobe/uretprobe
hasn't yet been deactivated.

In case we speculatively read nr_systemwide as zero, while it was
incremented in parallel, we'll proceed to grabbing filter->rwlock and
re-doing the check, this time in lock-protected and non-racy way.

As such, it looks safe to do a quick short-circuiting check and save some
performance in the very common system-wide case, without sacrificing hot
path performance due to the much rarer possibility of registration or
unregistration of uprobes.

Again, confirming with the BPF selftests-based benchmarks.

BEFORE (based on changes in previous patch)
===========================================
uprobe-nop : 2.732 ± 0.022M/s
uprobe-push : 2.621 ± 0.016M/s
uprobe-ret : 1.105 ± 0.007M/s
uretprobe-nop : 1.396 ± 0.007M/s
uretprobe-push : 1.347 ± 0.008M/s
uretprobe-ret : 0.800 ± 0.006M/s

AFTER
=====
uprobe-nop : 2.878 ± 0.017M/s (+5.5%, total +8.3%)
uprobe-push : 2.753 ± 0.013M/s (+5.3%, total +10.2%)
uprobe-ret : 1.142 ± 0.010M/s (+3.8%, total +3.8%)
uretprobe-nop : 1.444 ± 0.008M/s (+3.5%, total +6.5%)
uretprobe-push : 1.410 ± 0.010M/s (+4.8%, total +7.1%)
uretprobe-ret : 0.816 ± 0.002M/s (+2.0%, total +3.9%)

In the above, the first percentage value is relative to the previous patch
(lazy uprobe buffer optimization), while the "total" percentage is relative
to the kernel without any of the changes in this patch set.

As can be seen, we get about a 4% to 10% speedup, in total, with both the
lazy uprobe buffer and the speculative filter check optimizations.

Signed-off-by: Andrii Nakryiko <[email protected]>
---
kernel/trace/trace_uprobe.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index f2875349d124..be28e6d0578e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1351,6 +1351,10 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
tu = container_of(uc, struct trace_uprobe, consumer);
filter = tu->tp.event->filter;

+ /* speculative check */
+ if (READ_ONCE(filter->nr_systemwide))
+ return true;
+
read_lock(&filter->rwlock);
ret = __uprobe_perf_filter(filter, mm);
read_unlock(&filter->rwlock);
--
2.43.0


2024-03-13 09:42:11

by Jiri Olsa

Subject: Re: [PATCH bpf-next 0/3] uprobes: two common case speed ups

On Tue, Mar 12, 2024 at 02:02:30PM -0700, Andrii Nakryiko wrote:
> This patch set implements two speed ups for uprobe/uretprobe runtime execution
> path for some common scenarios: BPF-only uprobes (patches #1 and #2) and
> system-wide (non-PID-specific) uprobes (patch #3). Please see individual
> patches for details.
>
> Given I haven't worked with uprobe code before, I'm unfamiliar with
> conventions in this subsystem, including which kernel tree patches should be
> sent to. For now I based all the changes on top of bpf-next/master, which is
> where I tested and benchmarked everything anyways. Please advise what should
> I use as a base for subsequent revision. Thanks.
>
> Andrii Nakryiko (3):
> uprobes: encapsulate preparation of uprobe args buffer
> uprobes: prepare uprobe args buffer lazily
> uprobes: add speculative lockless system-wide uprobe filter check

nice cleanup and speed up, lgtm

Reviewed-by: Jiri Olsa <[email protected]>

jirka

>
> kernel/trace/trace_uprobe.c | 103 ++++++++++++++++++++++--------------
> 1 file changed, 63 insertions(+), 40 deletions(-)
>
> --
> 2.43.0
>
>

2024-03-13 13:21:12

by Oleg Nesterov

Subject: Re: [PATCH bpf-next 3/3] uprobes: add speculative lockless system-wide uprobe filter check

I forgot everything about this code, plus it has changed a lot since
I looked at it many years ago, but ...

I think this change is fine but the changelog looks a bit confusing
(overcomplicated) to me.

On 03/12, Andrii Nakryiko wrote:
>
> This patch adds a speculative check before grabbing that rwlock. If
> nr_systemwide is non-zero, lock is skipped and event is passed through.
> From examining existing logic it looks correct and safe to do. If
> nr_systemwide is being modified under rwlock in parallel, we have to
> consider basically just one important race condition: the case when
> nr_systemwide is dropped from one to zero (from
> trace_uprobe_filter_remove()) under filter->rwlock, but
> uprobe_perf_filter() raced and saw it as >0.

Unless I am totally confused, there is nothing new. Even without
this change trace_uprobe_filter_remove() can clear nr_systemwide
right after uprobe_perf_filter() drops filter->rwlock.

And of course, trace_uprobe_filter_add() can change nr_systemwide
from 0 to 1. In this case uprobe_perf_func() can "wrongly" return
UPROBE_HANDLER_REMOVE but we can't avoid this and afaics this is
fine even if handler_chain() does unapply_uprobe(), uprobe_perf_open()
will do uprobe_apply() after that, we can rely on ->register_rwsem.

> In case we speculatively read nr_systemwide as zero, while it was
> incremented in parallel, we'll proceed to grabbing filter->rwlock and
> re-doing the check, this time in lock-protected and non-racy way.

See above...


So I think uprobe_perf_filter() needs filter->rwlock only to iterate
the list, it can check nr_systemwide lockless and this means that you
can also remove the same check in __uprobe_perf_filter(), other callers
trace_uprobe_filter_add/remove check it themselves.
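
IOW, something like this (totally untested):

	static bool __uprobe_perf_filter(struct trace_uprobe_filter *filter,
					 struct mm_struct *mm)
	{
		struct perf_event *event;

		/* nr_systemwide is already checked locklessly by the caller */
		list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
			if (event->hw.target->mm == mm)
				return true;
		}

		return false;
	}

with uprobe_perf_filter() keeping your READ_ONCE(filter->nr_systemwide)
check before read_lock(&filter->rwlock).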


> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1351,6 +1351,10 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
> tu = container_of(uc, struct trace_uprobe, consumer);
> filter = tu->tp.event->filter;
>
> + /* speculative check */
> + if (READ_ONCE(filter->nr_systemwide))
> + return true;
> +
> read_lock(&filter->rwlock);
> ret = __uprobe_perf_filter(filter, mm);
> read_unlock(&filter->rwlock);

ACK,

but see above. I think the changelog should be simplified and the
filter->nr_systemwide check in __uprobe_perf_filter() should be
removed. But I won't insist and perhaps I missed something...

Oleg.


2024-03-13 15:17:01

by Oleg Nesterov

Subject: Re: [PATCH bpf-next 1/3] uprobes: encapsulate preparation of uprobe args buffer

LGTM, one nit below.

On 03/12, Andrii Nakryiko wrote:
>
> +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
> + struct pt_regs *regs)
> +{
> + struct uprobe_cpu_buffer *ucb;
> + int dsize, esize;
> +
> + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
> + dsize = __get_data_size(&tu->tp, regs);
> +
> + ucb = uprobe_buffer_get();
> + ucb->dsize = dsize;
> +
> + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
> +
> + return ucb;
> +}

OK, but note that every user of ->dsize adds tp.size. So I think you can
simplify this code a bit more if you change prepare_uprobe_buffer() to do

ucb->dsize = tu->tp.size + dsize;

and update the users.
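
i.e. the users then do something like (untested)

	size = esize + ucb->dsize;
	...
	memcpy(data, ucb->buf, ucb->dsize);

instead of re-adding tu->tp.size at every call site.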

Oleg.


2024-03-13 15:49:04

by Oleg Nesterov

Subject: Re: [PATCH bpf-next 2/3] uprobes: prepare uprobe args buffer lazily

Again, looks good to me, but I have a minor nit. Feel free to ignore.

On 03/12, Andrii Nakryiko wrote:
>
> static void __uprobe_trace_func(struct trace_uprobe *tu,
> unsigned long func, struct pt_regs *regs,
> - struct uprobe_cpu_buffer *ucb,
> + struct uprobe_cpu_buffer **ucbp,
> struct trace_event_file *trace_file)
> {
> struct uprobe_trace_entry_head *entry;
> struct trace_event_buffer fbuffer;
> + struct uprobe_cpu_buffer *ucb;
> void *data;
> int size, esize;
> struct trace_event_call *call = trace_probe_event_call(&tu->tp);
>
> + ucb = *ucbp;
> + if (!ucb) {
> + ucb = prepare_uprobe_buffer(tu, regs);
> + *ucbp = ucb;
> + }

perhaps it would be more clean to pass ucbp to prepare_uprobe_buffer()
and change it to do

if (*ucbp)
return *ucbp;

at the start. Then __uprobe_trace_func() and __uprobe_perf_func() can
simply do

ucb = prepare_uprobe_buffer(tu, regs, ucbp);

> - uprobe_buffer_put(ucb);
> + if (ucb)
> + uprobe_buffer_put(ucb);

Similarly, I think the "ucb != NULL" check should be shifted into
uprobe_buffer_put().
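
IOW, the end result could look like (completely untested):

	static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
							       struct pt_regs *regs,
							       struct uprobe_cpu_buffer **ucbp)
	{
		struct uprobe_cpu_buffer *ucb;
		int dsize, esize;

		ucb = *ucbp;
		if (ucb)
			return ucb;

		esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
		dsize = __get_data_size(&tu->tp, regs);

		ucb = uprobe_buffer_get();
		ucb->dsize = tu->tp.size + dsize;

		store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);

		*ucbp = ucb;
		return ucb;
	}

	static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
	{
		if (ucb)
			mutex_unlock(&ucb->mutex);
	}

so the dispatchers can call uprobe_buffer_put(ucb) unconditionally.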

Oleg.


2024-03-13 18:00:07

by Andrii Nakryiko

Subject: Re: [PATCH bpf-next 1/3] uprobes: encapsulate preparation of uprobe args buffer

On Wed, Mar 13, 2024 at 8:16 AM Oleg Nesterov <[email protected]> wrote:
>
> LGTM, one nit below.
>
> On 03/12, Andrii Nakryiko wrote:
> >
> > +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
> > + struct pt_regs *regs)
> > +{
> > + struct uprobe_cpu_buffer *ucb;
> > + int dsize, esize;
> > +
> > + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
> > + dsize = __get_data_size(&tu->tp, regs);
> > +
> > + ucb = uprobe_buffer_get();
> > + ucb->dsize = dsize;
> > +
> > + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
> > +
> > + return ucb;
> > +}
>
> OK, but note that every user of ->dsize adds tp.size. So I think you can
> simplify this code a bit more if you change prepare_uprobe_buffer() to do
>
> ucb->dsize = tu->tp.size + dsize;
>
> and update the users.
>

makes sense, done

> Oleg.
>

2024-03-13 18:01:28

by Andrii Nakryiko

Subject: Re: [PATCH bpf-next 2/3] uprobes: prepare uprobe args buffer lazily

On Wed, Mar 13, 2024 at 8:48 AM Oleg Nesterov <[email protected]> wrote:
>
> Again, looks good to me, but I have a minor nit. Feel free to ignore.
>
> On 03/12, Andrii Nakryiko wrote:
> >
> > static void __uprobe_trace_func(struct trace_uprobe *tu,
> > unsigned long func, struct pt_regs *regs,
> > - struct uprobe_cpu_buffer *ucb,
> > + struct uprobe_cpu_buffer **ucbp,
> > struct trace_event_file *trace_file)
> > {
> > struct uprobe_trace_entry_head *entry;
> > struct trace_event_buffer fbuffer;
> > + struct uprobe_cpu_buffer *ucb;
> > void *data;
> > int size, esize;
> > struct trace_event_call *call = trace_probe_event_call(&tu->tp);
> >
> > + ucb = *ucbp;
> > + if (!ucb) {
> > + ucb = prepare_uprobe_buffer(tu, regs);
> > + *ucbp = ucb;
> > + }
>
> perhaps it would be more clean to pass ucbp to prepare_uprobe_buffer()
> and change it to do
>
> if (*ucbp)
> return *ucbp;
>
> at the start. Then __uprobe_trace_func() and __uprobe_perf_func() can
> simply do
>
> ucb = prepare_uprobe_buffer(tu, regs, ucbp);

ok, will do

>
> > - uprobe_buffer_put(ucb);
> > + if (ucb)
> > + uprobe_buffer_put(ucb);
>
> Similarly, I think the "ucb != NULL" check should be shifted into
> uprobe_buffer_put().

sure, will hide it inside uprobe_buffer_put()

>
> Oleg.
>

2024-03-13 18:02:44

by Andrii Nakryiko

Subject: Re: [PATCH bpf-next 3/3] uprobes: add speculative lockless system-wide uprobe filter check

On Wed, Mar 13, 2024 at 6:20 AM Oleg Nesterov <[email protected]> wrote:
>
> I forgot everything about this code, plus it has changed a lot since
> I looked at it many years ago, but ...
>
> I think this change is fine but the changelog looks a bit confusing
> (overcomplicated) to me.

It's a new piece of code and logic, so I tried to do my due diligence
and argue why I think it's fine. I'll drop the overcomplicated
explanation, as I agree with you that it's inherently racy even
without my changes (and use-after-free safety is provided with
uprobe->register_rwsem independent from all this).

>
> On 03/12, Andrii Nakryiko wrote:
> >
> > This patch adds a speculative check before grabbing that rwlock. If
> > nr_systemwide is non-zero, lock is skipped and event is passed through.
> > From examining existing logic it looks correct and safe to do. If
> > nr_systemwide is being modified under rwlock in parallel, we have to
> > consider basically just one important race condition: the case when
> > nr_systemwide is dropped from one to zero (from
> > trace_uprobe_filter_remove()) under filter->rwlock, but
> > uprobe_perf_filter() raced and saw it as >0.
>
> Unless I am totally confused, there is nothing new. Even without
> this change trace_uprobe_filter_remove() can clear nr_systemwide
> right after uprobe_perf_filter() drops filter->rwlock.
>
> And of course, trace_uprobe_filter_add() can change nr_systemwide
> from 0 to 1. In this case uprobe_perf_func() can "wrongly" return
> UPROBE_HANDLER_REMOVE but we can't avoid this and afaics this is
> fine even if handler_chain() does unapply_uprobe(), uprobe_perf_open()
> will do uprobe_apply() after that, we can rely on ->register_rwsem.
>

yep, agreed

> > In case we speculatively read nr_systemwide as zero, while it was
> > incremented in parallel, we'll proceed to grabbing filter->rwlock and
> > re-doing the check, this time in lock-protected and non-racy way.
>
> See above...
>
>
> So I think uprobe_perf_filter() needs filter->rwlock only to iterate
> the list, it can check nr_systemwide lockless and this means that you
> can also remove the same check in __uprobe_perf_filter(), other callers
> trace_uprobe_filter_add/remove check it themselves.
>

makes sense, will do

>
> > --- a/kernel/trace/trace_uprobe.c
> > +++ b/kernel/trace/trace_uprobe.c
> > @@ -1351,6 +1351,10 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
> > tu = container_of(uc, struct trace_uprobe, consumer);
> > filter = tu->tp.event->filter;
> >
> > + /* speculative check */
> > + if (READ_ONCE(filter->nr_systemwide))
> > + return true;
> > +
> > read_lock(&filter->rwlock);
> > ret = __uprobe_perf_filter(filter, mm);
> > read_unlock(&filter->rwlock);
>
> ACK,
>
> but see above. I think the changelog should be simplified and the
> filter->nr_systemwide check in __uprobe_perf_filter() should be
> removed. But I won't insist and perhaps I missed something...
>

I think you are right, I'll move the check

> Oleg.
>

2024-03-13 18:44:57

by Andrii Nakryiko

Subject: Re: [PATCH bpf-next 0/3] uprobes: two common case speed ups

On Wed, Mar 13, 2024 at 2:41 AM Jiri Olsa <[email protected]> wrote:
>
> On Tue, Mar 12, 2024 at 02:02:30PM -0700, Andrii Nakryiko wrote:
> > This patch set implements two speed ups for uprobe/uretprobe runtime execution
> > path for some common scenarios: BPF-only uprobes (patches #1 and #2) and
> > system-wide (non-PID-specific) uprobes (patch #3). Please see individual
> > patches for details.
> >
> > Given I haven't worked with uprobe code before, I'm unfamiliar with
> > conventions in this subsystem, including which kernel tree patches should be
> > sent to. For now I based all the changes on top of bpf-next/master, which is
> > where I tested and benchmarked everything anyways. Please advise what should
> > I use as a base for subsequent revision. Thanks.

Steven, Masami,

Is this the kind of patches that should go through your tree(s)? Or
you'd be fine with this going through bpf-next? I'd appreciate the
link to the specific GIT repo I should use as a base in the former
case, thank you!

> >
> > Andrii Nakryiko (3):
> > uprobes: encapsulate preparation of uprobe args buffer
> > uprobes: prepare uprobe args buffer lazily
> > uprobes: add speculative lockless system-wide uprobe filter check
>
> nice cleanup and speed up, lgtm
>
> Reviewed-by: Jiri Olsa <[email protected]>
>
> jirka
>
> >
> > kernel/trace/trace_uprobe.c | 103 ++++++++++++++++++++++--------------
> > 1 file changed, 63 insertions(+), 40 deletions(-)
> >
> > --
> > 2.43.0
> >
> >