Send user_event data to attached eBPF programs for user_event based perf
events.
Add BPF_ITER flag to allow user_event data to have a zero copy path into
eBPF programs if required.
Update documentation to describe new flags and structures for eBPF
integration.
Signed-off-by: Beau Belgrave <[email protected]>
---
Documentation/trace/user_events.rst | 14 ++++--
include/uapi/linux/user_events.h | 53 +++++++++++++++++++++
kernel/trace/trace_events_user.c | 73 ++++++++++++++++++++++++++++-
3 files changed, 136 insertions(+), 4 deletions(-)
diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst
index c180936f49fc..bddedabaca80 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -7,7 +7,7 @@ user_events: User-based Event Tracing
Overview
--------
User based trace events allow user processes to create events and trace data
-that can be viewed via existing tools, such as ftrace and perf.
+that can be viewed via existing tools, such as ftrace, perf and eBPF.
To enable this feature, build your kernel with CONFIG_USER_EVENTS=y.
Programs can view status of the events via
@@ -67,7 +67,8 @@ The command string format is as follows::
Supported Flags
^^^^^^^^^^^^^^^
-None yet
+**BPF_ITER** - eBPF programs attached to this event will be given the raw
+iovec structs instead of copied data, for maximum performance.
Field Format
^^^^^^^^^^^^
@@ -159,7 +160,7 @@ The following values are defined to aid in checking what has been attached:
**EVENT_STATUS_FTRACE** - Bit set if ftrace has been attached (Bit 0).
-**EVENT_STATUS_PERF** - Bit set if perf has been attached (Bit 1).
+**EVENT_STATUS_PERF** - Bit set if perf/eBPF has been attached (Bit 1).
Writing Data
------------
@@ -203,6 +204,13 @@ It's advised for user programs to do the following::
**NOTE:** *The write_index is not emitted out into the trace being recorded.*
+EBPF
+----
+eBPF programs that attach to a user_event tracepoint are given a pointer to a
+struct user_bpf_context. The context describes the data type (a user buffer, a
+kernel buffer, or a pointer to the iovec) and the length of the data that was
+emitted (excluding the write_index).
+
Example Code
------------
See sample code in samples/user_events.
diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index 736e05603463..e570840571e1 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -32,6 +32,9 @@
/* Create dynamic location entry within a 32-bit value */
#define DYN_LOC(offset, size) ((size) << 16 | (offset))
+/* Use raw iterator for attached BPF program(s), no effect on ftrace/perf */
+#define FLAG_BPF_ITER (1 << 0)
+
/*
* Describes an event registration and stores the results of the registration.
* This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum
@@ -60,4 +63,54 @@ struct user_reg {
/* Requests to delete a user_event */
#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*)
+/* Data type that was passed to the BPF program */
+enum {
+	/* Data resides in kernel space */
+	USER_BPF_DATA_KERNEL,
+
+	/* Data resides in user space */
+	USER_BPF_DATA_USER,
+
+	/* Data is a pointer to a user_bpf_iter structure */
+	USER_BPF_DATA_ITER,
+};
+
+/*
+ * Describes an iovec iterator that BPF programs can use to access data for
+ * a given user_event write() / writev() call.
+ */
+struct user_bpf_iter {
+
+	/* Offset of the data within the first iovec */
+	__u32 iov_offset;
+
+	/* Number of iovec structures */
+	__u32 nr_segs;
+
+	/* Pointer to iovec structures */
+	const struct iovec *iov;
+};
+
+/* Context that BPF programs receive when attached to a user_event */
+struct user_bpf_context {
+
+	/* Data type being passed (see union below) */
+	__u32 data_type;
+
+	/* Length of the data */
+	__u32 data_len;
+
+	/* Pointer to data, varies by data type */
+	union {
+		/* Kernel data (data_type == USER_BPF_DATA_KERNEL) */
+		void *kdata;
+
+		/* User data (data_type == USER_BPF_DATA_USER) */
+		void *udata;
+
+		/* Direct iovec (data_type == USER_BPF_DATA_ITER) */
+		struct user_bpf_iter *iter;
+	};
+};
+
#endif /* _UAPI_LINUX_USER_EVENTS_H */
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 3bc97e44253f..8b3d241a31c2 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -42,6 +42,9 @@
#define MAX_FIELD_ARRAY_SIZE 1024
#define MAX_FIELD_ARG_NAME 256
+#define MAX_BPF_COPY_SIZE PAGE_SIZE
+#define MAX_STACK_BPF_DATA 512
+
static char *register_page_data;
static DEFINE_MUTEX(reg_mutex);
@@ -402,6 +405,19 @@ static int user_event_parse_field(char *field, struct user_event *user,
type[0] != 'u', FILTER_OTHER);
}
+static void user_event_parse_flags(struct user_event *user, char *flags)
+{
+	char *flag;
+
+	if (flags == NULL)
+		return;
+
+	while ((flag = strsep(&flags, ",")) != NULL) {
+		if (strcmp(flag, "BPF_ITER") == 0)
+			user->flags |= FLAG_BPF_ITER;
+	}
+}
+
static int user_event_parse_fields(struct user_event *user, char *args)
{
char *field;
@@ -697,14 +713,64 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
}
#ifdef CONFIG_PERF_EVENTS
+static void user_event_bpf(struct user_event *user, struct iov_iter *i)
+{
+	struct user_bpf_context context;
+	struct user_bpf_iter bpf_i;
+	char fast_data[MAX_STACK_BPF_DATA];
+	void *temp = NULL;
+
+	if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
+		/* Raw iterator */
+		context.data_type = USER_BPF_DATA_ITER;
+		context.data_len = i->count;
+		context.iter = &bpf_i;
+
+		bpf_i.iov_offset = i->iov_offset;
+		bpf_i.iov = i->iov;
+		bpf_i.nr_segs = i->nr_segs;
+	} else if (i->nr_segs == 1 && iter_is_iovec(i)) {
+		/* Single buffer from user */
+		context.data_type = USER_BPF_DATA_USER;
+		context.data_len = i->count;
+		context.udata = i->iov->iov_base + i->iov_offset;
+	} else {
+		/* Multi buffer from user */
+		struct iov_iter copy = *i;
+		size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);
+
+		context.data_type = USER_BPF_DATA_KERNEL;
+		context.kdata = fast_data;
+
+		if (unlikely(copy_size > sizeof(fast_data))) {
+			temp = kmalloc(copy_size, GFP_NOWAIT);
+
+			if (temp)
+				context.kdata = temp;
+			else
+				copy_size = sizeof(fast_data);
+		}
+
+		context.data_len = copy_nofault(context.kdata,
+						copy_size, &copy);
+	}
+
+	trace_call_bpf(&user->call, &context);
+
+	kfree(temp);
+}
+
/*
- * Writes the user supplied payload out to perf ring buffer.
+ * Writes the user supplied payload out to perf ring buffer or eBPF program.
*/
static void user_event_perf(struct user_event *user, struct iov_iter *i,
void *tpdata, bool *faulted)
{
struct hlist_head *perf_head;
+ if (bpf_prog_array_valid(&user->call))
+ user_event_bpf(user, i);
+
perf_head = this_cpu_ptr(user->call.perf_events);
if (perf_head && !hlist_empty(perf_head)) {
@@ -1070,6 +1136,8 @@ static int user_event_parse(char *name, char *args, char *flags,
user->tracepoint.name = name;
+ user_event_parse_flags(user, flags);
+
ret = user_event_parse_fields(user, args);
if (ret)
@@ -1507,6 +1575,9 @@ static int user_seq_show(struct seq_file *m, void *p)
busy++;
}
+ if (flags & FLAG_BPF_ITER)
+ seq_puts(m, " FLAG:BPF_ITER");
+
seq_puts(m, "\n");
active++;
}
--
2.25.1
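
For illustration, here is a minimal sketch of a BPF-side consumer of the
struct user_bpf_context documented above. The section name, event name, map
layout and helper choices are assumptions made for the sketch and are not
part of this patch:

#include <linux/bpf.h>
#include <linux/user_events.h>
#include <bpf/bpf_helpers.h>

/* Count payloads whose first 4 bytes were non-zero */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} hits SEC(".maps");

SEC("tracepoint/user_events/test")
int handle_user_event(struct user_bpf_context *ctx)
{
	__u32 key = 0, payload = 0;
	__u64 *count;

	/* The payload may arrive as a user or kernel buffer */
	if (ctx->data_type == USER_BPF_DATA_USER)
		bpf_probe_read_user(&payload, sizeof(payload), ctx->udata);
	else if (ctx->data_type == USER_BPF_DATA_KERNEL)
		bpf_probe_read_kernel(&payload, sizeof(payload), ctx->kdata);

	count = bpf_map_lookup_elem(&hits, &key);
	if (count && payload)
		__sync_fetch_and_add(count, 1);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";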
On Wed, Mar 30, 2022 at 2:27 PM Beau Belgrave <[email protected]> wrote:
>
> On Wed, Mar 30, 2022 at 01:39:49PM -0700, Alexei Starovoitov wrote:
> > On Wed, Mar 30, 2022 at 12:15 PM Beau Belgrave
> > <[email protected]> wrote:
> > >
> > > On Wed, Mar 30, 2022 at 11:22:32AM -0700, Alexei Starovoitov wrote:
> > > > On Wed, Mar 30, 2022 at 9:34 AM Beau Belgrave <[email protected]> wrote:
> > > > > > >
> > > > > > > But you are fine with uprobe costs? uprobes appear to be much more costly
> > > > > > > than a syscall approach on the hardware I've run on.
> > > >
> > > > Care to share the numbers?
> > > > uprobe over USDT is a single trap.
> > > > Not much slower compared to syscall with kpti.
> > > >
> > >
> > > Sure, these are the numbers we have from a production device.
> > >
> > > They are captured via perf via PERF_COUNT_HW_CPU_CYCLES.
> > > It's running a 20K loop emitting 4 bytes of data out.
> > > Each 4 byte event time is recorded via perf.
> > > At the end we have the total time and the max seen.
> > >
> > > null numbers represent a 20K loop with just perf start/stop ioctl costs.
> > >
> > > null: min=2863, avg=2953, max=30815
> > > uprobe: min=10994, avg=11376, max=146682
> >
> > I suspect it's a 3 trap case of uprobe.
> > USDT is a nop. It's a 1 trap case.
> >
> > > uevent: min=7043, avg=7320, max=95396
> > > lttng: min=6270, avg=6508, max=41951
> > >
> > > These costs include the data getting into a buffer, so they represent
> > > what we would see in production vs the trap cost alone. For uprobe this
> > > means we created a uprobe and attached it via tracefs to get the above
> > > numbers.
> > >
> > > There also seems to be some thinking around this as well from Song Liu.
> > > Link: https://lore.kernel.org/lkml/[email protected]/
> > >
> > > From the link:
> > > 1. User programs are faster. The new selftest added in 5/5, shows that a
> > > simple uprobe program takes 1400 nanoseconds, while user program only
> > > takes 300 nanoseconds.
> >
> >
> > Take a look at Song's code. It's 2 trap case.
> > The USDT is a half of that. ~700ns.
> > Compared to 300ns of syscall that difference
> > could be acceptable.
> >
> > >
> > > > > >
> > > > > > Can we achieve the same/similar performance with sys_bpf(BPF_PROG_RUN)?
> > > > > >
> > > > >
> > > > > I think so, the tough part is how do you let the user-space know which
> > > > > program is attached to run? In the current code this is done by the BPF
> > > > > program attaching to the event via perf and we run the one there if
> > > > > any when data is emitted out via write calls.
> > > > >
> > > > > I would want to make sure that operators can decide where the user-space
> > > > > data goes (perf/ftrace/eBPF) after the code has been written. With the
> > > > > current code this is done via the tracepoint callbacks that perf/ftrace
> > > > > hook up when operators enable recording via perf, tracefs, libbpf, etc.
> > > > >
> > > > > We have managed code (C#/Java) where we cannot utilize stubs or traps
> > > > > easily due to code movement. So we are limited in how we can approach
> > > > > this problem. Having the interface be mmap/write has enabled this
> > > > > for us, since it's easy to interact with in most languages and gives us
> > > > > lifetime management of the trace objects between user-space and the
> > > > > kernel.
> > > >
> > > > Then you should probably invest into making USDT work inside
> > > > java applications instead of reinventing the wheel.
> > > >
> > > > As an alternative you can do a dummy write or any other syscall
> > > > and attach bpf on the kernel side.
> > > > No kernel changes are necessary.
> > >
> > > We only want syscall/tracing overheads for the specific events that are
> > > hooked. I don't see how we could hook up a dummy write that is unique
> > > per-event without having a way to know when the event is being traced.
> >
> > You're adding writev-s to user apps. Keep that writev without
> > any user_events on the kernel side and pass -1 as FD.
> > Hook bpf prog to sys_writev and filter by pid.
>
> I see. That would have all events incur a syscall cost regardless of whether
> a BPF program is attached. We are typically monitoring all processes, so we
> would not want that overhead on each writev invocation.
>
> We would also have to decode each writev payload to determine if it's the
> event we are interested in. The mmap part of user_events solves that for us:
> the byte/bits get set to non-zero only when the writev cost is worth it.
Please don't reinvent the wheel.
This problem is already solved by USDT semaphores.
On Wed, Mar 30, 2022 at 01:39:49PM -0700, Alexei Starovoitov wrote:
> On Wed, Mar 30, 2022 at 12:15 PM Beau Belgrave
> <[email protected]> wrote:
> >
> > On Wed, Mar 30, 2022 at 11:22:32AM -0700, Alexei Starovoitov wrote:
> > > On Wed, Mar 30, 2022 at 9:34 AM Beau Belgrave <[email protected]> wrote:
> > > > > >
> > > > > > But you are fine with uprobe costs? uprobes appear to be much more costly
> > > > > > than a syscall approach on the hardware I've run on.
> > >
> > > Care to share the numbers?
> > > uprobe over USDT is a single trap.
> > > Not much slower compared to syscall with kpti.
> > >
> >
> > Sure, these are the numbers we have from a production device.
> >
> > They are captured via perf via PERF_COUNT_HW_CPU_CYCLES.
> > It's running a 20K loop emitting 4 bytes of data out.
> > Each 4 byte event time is recorded via perf.
> > At the end we have the total time and the max seen.
> >
> > null numbers represent a 20K loop with just perf start/stop ioctl costs.
> >
> > null: min=2863, avg=2953, max=30815
> > uprobe: min=10994, avg=11376, max=146682
>
> I suspect it's a 3 trap case of uprobe.
> USDT is a nop. It's a 1 trap case.
>
> > uevent: min=7043, avg=7320, max=95396
> > lttng: min=6270, avg=6508, max=41951
> >
> > These costs include the data getting into a buffer, so they represent
> > what we would see in production vs the trap cost alone. For uprobe this
> > means we created a uprobe and attached it via tracefs to get the above
> > numbers.
> >
> > There also seems to be some thinking around this as well from Song Liu.
> > Link: https://lore.kernel.org/lkml/[email protected]/
> >
> > From the link:
> > 1. User programs are faster. The new selftest added in 5/5, shows that a
> > simple uprobe program takes 1400 nanoseconds, while user program only
> > takes 300 nanoseconds.
>
>
> Take a look at Song's code. It's 2 trap case.
> The USDT is a half of that. ~700ns.
> Compared to 300ns of syscall that difference
> could be acceptable.
>
> >
> > > > >
> > > > > Can we achieve the same/similar performance with sys_bpf(BPF_PROG_RUN)?
> > > > >
> > > >
> > > > I think so, the tough part is how do you let the user-space know which
> > > > program is attached to run? In the current code this is done by the BPF
> > > > program attaching to the event via perf and we run the one there if
> > > > any when data is emitted out via write calls.
> > > >
> > > > I would want to make sure that operators can decide where the user-space
> > > > data goes (perf/ftrace/eBPF) after the code has been written. With the
> > > > current code this is done via the tracepoint callbacks that perf/ftrace
> > > > hook up when operators enable recording via perf, tracefs, libbpf, etc.
> > > >
> > > > We have managed code (C#/Java) where we cannot utilize stubs or traps
> > > > easily due to code movement. So we are limited in how we can approach
> > > > this problem. Having the interface be mmap/write has enabled this
> > > > for us, since it's easy to interact with in most languages and gives us
> > > > lifetime management of the trace objects between user-space and the
> > > > kernel.
> > >
> > > Then you should probably invest into making USDT work inside
> > > java applications instead of reinventing the wheel.
> > >
> > > As an alternative you can do a dummy write or any other syscall
> > > and attach bpf on the kernel side.
> > > No kernel changes are necessary.
> >
> > We only want syscall/tracing overheads for the specific events that are
> > hooked. I don't see how we could hook up a dummy write that is unique
> > per-event without having a way to know when the event is being traced.
>
> You're adding writev-s to user apps. Keep that writev without
> any user_events on the kernel side and pass -1 as FD.
> Hook bpf prog to sys_writev and filter by pid.
I see. That would have all events incur a syscall cost regardless of whether
a BPF program is attached. We are typically monitoring all processes, so we
would not want that overhead on each writev invocation.

We would also have to decode each writev payload to determine if it's the
event we are interested in. The mmap part of user_events solves that for us:
the byte/bits get set to non-zero only when the writev cost is worth it.
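
The fast path on our side looks roughly like the sketch below (simplified; it
assumes tracefs is mounted at /sys/kernel/tracing, and the status_index /
write_index values really come from the DIAG_IOCSREG registration ioctl,
which is elided here):

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <unistd.h>

static int status_index;	/* filled in by DIAG_IOCSREG (elided) */
static int write_index;		/* filled in by DIAG_IOCSREG (elided) */

int main(void)
{
	int status_fd = open("/sys/kernel/tracing/user_events_status", O_RDONLY);
	int data_fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);
	long page_size = sysconf(_SC_PAGESIZE);
	char *status_page;
	int payload = 42;

	status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, status_fd, 0);

	/* ... register the event via DIAG_IOCSREG to get the indexes ... */

	/* Only pay for the writev() when a tracer has flipped our byte */
	if (status_page[status_index]) {
		struct iovec io[2];

		io[0].iov_base = &write_index;
		io[0].iov_len = sizeof(write_index);
		io[1].iov_base = &payload;
		io[1].iov_len = sizeof(payload);

		writev(data_fd, io, 2);
	}

	return 0;
}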
Thanks,
-Beau
On Tue, Mar 29, 2022 at 11:19 AM Beau Belgrave
<[email protected]> wrote:
>
> Send user_event data to attached eBPF programs for user_event based perf
> events.
>
> Add BPF_ITER flag to allow user_event data to have a zero copy path into
> eBPF programs if required.
>
> Update documentation to describe new flags and structures for eBPF
> integration.
>
> Signed-off-by: Beau Belgrave <[email protected]>
The commit describes _what_ it does, but says nothing about _why_.
At present I see no use out of bpf and user_events connection.
The whole user_events feature looks redundant to me.
We have uprobes and usdt. It doesn't look to me that
user_events provide anything new that wasn't available earlier.
On Wed, Mar 30, 2022 at 09:06:24AM -0700, Song Liu wrote:
> On Tue, Mar 29, 2022 at 4:11 PM Beau Belgrave <[email protected]> wrote:
> >
> > On Tue, Mar 29, 2022 at 03:31:31PM -0700, Alexei Starovoitov wrote:
> > > On Tue, Mar 29, 2022 at 1:11 PM Beau Belgrave <[email protected]> wrote:
> > > >
> > > > On Tue, Mar 29, 2022 at 12:50:40PM -0700, Alexei Starovoitov wrote:
> > > > > On Tue, Mar 29, 2022 at 11:19 AM Beau Belgrave
> > > > > <[email protected]> wrote:
> > > > > >
> > > > > > Send user_event data to attached eBPF programs for user_event based perf
> > > > > > events.
> > > > > >
> > > > > > Add BPF_ITER flag to allow user_event data to have a zero copy path into
> > > > > > eBPF programs if required.
> > > > > >
> > > > > > Update documentation to describe new flags and structures for eBPF
> > > > > > integration.
> > > > > >
> > > > > > Signed-off-by: Beau Belgrave <[email protected]>
> > > > >
> > > > > The commit describes _what_ it does, but says nothing about _why_.
> > > > > At present I see no use out of bpf and user_events connection.
> > > > > The whole user_events feature looks redundant to me.
> > > > > We have uprobes and usdt. It doesn't look to me that
> > > > > user_events provide anything new that wasn't available earlier.
> > > >
> > > > A lot of the why, in general, for user_events is covered in the first
> > > > change in the series.
> > > > Link: https://lore.kernel.org/all/[email protected]/
> > > >
> > > > The why was also covered in Linux Plumbers Conference 2021 within the
> > > > tracing microconference.
> > > >
> > > > An example of why we want user_events:
> > > > Managed code running that emits data out via Open Telemetry.
> > > > Since it's managed there isn't a stub location to patch, it moves.
> > > > We watch the Open Telemetry spans in an eBPF program, when a span takes
> > > > too long we collect stack data and perform other actions.
> > > > With user_events and perf we can monitor the entire system from the root
> > > > container without having to have relay agents within each
> > > > cgroup/namespace taking up resources.
> > > > We do not need to enter each cgroup mnt space and determine the correct
> > > > patch location or the right version of each binary for processes that
> > > > use user_events.
> > > >
> > > > An example of why we want eBPF integration:
> > > > We also have scenarios where we are live decoding the data quickly.
> > > > Having user_data fed directly to eBPF lets us cast the data coming in to
> > > > a struct and decode very very quickly to determine if something is
> > > > wrong.
> > > > We can take that data quickly and put it into maps to perform further
> > > > aggregation as required.
> > > > We have scenarios that have "skid" problems, where we need to grab
> > > > further data exactly when the process that had the problem was running.
> > > > eBPF lets us do all of this that we cannot easily do otherwise.
> > > >
> > > > Another benefit from user_events is the tracing is much faster than
> > > > uprobes or others using int 3 traps. This is critical to us to enable on
> > > > production systems.
> > >
> > > None of it makes sense to me.
> >
> > Sorry.
> >
> > > To take advantage of user_events user space has to be modified
> > > and writev syscalls inserted.
> >
> > Yes, both user_events and lttng require user space modifications to do
> > tracing correctly. The syscall overheads are real, and the cost depends
> > on the mitigations around spectre/meltdown.
> >
> > > This is not cheap and I cannot see a production system using this interface.
> >
> > But you are fine with uprobe costs? uprobes appear to be much more costly
> > than a syscall approach on the hardware I've run on.
>
> Can we achieve the same/similar performance with sys_bpf(BPF_PROG_RUN)?
>
I think so, the tough part is how do you let the user-space know which
program is attached to run? In the current code this is done by the BPF
program attaching to the event via perf and we run the one there if
any when data is emitted out via write calls.
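
Concretely, the attach today is just the normal perf tracepoint path, roughly
like the sketch below (it assumes tracefs at /sys/kernel/tracing and a
user_event named "test"; error handling is trimmed):

#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_bpf_to_user_event(int bpf_prog_fd)
{
	struct perf_event_attr attr;
	char buf[32];
	ssize_t n;
	int id_fd, perf_fd;

	/* Tracepoint id of the user_event, as exposed via tracefs */
	id_fd = open("/sys/kernel/tracing/events/user_events/test/id", O_RDONLY);
	if (id_fd < 0)
		return -1;

	n = read(id_fd, buf, sizeof(buf) - 1);
	close(id_fd);
	if (n <= 0)
		return -1;
	buf[n] = '\0';

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = atoi(buf);

	perf_fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
	if (perf_fd < 0)
		return -1;

	/* Run the program whenever the user_event fires on this CPU */
	ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd);
	ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);

	return perf_fd;
}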
I would want to make sure that operators can decide where the user-space
data goes (perf/ftrace/eBPF) after the code has been written. With the
current code this is done via the tracepoint callbacks that perf/ftrace
hook up when operators enable recording via perf, tracefs, libbpf, etc.
We have managed code (C#/Java) where we cannot utilize stubs or traps
easily due to code movement. So we are limited in how we can approach
this problem. Having the interface be mmap/write has enabled this
for us, since it's easy to interact with in most languages and gives us
lifetime management of the trace objects between user-space and the
kernel.
> Thanks,
> Song
>
> >
> > > All you did is a poor man version of lttng that doesn't rely
> > > on such heavy instrumentation.
> >
> > Well I am a frugal person. :)
> >
> > This work has solved some critical issues we've been having, and I would
> > appreciate a review of the code if possible.
> >
> > Thanks,
> > -Beau
Thanks,
-Beau
----- On Mar 30, 2022, at 3:15 PM, Beau Belgrave [email protected] wrote:
> On Wed, Mar 30, 2022 at 11:22:32AM -0700, Alexei Starovoitov wrote:
>> On Wed, Mar 30, 2022 at 9:34 AM Beau Belgrave <[email protected]> wrote:
>> > > >
>> > > > But you are fine with uprobe costs? uprobes appear to be much more costly
>> > > > than a syscall approach on the hardware I've run on.
>>
>> Care to share the numbers?
>> uprobe over USDT is a single trap.
>> Not much slower compared to syscall with kpti.
>>
>
> Sure, these are the numbers we have from a production device.
>
> They are captured via perf via PERF_COUNT_HW_CPU_CYCLES.
> It's running a 20K loop emitting 4 bytes of data out.
> Each 4 byte event time is recorded via perf.
> At the end we have the total time and the max seen.
>
> null numbers represent a 20K loop with just perf start/stop ioctl costs.
>
> null: min=2863, avg=2953, max=30815
> uprobe: min=10994, avg=11376, max=146682
> uevent: min=7043, avg=7320, max=95396
> lttng: min=6270, avg=6508, max=41951
>
> These costs include the data getting into a buffer, so they represent
> what we would see in production vs the trap cost alone. For uprobe this
> means we created a uprobe and attached it via tracefs to get the above
> numbers.
[...]
I assume here that by "lttng" you specifically refer to lttng-ust (LTTng's
user-space tracer), am I correct ?
By removing the "null" baseline overhead, my rough calculations are that the
average overhead for lttng-ust in your results is (in cpu cycles):
6508 - 2953 = 3555
So I'm unsure what is the frequency of your CPU, but guessing around 3.5GHz
this is in the area of 1 microsecond. On an Intel CPU, this is much larger
than what I would expect.
Can you share your test program, hardware characteristics, kernel version,
glibc version, and whether the program is compiled as a 32-bit or 64-bit
binary ?
Can you confirm that lttng-ust is not calling one getcpu system call per
event ? This might be the case if you run a 32-bit x86 binary and have a
glibc < 2.35, or if your kernel is too old to provide CONFIG_RSEQ or does
not have CONFIG_RSEQ=y in its configuration. You can validate this by
running your lttng-ust test program with a system call tracer.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Wed, Mar 30, 2022 at 9:34 AM Beau Belgrave <[email protected]> wrote:
> > >
> > > But you are fine with uprobe costs? uprobes appear to be much more costly
> > > than a syscall approach on the hardware I've run on.
Care to share the numbers?
uprobe over USDT is a single trap.
Not much slower compared to syscall with kpti.
> >
> > Can we achieve the same/similar performance with sys_bpf(BPF_PROG_RUN)?
> >
>
> I think so, the tough part is how do you let the user-space know which
> program is attached to run? In the current code this is done by the BPF
> program attaching to the event via perf and we run the one there if
> any when data is emitted out via write calls.
>
> I would want to make sure that operators can decide where the user-space
> data goes (perf/ftrace/eBPF) after the code has been written. With the
> current code this is done via the tracepoint callbacks that perf/ftrace
> hook up when operators enable recording via perf, tracefs, libbpf, etc.
>
> We have managed code (C#/Java) where we cannot utilize stubs or traps
> easily due to code movement. So we are limited in how we can approach
> this problem. Having the interface be mmap/write has enabled this
> for us, since it's easy to interact with in most languages and gives us
> lifetime management of the trace objects between user-space and the
> kernel.
Then you should probably invest into making USDT work inside
java applications instead of reinventing the wheel.
As an alternative you can do a dummy write or any other syscall
and attach bpf on the kernel side.
No kernel changes are necessary.
On Wed, Mar 30, 2022 at 11:22:32AM -0700, Alexei Starovoitov wrote:
> On Wed, Mar 30, 2022 at 9:34 AM Beau Belgrave <[email protected]> wrote:
> > > >
> > > > But you are fine with uprobe costs? uprobes appear to be much more costly
> > > > than a syscall approach on the hardware I've run on.
>
> Care to share the numbers?
> uprobe over USDT is a single trap.
> Not much slower compared to syscall with kpti.
>
Sure, these are the numbers we have from a production device.
They are captured via perf via PERF_COUNT_HW_CPU_CYCLES.
It's running a 20K loop emitting 4 bytes of data out.
Each 4 byte event time is recorded via perf.
At the end we have the total time and the max seen.
null numbers represent a 20K loop with just perf start/stop ioctl costs.
null: min=2863, avg=2953, max=30815
uprobe: min=10994, avg=11376, max=146682
uevent: min=7043, avg=7320, max=95396
lttng: min=6270, avg=6508, max=41951
These costs include the data getting into a buffer, so they represent
what we would see in production vs the trap cost alone. For uprobe this
means we created a uprobe and attached it via tracefs to get the above
numbers.
There also seems to be some thinking around this as well from Song Liu.
Link: https://lore.kernel.org/lkml/[email protected]/
From the link:
1. User programs are faster. The new selftest added in 5/5, shows that a
simple uprobe program takes 1400 nanoseconds, while user program only
takes 300 nanoseconds.
> > >
> > > Can we achieve the same/similar performance with sys_bpf(BPF_PROG_RUN)?
> > >
> >
> > I think so, the tough part is how do you let the user-space know which
> > program is attached to run? In the current code this is done by the BPF
> > program attaching to the event via perf and we run the one there if
> > any when data is emitted out via write calls.
> >
> > I would want to make sure that operators can decide where the user-space
> > data goes (perf/ftrace/eBPF) after the code has been written. With the
> > current code this is done via the tracepoint callbacks that perf/ftrace
> > hook up when operators enable recording via perf, tracefs, libbpf, etc.
> >
> > We have managed code (C#/Java) where we cannot utilize stubs or traps
> > easily due to code movement. So we are limited in how we can approach
> > this problem. Having the interface be mmap/write has enabled this
> > for us, since it's easy to interact with in most languages and gives us
> > lifetime management of the trace objects between user-space and the
> > kernel.
>
> Then you should probably invest into making USDT work inside
> java applications instead of reinventing the wheel.
>
> As an alternative you can do a dummy write or any other syscall
> and attach bpf on the kernel side.
> No kernel changes are necessary.
We only want syscall/tracing overheads for the specific events that are
hooked. I don't see how we could hook up a dummy write that is unique
per-event without having a way to know when the event is being traced.
Thanks,
-Beau
On Wed, Mar 30, 2022 at 12:15 PM Beau Belgrave
<[email protected]> wrote:
>
> On Wed, Mar 30, 2022 at 11:22:32AM -0700, Alexei Starovoitov wrote:
> > On Wed, Mar 30, 2022 at 9:34 AM Beau Belgrave <[email protected]> wrote:
> > > > >
> > > > > But you are fine with uprobe costs? uprobes appear to be much more costly
> > > > > than a syscall approach on the hardware I've run on.
> >
> > Care to share the numbers?
> > uprobe over USDT is a single trap.
> > Not much slower compared to syscall with kpti.
> >
>
> Sure, these are the numbers we have from a production device.
>
> They are captured via perf via PERF_COUNT_HW_CPU_CYCLES.
> It's running a 20K loop emitting 4 bytes of data out.
> Each 4 byte event time is recorded via perf.
> At the end we have the total time and the max seen.
>
> null numbers represent a 20K loop with just perf start/stop ioctl costs.
>
> null: min=2863, avg=2953, max=30815
> uprobe: min=10994, avg=11376, max=146682
I suspect it's a 3 trap case of uprobe.
USDT is a nop. It's a 1 trap case.
> uevent: min=7043, avg=7320, max=95396
> lttng: min=6270, avg=6508, max=41951
>
> These costs include the data getting into a buffer, so they represent
> what we would see in production vs the trap cost alone. For uprobe this
> means we created a uprobe and attached it via tracefs to get the above
> numbers.
>
> There also seems to be some thinking around this as well from Song Liu.
> Link: https://lore.kernel.org/lkml/[email protected]/
>
> From the link:
> 1. User programs are faster. The new selftest added in 5/5, shows that a
> simple uprobe program takes 1400 nanoseconds, while user program only
> takes 300 nanoseconds.
Take a look at Song's code. It's 2 trap case.
The USDT is a half of that. ~700ns.
Compared to 300ns of syscall that difference
could be acceptable.
>
> > > >
> > > > Can we achieve the same/similar performance with sys_bpf(BPF_PROG_RUN)?
> > > >
> > >
> > > I think so, the tough part is how do you let the user-space know which
> > > program is attached to run? In the current code this is done by the BPF
> > > program attaching to the event via perf and we run the one there if
> > > any when data is emitted out via write calls.
> > >
> > > I would want to make sure that operators can decide where the user-space
> > > data goes (perf/ftrace/eBPF) after the code has been written. With the
> > > current code this is done via the tracepoint callbacks that perf/ftrace
> > > hook up when operators enable recording via perf, tracefs, libbpf, etc.
> > >
> > > We have managed code (C#/Java) where we cannot utilize stubs or traps
> > > easily due to code movement. So we are limited in how we can approach
> > > this problem. Having the interface be mmap/write has enabled this
> > > for us, since it's easy to interact with in most languages and gives us
> > > lifetime management of the trace objects between user-space and the
> > > kernel.
> >
> > Then you should probably invest into making USDT work inside
> > java applications instead of reinventing the wheel.
> >
> > As an alternative you can do a dummy write or any other syscall
> > and attach bpf on the kernel side.
> > No kernel changes are necessary.
>
> We only want syscall/tracing overheads for the specific events that are
> hooked. I don't see how we could hook up a dummy write that is unique
> per-event without having a way to know when the event is being traced.
You're adding writev-s to user apps. Keep that writev without
any user_events on the kernel side and pass -1 as FD.
Hook bpf prog to sys_writev and filter by pid.
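
Roughly, the BPF side of that alternative would be (just a sketch, the map
layout and pid plumbing are illustrative):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* pid of the instrumented app, set from user space before load */
const volatile __u32 target_pid = 0;

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} writev_hits SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_writev")
int on_writev(void *ctx)
{
	__u32 key = 0;
	__u64 *count;

	/* Filter down to the one process we care about */
	if ((bpf_get_current_pid_tgid() >> 32) != target_pid)
		return 0;

	/*
	 * A real program would also pull the iovec pointer out of the
	 * tracepoint args and bpf_probe_read_user() the payload here.
	 */
	count = bpf_map_lookup_elem(&writev_hits, &key);
	if (count)
		__sync_fetch_and_add(count, 1);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";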
On Wed, Mar 30, 2022 at 03:57:26PM -0400, Mathieu Desnoyers wrote:
> ----- On Mar 30, 2022, at 3:15 PM, Beau Belgrave [email protected] wrote:
>
> > On Wed, Mar 30, 2022 at 11:22:32AM -0700, Alexei Starovoitov wrote:
> >> On Wed, Mar 30, 2022 at 9:34 AM Beau Belgrave <[email protected]> wrote:
> >> > > >
> >> > > > But you are fine with uprobe costs? uprobes appear to be much more costly
> >> > > > than a syscall approach on the hardware I've run on.
> >>
> >> Care to share the numbers?
> >> uprobe over USDT is a single trap.
> >> Not much slower compared to syscall with kpti.
> >>
> >
> > Sure, these are the numbers we have from a production device.
> >
> > They are captured via perf via PERF_COUNT_HW_CPU_CYCLES.
> > It's running a 20K loop emitting 4 bytes of data out.
> > Each 4 byte event time is recorded via perf.
> > At the end we have the total time and the max seen.
> >
> > null numbers represent a 20K loop with just perf start/stop ioctl costs.
> >
> > null: min=2863, avg=2953, max=30815
> > uprobe: min=10994, avg=11376, max=146682
> > uevent: min=7043, avg=7320, max=95396
> > lttng: min=6270, avg=6508, max=41951
> >
> > These costs include the data getting into a buffer, so they represent
> > what we would see in production vs the trap cost alone. For uprobe this
> > means we created a uprobe and attached it via tracefs to get the above
> > numbers.
>
> [...]
>
> I assume here that by "lttng" you specifically refer to lttng-ust (LTTng's
> user-space tracer), am I correct ?
>
Yes, this is ust.
> By removing the "null" baseline overhead, my rough calculations are that the
> average overhead for lttng-ust in your results is (in cpu cycles):
>
> 6508 - 2953 = 3555
>
> So I'm unsure what is the frequency of your CPU, but guessing around 3.5GHz
> this is in the area of 1 microsecond. On an Intel CPU, this is much larger
> than what I would expect.
>
> Can you share your test program, hardware characteristics, kernel version,
> glibc version, and whether the program is compiled as a 32-bit or 64-bit
> binary ?
>
This is how we are measuring:
/* perf_event_open() has no glibc wrapper, so go through syscall(2) */
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/* probe() is the operation under test (writev, lttng tracepoint, etc.) */
extern void probe(void);

#define PERF_START() \
do \
{ \
	ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0); \
	ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); \
} while (0)

#define PERF_END(__cycles) \
do \
{ \
	ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); \
	read(perf_fd, &__cycles, sizeof(__cycles)); \
} while (0)

int main(void)
{
	struct perf_event_attr pe;
	long long min, max, total;
	int i, perf_fd;

	memset(&pe, 0, sizeof(pe));
	pe.type = PERF_TYPE_HARDWARE;
	pe.size = sizeof(pe);
	pe.config = PERF_COUNT_HW_CPU_CYCLES;
	pe.disabled = 1;
	pe.exclude_hv = 1;

	perf_fd = perf_event_open(&pe, 0, -1, -1, 0);

	min = max = total = 0;

	for (i = 0; i < 20000; ++i)
	{
		long long cycles;

		PERF_START();
		probe();
		PERF_END(cycles);

		if (i == 0 || cycles < min)
			min = cycles;

		if (cycles > max)
			max = cycles;

		total += cycles;
	}

	printf("min=%lld, avg=%lld, max=%lld\n", min, total / 20000, max);
	return 0;
}
probe() here could be a call to writev or to the lttng trace call.
> Can you confirm that lttng-ust is not calling one getcpu system call per
> event ? This might be the case if you run a 32-bit x86 binary and have a
> glibc < 2.35, or if your kernel is too old to provide CONFIG_RSEQ or does
> not have CONFIG_RSEQ=y in its configuration. You can validate this by
> running your lttng-ust test program with a system call tracer.
>
We don't have CONFIG_RSEQ, so that is likely the cause. LTTng is always
going to be the fastest thing out there. It's pure user mode :)
> Thanks,
>
> Mathieu
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com
Thanks,
-Beau