Add a benchmark to compare performance of
1) uprobe;
2) user program w/o args;
3) user program w/ args;
4) user program w/ args on random cpu.
Sample output:
./test_progs -t uprobe_vs_user_prog -v
test_uprobe_vs_user_prog:PASS:uprobe_vs_user_prog__open_and_load 0 nsec
test_uprobe_vs_user_prog:PASS:get_base_addr 0 nsec
test_uprobe_vs_user_prog:PASS:attach_uprobe 0 nsec
run_perf_test:PASS:uprobe 0 nsec
Each uprobe uses 1419 nanoseconds
run_perf_test:PASS:user_prog_no_args 0 nsec
Each user_prog_no_args uses 313 nanoseconds
run_perf_test:PASS:user_prog_with_args 0 nsec
Each user_prog_with_args uses 335 nanoseconds
run_perf_test:PASS:user_prog_with_args_on_cpu 0 nsec
Each user_prog_with_args_on_cpu uses 2821 nanoseconds
Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
Signed-off-by: Song Liu <[email protected]>
---
.../bpf/prog_tests/uprobe_vs_user_prog.c | 101 ++++++++++++++++++
.../selftests/bpf/progs/uprobe_vs_user_prog.c | 21 ++++
2 files changed, 122 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_vs_user_prog.c
create mode 100644 tools/testing/selftests/bpf/progs/uprobe_vs_user_prog.c
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_vs_user_prog.c b/tools/testing/selftests/bpf/prog_tests/uprobe_vs_user_prog.c
new file mode 100644
index 0000000000000..dadd7b56e69ec
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_vs_user_prog.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "uprobe_vs_user_prog.skel.h"
+
+#define REPEAT_CNT 10000ULL
+
+static int duration;
+
+static noinline void uprobe_target(void)
+{
+ asm ("");
+}
+
+struct bpf_prog_test_run_attr attr;
+
+static void call_user_prog(void)
+{
+ bpf_prog_test_run_xattr(&attr);
+}
+
+static int numcpu;
+
+static void call_user_prog_on_cpu(void)
+{
+ static int cpu = 0;
+
+ attr.cpu_plus = cpu + 1;
+ bpf_prog_test_run_xattr(&attr);
+ cpu = (cpu + 1) % numcpu;
+}
+
+typedef void (__run_func)(void);
+
+static void run_perf_test(struct uprobe_vs_user_prog *skel,
+ __run_func func, const char *name)
+{
+ __u64 start_time, total_time;
+ int i;
+
+ skel->bss->sum = 0;
+
+ start_time = time_get_ns();
+ for (i = 0; i < REPEAT_CNT; i++)
+ func();
+ total_time = time_get_ns() - start_time;
+
+ CHECK(skel->bss->sum != REPEAT_CNT, name,
+ "missed %llu times\n", REPEAT_CNT - skel->bss->sum);
+ printf("Each %s uses %llu nanoseconds\n", name, total_time / REPEAT_CNT);
+}
+
+void test_uprobe_vs_user_prog(void)
+{
+ struct bpf_user_prog_args args = {};
+ struct uprobe_vs_user_prog *skel;
+ struct bpf_link *uprobe_link;
+ size_t uprobe_offset;
+ ssize_t base_addr;
+
+ skel = uprobe_vs_user_prog__open_and_load();
+
+ if (CHECK(!skel, "uprobe_vs_user_prog__open_and_load",
+ "skeleton open_and_laod failed\n"))
+ return;
+
+ base_addr = get_base_addr();
+ if (CHECK(base_addr < 0, "get_base_addr",
+ "failed to find base addr: %zd", base_addr))
+ return;
+ uprobe_offset = (size_t)&uprobe_target - base_addr;
+ uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe,
+ false /* retprobe */,
+ 0 /* self pid */,
+ "/proc/self/exe",
+ uprobe_offset);
+
+ if (CHECK(IS_ERR(uprobe_link), "attach_uprobe",
+ "err %ld\n", PTR_ERR(uprobe_link)))
+ goto cleanup;
+ skel->links.handle_uprobe = uprobe_link;
+
+ run_perf_test(skel, uprobe_target, "uprobe");
+
+ attr.prog_fd = bpf_program__fd(skel->progs.user_prog);
+ run_perf_test(skel, call_user_prog, "user_prog_no_args");
+
+ attr.data_size_in = sizeof(args);
+ attr.data_in = &args;
+ run_perf_test(skel, call_user_prog, "user_prog_with_args");
+
+ numcpu = libbpf_num_possible_cpus();
+
+ if (numcpu <= 0)
+ goto cleanup;
+
+ run_perf_test(skel, call_user_prog_on_cpu,
+ "user_prog_with_args_on_cpu");
+
+cleanup:
+ uprobe_vs_user_prog__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/uprobe_vs_user_prog.c b/tools/testing/selftests/bpf/progs/uprobe_vs_user_prog.c
new file mode 100644
index 0000000000000..8b327b7cee30d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_vs_user_prog.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017 Facebook
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+volatile __u64 sum = 0;
+
+SEC("uprobe/func")
+int handle_uprobe(struct pt_regs *ctx)
+{
+ sum++;
+ return 0;
+}
+
+SEC("user")
+int user_prog(struct pt_regs *ctx)
+{
+ sum++;
+ return 0;
+}
--
2.24.1
On Sat, Aug 1, 2020 at 1:50 AM Song Liu <[email protected]> wrote:
>
> Add a benchmark to compare performance of
> 1) uprobe;
> 2) user program w/o args;
> 3) user program w/ args;
> 4) user program w/ args on random cpu.
>
Can you please add it to the existing benchmark runner instead, e.g.,
along the other bench_trigger benchmarks? No need to re-implement
benchmark setup. And also that would also allow to compare existing
ways of cheaply triggering a program vs this new _USER program?
If the performance is not significantly better than other ways, do you
think it still makes sense to add a new BPF program type? I think
triggering KPROBE/TRACEPOINT from bpf_prog_test_run() would be very
nice, maybe it's possible to add that instead of a new program type?
Either way, let's see comparison with other program triggering
mechanisms first.
> Sample output:
>
> ./test_progs -t uprobe_vs_user_prog -v
> test_uprobe_vs_user_prog:PASS:uprobe_vs_user_prog__open_and_load 0 nsec
> test_uprobe_vs_user_prog:PASS:get_base_addr 0 nsec
> test_uprobe_vs_user_prog:PASS:attach_uprobe 0 nsec
> run_perf_test:PASS:uprobe 0 nsec
> Each uprobe uses 1419 nanoseconds
> run_perf_test:PASS:user_prog_no_args 0 nsec
> Each user_prog_no_args uses 313 nanoseconds
> run_perf_test:PASS:user_prog_with_args 0 nsec
> Each user_prog_with_args uses 335 nanoseconds
> run_perf_test:PASS:user_prog_with_args_on_cpu 0 nsec
> Each user_prog_with_args_on_cpu uses 2821 nanoseconds
> Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
>
> Signed-off-by: Song Liu <[email protected]>
> ---
> .../bpf/prog_tests/uprobe_vs_user_prog.c | 101 ++++++++++++++++++
> .../selftests/bpf/progs/uprobe_vs_user_prog.c | 21 ++++
> 2 files changed, 122 insertions(+)
> create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_vs_user_prog.c
> create mode 100644 tools/testing/selftests/bpf/progs/uprobe_vs_user_prog.c
>
[...]
> On Aug 2, 2020, at 6:51 PM, Andrii Nakryiko <[email protected]> wrote:
>
> On Sat, Aug 1, 2020 at 1:50 AM Song Liu <[email protected]> wrote:
>>
>> Add a benchmark to compare performance of
>> 1) uprobe;
>> 2) user program w/o args;
>> 3) user program w/ args;
>> 4) user program w/ args on random cpu.
>>
>
> Can you please add it to the existing benchmark runner instead, e.g.,
> along the other bench_trigger benchmarks? No need to re-implement
> benchmark setup. And also that would also allow to compare existing
> ways of cheaply triggering a program vs this new _USER program?
Will try.
>
> If the performance is not significantly better than other ways, do you
> think it still makes sense to add a new BPF program type? I think
> triggering KPROBE/TRACEPOINT from bpf_prog_test_run() would be very
> nice, maybe it's possible to add that instead of a new program type?
> Either way, let's see comparison with other program triggering
> mechanisms first.
Triggering KPROBE and TRACEPOINT from bpf_prog_test_run() will be useful.
But I don't think they can be used instead of user program, for a couple
reasons. First, KPROBE/TRACEPOINT may be triggered by other programs
running in the system, so user will have to filter those noise out in
each program. Second, it is not easy to specify CPU for KPROBE/TRACEPOINT,
while this feature could be useful in many cases, e.g. get stack trace
on a given CPU.
Thanks,
Song
On Sun, Aug 2, 2020 at 9:47 PM Song Liu <[email protected]> wrote:
>
>
> > On Aug 2, 2020, at 6:51 PM, Andrii Nakryiko <[email protected]> wrote:
> >
> > On Sat, Aug 1, 2020 at 1:50 AM Song Liu <[email protected]> wrote:
> >>
> >> Add a benchmark to compare performance of
> >> 1) uprobe;
> >> 2) user program w/o args;
> >> 3) user program w/ args;
> >> 4) user program w/ args on random cpu.
> >>
> >
> > Can you please add it to the existing benchmark runner instead, e.g.,
> > along the other bench_trigger benchmarks? No need to re-implement
> > benchmark setup. And also that would also allow to compare existing
> > ways of cheaply triggering a program vs this new _USER program?
>
> Will try.
>
> >
> > If the performance is not significantly better than other ways, do you
> > think it still makes sense to add a new BPF program type? I think
> > triggering KPROBE/TRACEPOINT from bpf_prog_test_run() would be very
> > nice, maybe it's possible to add that instead of a new program type?
> > Either way, let's see comparison with other program triggering
> > mechanisms first.
>
> Triggering KPROBE and TRACEPOINT from bpf_prog_test_run() will be useful.
> But I don't think they can be used instead of user program, for a couple
> reasons. First, KPROBE/TRACEPOINT may be triggered by other programs
> running in the system, so user will have to filter those noise out in
> each program. Second, it is not easy to specify CPU for KPROBE/TRACEPOINT,
> while this feature could be useful in many cases, e.g. get stack trace
> on a given CPU.
>
Right, it's not as convenient with KPROBE/TRACEPOINT as with the USER
program you've added specifically with that feature in mind. But if
you pin user-space thread on the needed CPU and trigger kprobe/tp,
then you'll get what you want. As for the "noise", see how
bench_trigger() deals with that: it records thread ID and filters
everything not matching. You can do the same with CPU ID. It's not as
automatic as with a special BPF program type, but still pretty simple,
which is why I'm still deciding (for myself) whether USER program type
is necessary :)
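Roughly something like this, just to illustrate the filtering idea (untested
sketch; my_tid/my_cpu are made-up knobs, this is not literally what
trigger_bench.c does):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

const volatile __u32 my_tid;	/* filled in from user space before load */
const volatile __u32 my_cpu;

long hits = 0;

SEC("kprobe/__x64_sys_getpgid")
int bench_trigger_filtered(void *ctx)
{
	/* drop the "noise": hits from other threads and other CPUs */
	if ((__u32)bpf_get_current_pid_tgid() != my_tid)
		return 0;
	if (bpf_get_smp_processor_id() != my_cpu)
		return 0;
	__sync_add_and_fetch(&hits, 1);
	return 0;
}

char _license[] SEC("license") = "GPL";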
> Thanks,
> Song
> On Aug 2, 2020, at 10:10 PM, Andrii Nakryiko <[email protected]> wrote:
>
> On Sun, Aug 2, 2020 at 9:47 PM Song Liu <[email protected]> wrote:
>>
>>
>>> On Aug 2, 2020, at 6:51 PM, Andrii Nakryiko <[email protected]> wrote:
>>>
>>> On Sat, Aug 1, 2020 at 1:50 AM Song Liu <[email protected]> wrote:
>>>>
>>>> Add a benchmark to compare performance of
>>>> 1) uprobe;
>>>> 2) user program w/o args;
>>>> 3) user program w/ args;
>>>> 4) user program w/ args on random cpu.
>>>>
>>>
>>> Can you please add it to the existing benchmark runner instead, e.g.,
>>> along the other bench_trigger benchmarks? No need to re-implement
>>> benchmark setup. And also that would also allow to compare existing
>>> ways of cheaply triggering a program vs this new _USER program?
>>
>> Will try.
>>
>>>
>>> If the performance is not significantly better than other ways, do you
>>> think it still makes sense to add a new BPF program type? I think
>>> triggering KPROBE/TRACEPOINT from bpf_prog_test_run() would be very
>>> nice, maybe it's possible to add that instead of a new program type?
>>> Either way, let's see comparison with other program triggering
>>> mechanisms first.
>>
>> Triggering KPROBE and TRACEPOINT from bpf_prog_test_run() will be useful.
>> But I don't think they can be used instead of user program, for a couple
>> reasons. First, KPROBE/TRACEPOINT may be triggered by other programs
>> running in the system, so user will have to filter those noise out in
>> each program. Second, it is not easy to specify CPU for KPROBE/TRACEPOINT,
>> while this feature could be useful in many cases, e.g. get stack trace
>> on a given CPU.
>>
>
> Right, it's not as convenient with KPROBE/TRACEPOINT as with the USER
> program you've added specifically with that feature in mind. But if
> you pin user-space thread on the needed CPU and trigger kprobe/tp,
> then you'll get what you want. As for the "noise", see how
> bench_trigger() deals with that: it records thread ID and filters
> everything not matching. You can do the same with CPU ID. It's not as
> automatic as with a special BPF program type, but still pretty simple,
> which is why I'm still deciding (for myself) whether USER program type
> is necessary :)
Here are some bench_trigger numbers:
base : 1.698 ± 0.001M/s
tp : 1.477 ± 0.001M/s
rawtp : 1.567 ± 0.001M/s
kprobe : 1.431 ± 0.000M/s
fentry : 1.691 ± 0.000M/s
fmodret : 1.654 ± 0.000M/s
user : 1.253 ± 0.000M/s
fentry-on-cpu: 0.022 ± 0.011M/s
user-on-cpu: 0.315 ± 0.001M/s
The two "on-cpu" tests run the program on a different CPU (see the patch
at the end).
"user" is about 25% slower than "fentry". I think this is mostly because
getpgid() is a faster syscall than bpf(BPF_TEST_RUN).
"user-on-cpu" is more than 10x faster than "fentry-on-cpu", because IPI
is way faster than moving the process (via sched_setaffinity).
For use cases that we would like to call BPF program on specific CPU,
triggering it via IPI is a lot faster.
Thanks,
Song
========================== 8< ==========================
diff --git c/tools/testing/selftests/bpf/bench.c w/tools/testing/selftests/bpf/bench.c
index 944ad4721c83c..5394a1d2dfd21 100644
--- c/tools/testing/selftests/bpf/bench.c
+++ w/tools/testing/selftests/bpf/bench.c
@@ -317,7 +317,10 @@ extern const struct bench bench_trig_tp;
extern const struct bench bench_trig_rawtp;
extern const struct bench bench_trig_kprobe;
extern const struct bench bench_trig_fentry;
+extern const struct bench bench_trig_fentry_on_cpu;
extern const struct bench bench_trig_fmodret;
+extern const struct bench bench_trig_user;
+extern const struct bench bench_trig_user_on_cpu;
extern const struct bench bench_rb_libbpf;
extern const struct bench bench_rb_custom;
extern const struct bench bench_pb_libbpf;
@@ -338,7 +341,10 @@ static const struct bench *benchs[] = {
&bench_trig_rawtp,
&bench_trig_kprobe,
&bench_trig_fentry,
+ &bench_trig_fentry_on_cpu,
&bench_trig_fmodret,
+ &bench_trig_user,
+ &bench_trig_user_on_cpu,
&bench_rb_libbpf,
&bench_rb_custom,
&bench_pb_libbpf,
@@ -462,4 +468,3 @@ int main(int argc, char **argv)
return 0;
}
-
diff --git c/tools/testing/selftests/bpf/benchs/bench_trigger.c w/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 49c22832f2169..a1ebaebf6070c 100644
--- c/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ w/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <sched.h>
#include "bench.h"
#include "trigger_bench.skel.h"
@@ -39,6 +41,22 @@ static void *trigger_producer(void *input)
return NULL;
}
+static void *trigger_on_cpu_producer(void *input)
+{
+ cpu_set_t set;
+ int i = 0, nr_cpu;
+
+ nr_cpu = libbpf_num_possible_cpus();
+ while (true) {
+ CPU_ZERO(&set);
+ CPU_SET(i, &set);
+ sched_setaffinity(0, sizeof(set), &set);
+ (void)syscall(__NR_getpgid);
+ i = (i + 1) % nr_cpu;
+ }
+ return NULL;
+}
+
static void trigger_measure(struct bench_res *res)
{
res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
@@ -96,6 +114,39 @@ static void trigger_fmodret_setup()
attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
}
+static void trigger_user_setup()
+{
+ setup_ctx();
+}
+
+static void *trigger_producer_user(void *input)
+{
+ struct bpf_prog_test_run_attr attr = {};
+
+ attr.prog_fd = bpf_program__fd(ctx.skel->progs.bench_trigger_user);
+
+ while (true)
+ (void)bpf_prog_test_run_xattr(&attr);
+ return NULL;
+}
+
+static void *trigger_producer_user_on_cpu(void *input)
+{
+ struct bpf_prog_test_run_attr attr = {};
+ int i = 0, nr_cpu;
+
+ nr_cpu = libbpf_num_possible_cpus();
+
+ attr.prog_fd = bpf_program__fd(ctx.skel->progs.bench_trigger_user);
+
+ while (true) {
+ attr.cpu_plus = i + 1;
+ (void)bpf_prog_test_run_xattr(&attr);
+ i = (i + 1) % nr_cpu;
+ }
+ return NULL;
+}
+
static void *trigger_consumer(void *input)
{
return NULL;
@@ -155,6 +206,17 @@ const struct bench bench_trig_fentry = {
.report_final = hits_drops_report_final,
};
+const struct bench bench_trig_fentry_on_cpu = {
+ .name = "trig-fentry-on-cpu",
+ .validate = trigger_validate,
+ .setup = trigger_fentry_setup,
+ .producer_thread = trigger_on_cpu_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
const struct bench bench_trig_fmodret = {
.name = "trig-fmodret",
.validate = trigger_validate,
@@ -165,3 +227,25 @@ const struct bench bench_trig_fmodret = {
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
+
+const struct bench bench_trig_user = {
+ .name = "trig-user",
+ .validate = trigger_validate,
+ .setup = trigger_user_setup,
+ .producer_thread = trigger_producer_user,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_user_on_cpu = {
+ .name = "trig-user-on-cpu",
+ .validate = trigger_validate,
+ .setup = trigger_user_setup,
+ .producer_thread = trigger_producer_user_on_cpu,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
diff --git c/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh w/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
index 78e83f2432946..f10b7aea76aa3 100755
--- c/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
+++ w/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -2,7 +2,7 @@
set -eufo pipefail
-for i in base tp rawtp kprobe fentry fmodret
+for i in base tp rawtp kprobe fentry fmodret user fentry-on-cpu user-on-cpu
do
summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
printf "%-10s: %s\n" $i "$summary"
diff --git c/tools/testing/selftests/bpf/progs/trigger_bench.c w/tools/testing/selftests/bpf/progs/trigger_bench.c
index 8b36b6640e7e9..a6ac11e68d287 100644
--- c/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ w/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -45,3 +45,10 @@ int bench_trigger_fmodret(void *ctx)
__sync_add_and_fetch(&hits, 1);
return -22;
}
+
+SEC("user")
+int BPF_PROG(bench_trigger_user)
+{
+ __sync_add_and_fetch(&hits, 1);
+ return 0;
+}
On Tue, Aug 4, 2020 at 2:01 PM Song Liu <[email protected]> wrote:
[...]
>
> Here are some bench_trigger numbers:
>
> base : 1.698 ± 0.001M/s
> tp : 1.477 ± 0.001M/s
> rawtp : 1.567 ± 0.001M/s
> kprobe : 1.431 ± 0.000M/s
> fentry : 1.691 ± 0.000M/s
> fmodret : 1.654 ± 0.000M/s
> user : 1.253 ± 0.000M/s
> fentry-on-cpu: 0.022 ± 0.011M/s
> user-on-cpu: 0.315 ± 0.001M/s
>
Ok, so basically all of raw_tp,tp,kprobe,fentry/fexit are
significantly faster than USER programs. Sure, when compared to
uprobe, they are faster, but not when doing on-specific-CPU run, it
seems (judging from this patch's description, if I'm reading it
right). Anyways, speed argument shouldn't be a reason for doing this,
IMO.
> The two "on-cpu" tests run the program on a different CPU (see the patch
> at the end).
>
> "user" is about 25% slower than "fentry". I think this is mostly because
> getpgid() is a faster syscall than bpf(BPF_TEST_RUN).
Yes, probably.
>
> "user-on-cpu" is more than 10x faster than "fentry-on-cpu", because IPI
> is way faster than moving the process (via sched_setaffinity).
I don't think that's a good comparison, because you are actually
testing sched_setaffinity performance on each iteration vs IPI in the
kernel, not a BPF overhead.
I think the fair comparison for this would be to create a thread and
pin it on necessary CPU, and only then BPF program calls in a loop.
But I bet any of existing program types would beat USER program.
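I.e., a producer more like this (untested sketch, relies on the same
_GNU_SOURCE/<sched.h> setup your patch already adds; target CPU hard-coded
just for illustration):

static void *trigger_pinned_producer(void *input)
{
	cpu_set_t set;

	/* pin once, outside the measured loop */
	CPU_ZERO(&set);
	CPU_SET(0, &set);
	sched_setaffinity(0, sizeof(set), &set);

	/* now only the getpgid() -> BPF prog cost is measured */
	while (true)
		(void)syscall(__NR_getpgid);
	return NULL;
}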
>
> For use cases that we would like to call BPF program on specific CPU,
> triggering it via IPI is a lot faster.
So these use cases would be nice to expand on in the motivational part
of the patch set. It's not really emphasized and it's not at all clear
what you are trying to achieve. It also seems, depending on latency
requirements, it's totally possible to achieve comparable results by
pre-creating a thread for each CPU, pinning each one to its designated
CPU and then using any suitable user-space signaling mechanism (a
queue, condvar, etc) to ask a thread to trigger BPF program (fentry on
getpgid(), for instance). I bet in this case the performance would be
really nice for a lot of practical use cases. But then again, I don't
know details of the intended use case, so please provide some more
details.
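To sketch what I have in mind (all names made up, init/teardown and error
handling omitted): one thread per CPU, pinned once at startup and parked on
a condvar, with a plain fentry program on getpgid():

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <unistd.h>

struct cpu_worker {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int cpu;
	bool kick;
};

static void *worker_fn(void *arg)
{
	struct cpu_worker *w = arg;
	cpu_set_t set;

	/* pin this worker to its designated CPU once */
	CPU_ZERO(&set);
	CPU_SET(w->cpu, &set);
	pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

	while (true) {
		pthread_mutex_lock(&w->lock);
		while (!w->kick)
			pthread_cond_wait(&w->cond, &w->lock);
		w->kick = false;
		pthread_mutex_unlock(&w->lock);
		/* runs on w->cpu; the fentry prog on getpgid() fires here */
		(void)syscall(__NR_getpgid);
	}
	return NULL;
}

/* ask the worker pinned to the desired CPU to trigger the program */
static void kick_cpu(struct cpu_worker *w)
{
	pthread_mutex_lock(&w->lock);
	w->kick = true;
	pthread_cond_signal(&w->cond);
	pthread_mutex_unlock(&w->lock);
}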
>
> Thanks,
> Song
[...]
> On Aug 4, 2020, at 6:52 PM, Andrii Nakryiko <[email protected]> wrote:
[...]
>> Here are some bench_trigger numbers:
>>
>> base : 1.698 ± 0.001M/s
>> tp : 1.477 ± 0.001M/s
>> rawtp : 1.567 ± 0.001M/s
>> kprobe : 1.431 ± 0.000M/s
>> fentry : 1.691 ± 0.000M/s
>> fmodret : 1.654 ± 0.000M/s
>> user : 1.253 ± 0.000M/s
>> fentry-on-cpu: 0.022 ± 0.011M/s
>> user-on-cpu: 0.315 ± 0.001M/s
>>
>
> Ok, so basically all of raw_tp,tp,kprobe,fentry/fexit are
> significantly faster than USER programs. Sure, when compared to
> uprobe, they are faster, but not when doing on-specific-CPU run, it
> seems (judging from this patch's description, if I'm reading it
> right). Anyways, speed argument shouldn't be a reason for doing this,
> IMO.
>
>> The two "on-cpu" tests run the program on a different CPU (see the patch
>> at the end).
>>
>> "user" is about 25% slower than "fentry". I think this is mostly because
>> getpgid() is a faster syscall than bpf(BPF_TEST_RUN).
>
> Yes, probably.
>
>>
>> "user-on-cpu" is more than 10x faster than "fentry-on-cpu", because IPI
>> is way faster than moving the process (via sched_setaffinity).
>
> I don't think that's a good comparison, because you are actually
> testing sched_setaffinity performance on each iteration vs IPI in the
> kernel, not a BPF overhead.
>
> I think the fair comparison for this would be to create a thread and
> pin it on necessary CPU, and only then BPF program calls in a loop.
> But I bet any of existing program types would beat USER program.
>
>>
>> For use cases that we would like to call BPF program on specific CPU,
>> triggering it via IPI is a lot faster.
>
> So these use cases would be nice to expand on in the motivational part
> of the patch set. It's not really emphasized and it's not at all clear
> what you are trying to achieve. It also seems, depending on latency
> requirements, it's totally possible to achieve comparable results by
> pre-creating a thread for each CPU, pinning each one to its designated
> CPU and then using any suitable user-space signaling mechanism (a
> queue, condvar, etc) to ask a thread to trigger BPF program (fentry on
> getpgid(), for instance).
I don't see why user space signal plus fentry would be faster than IPI.
If the target cpu is running something, this gonna add two context
switches.
> I bet in this case the performance would be
> really nice for a lot of practical use cases. But then again, I don't
> know details of the intended use case, so please provide some more
> details.
Being able to trigger BPF program on a different CPU could enable many
use cases and optimizations. The use case I am looking at is to access
perf_event and percpu maps on the target CPU. For example:
0. trigger the program
1. read perf_event on cpu x;
2. (optional) check which process is running on cpu x;
3. add perf_event value to percpu map(s) on cpu x.
If we do these steps in a BPF program on cpu x, the cost is:
A.0) trigger BPF via IPI;
A.1) read perf_event locally;
A.2) local access current;
A.3) local access of percpu map(s).
If we can only do these on a different CPU, the cost will be:
B.0) trigger BPF locally;
B.1) read perf_event via IPI;
B.2) remote access current on cpu x;
B.3) remote access percpu map(s), or use non-percpu map(2).
Cost of (A.0 + A.1) is about same as (B.0 + B.1), maybe a little higher
(sys_bpf(), vs. sys_getpgid()). But A.2 and A.3 will be significantly
cheaper than B.2 and B.3.
Does this make sense?
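To make A concrete, the program triggered on cpu x would look roughly like
this (illustrative sketch only; map and field names are made up, and
SEC("user") is the new program type from this series):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} events SEC(".maps");		/* one perf event fd per CPU */

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} accum SEC(".maps");

SEC("user")
int collect_on_this_cpu(struct pt_regs *ctx)
{
	struct bpf_perf_event_value val = {};
	__u32 key = 0;
	__u64 *sum;

	/* A.1: read the perf event of the CPU we were IPI'd to */
	if (bpf_perf_event_read_value(&events, BPF_F_CURRENT_CPU,
				      &val, sizeof(val)))
		return 0;
	/* A.2: current is whatever runs on this CPU; e.g. skip idle */
	if (!(bpf_get_current_pid_tgid() >> 32))
		return 0;
	/* A.3: purely local update of the per-cpu accumulator */
	sum = bpf_map_lookup_elem(&accum, &key);
	if (sum)
		*sum += val.counter;
	return 0;
}

char _license[] SEC("license") = "GPL";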
OTOH, I do agree we can trigger bpftrace BEGIN/END with sys_getpgid()
or something similar.
Thanks,
Song
On Tue, Aug 4, 2020 at 9:47 PM Song Liu <[email protected]> wrote:
[...]
> >>
> >> For use cases that we would like to call BPF program on specific CPU,
> >> triggering it via IPI is a lot faster.
> >
> > So these use cases would be nice to expand on in the motivational part
> > of the patch set. It's not really emphasized and it's not at all clear
> > what you are trying to achieve. It also seems, depending on latency
> > requirements, it's totally possible to achieve comparable results by
> > pre-creating a thread for each CPU, pinning each one to its designated
> > CPU and then using any suitable user-space signaling mechanism (a
> > queue, condvar, etc) to ask a thread to trigger BPF program (fentry on
> > getpgid(), for instance).
>
> I don't see why user space signal plus fentry would be faster than IPI.
> If the target cpu is running something, this gonna add two context
> switches.
>
I didn't say faster, did I? I said it would be comparable and wouldn't
require a new program type. But then again, without knowing all the
details, it's a bit hard to discuss this. E.g., if you need to trigger
that BPF program periodically, you can sleep in those per-CPU threads,
or epoll, or whatever. Or maybe you can set up a per-CPU perf event
that would trigger your program on the desired CPU, etc. My point is
that I and others shouldn't be guessing this, I'd expect someone who's
proposing an entire new BPF program type to motivate why this new
program type is necessary and what problem it's solving that can't be
solved with existing means.
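For the per-CPU perf event variant, roughly (error handling omitted;
prog_fd would be a plain PERF_EVENT-type program, and sample_period is
whatever trigger rate you need):

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_prog_on_cpu(int prog_fd, int cpu)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CPU_CLOCK,
		.size = sizeof(attr),
		.sample_period = 100000,
	};
	int pfd;

	/* pid == -1, cpu >= 0: count on that CPU regardless of task */
	pfd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
	if (pfd < 0)
		return -1;
	if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) ||
	    ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0)) {
		close(pfd);
		return -1;
	}
	return pfd;
}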
BTW, how frequently do you need to trigger the BPF program? Seems very
frequently, if 2 vs 1 context switches might be a problem?
> > I bet in this case the performance would be
> > really nice for a lot of practical use cases. But then again, I don't
> > know details of the intended use case, so please provide some more
> > details.
>
> Being able to trigger BPF program on a different CPU could enable many
> use cases and optimizations. The use case I am looking at is to access
> perf_event and percpu maps on the target CPU. For example:
> 0. trigger the program
> 1. read perf_event on cpu x;
> 2. (optional) check which process is running on cpu x;
> 3. add perf_event value to percpu map(s) on cpu x.
>
> If we do these steps in a BPF program on cpu x, the cost is:
> A.0) trigger BPF via IPI;
> A.1) read perf_event locally;
> A.2) local access current;
> A.3) local access of percpu map(s).
>
> If we can only do these on a different CPU, the cost will be:
> B.0) trigger BPF locally;
> B.1) read perf_event via IPI;
> B.2) remote access current on cpu x;
> B.3) remote access percpu map(s), or use non-percpu map(2).
>
> Cost of (A.0 + A.1) is about same as (B.0 + B.1), maybe a little higher
> (sys_bpf(), vs. sys_getpgid()). But A.2 and A.3 will be significantly
> cheaper than B.2 and B.3.
>
> Does this make sense?
It does, thanks. But what I was describing is still A, no? BPF program
will be triggered on your desired cpu X, wouldn't it?
>
>
> OTOH, I do agree we can trigger bpftrace BEGIN/END with sys_getpgid()
> or something similar.
Right.
>
> Thanks,
> Song
> On Aug 4, 2020, at 10:47 PM, Andrii Nakryiko <[email protected]> wrote:
[...]
>>
>> I don't see why user space signal plus fentry would be faster than IPI.
>> If the target cpu is running something, this gonna add two context
>> switches.
>>
>
> I didn't say faster, did I? I said it would be comparable and wouldn't
> require a new program type.
Well, I don't think adding program type is that big a deal. If that is
really a problem, we can use a new attach type instead. The goal is to
trigger it with sys_bpf() on a different cpu. So we can call it kprobe
attach to nothing and hack that way. I added the new type because it makes
sense. The user just wants to trigger a BPF program from user space.
> But then again, without knowing all the
> details, it's a bit hard to discuss this. E.g., if you need to trigger
> that BPF program periodically, you can sleep in those per-CPU threads,
> or epoll, or whatever. Or maybe you can set up a per-CPU perf event
> that would trigger your program on the desired CPU, etc. My point is
> that I and others shouldn't be guessing this, I'd expect someone who's
> proposing an entire new BPF program type to motivate why this new
> program type is necessary and what problem it's solving that can't be
> solved with existing means.
Yes, there are other options. But they all come with non-trivial cost.
Per-CPU-per-process threads and/or per-CPU perf event are cost we have
to pay in production. IMO, these cost are much higher than a new program
type (or attach type).
>
> BTW, how frequently do you need to trigger the BPF program? Seems very
> frequently, if 2 vs 1 context switches might be a problem?
The whole solution requires two BPF programs: one runs on each context
switch, the other is the user program. The user program will not be
triggered very often.
>
>>> I bet in this case the performance would be
>>> really nice for a lot of practical use cases. But then again, I don't
>>> know details of the intended use case, so please provide some more
>>> details.
>>
>> Being able to trigger BPF program on a different CPU could enable many
>> use cases and optimizations. The use case I am looking at is to access
>> perf_event and percpu maps on the target CPU. For example:
>> 0. trigger the program
>> 1. read perf_event on cpu x;
>> 2. (optional) check which process is running on cpu x;
>> 3. add perf_event value to percpu map(s) on cpu x.
>>
>> If we do these steps in a BPF program on cpu x, the cost is:
>> A.0) trigger BPF via IPI;
>> A.1) read perf_event locally;
>> A.2) local access current;
>> A.3) local access of percpu map(s).
>>
>> If we can only do these on a different CPU, the cost will be:
>> B.0) trigger BPF locally;
>> B.1) read perf_event via IPI;
>> B.2) remote access current on cpu x;
>> B.3) remote access percpu map(s), or use non-percpu map(2).
>>
>> Cost of (A.0 + A.1) is about same as (B.0 + B.1), maybe a little higher
>> (sys_bpf(), vs. sys_getpgid()). But A.2 and A.3 will be significantly
>> cheaper than B.2 and B.3.
>>
>> Does this make sense?
>
> It does, thanks. But what I was describing is still A, no? BPF program
> will be triggered on your desired cpu X, wouldn't it?
Well, that would be option C, but C could not do step 2, because we context
switch to the dedicated thread.
On Wed, Aug 05, 2020 at 04:47:30AM +0000, Song Liu wrote:
>
> Being able to trigger BPF program on a different CPU could enable many
> use cases and optimizations. The use case I am looking at is to access
> perf_event and percpu maps on the target CPU. For example:
> 0. trigger the program
> 1. read perf_event on cpu x;
> 2. (optional) check which process is running on cpu x;
> 3. add perf_event value to percpu map(s) on cpu x.
If the whole thing is about doing the above then I don't understand why new
prog type is needed. Can prog_test_run support existing BPF_PROG_TYPE_KPROBE?
"enable many use cases" sounds vague. I don't think folks reading
the patches can guess those "use cases".
"Testing existing kprobe bpf progs" would sound more convincing to me.
If the test_run framework can be extended to trigger kprobe with correct pt_regs.
As part of it test_run would trigger on a given cpu with $ip pointing
to some test function in test_run.c. For local test_run the stack trace
would include bpf syscall chain. For IPI the stack trace would include
the corresponding kernel pieces where top is our special test function.
Sort of like pseudo kprobe where there is no actual kprobe logic,
since kprobe prog doesn't care about mechanism. It needs correct
pt_regs only as input context.
The kprobe prog output (return value) has special meaning though,
so may be kprobe prog type is not a good fit.
Something like fentry/fexit may be better, since verifier check_return_code()
enforces 'return 0'. So their return value is effectively "void".
Then prog_test_run would need to gain an ability to trigger
fentry/fexit prog on a given cpu.
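Very rough sketch of the IPI part (not even compile-tested;
bpf_test_run_target() and the wrappers are made-up names,
smp_call_function_single() / bpf_prog_run() are the real primitives):

struct test_run_ctx {
	struct bpf_prog *prog;
	u32 retval;
};

static noinline void bpf_test_run_target(void)
{
	/* empty marker so $ip and stack traces point somewhere sane */
}

static void __test_run_one(void *data)
{
	struct test_run_ctx *c = data;
	struct pt_regs regs = {};

	regs.ip = (unsigned long)bpf_test_run_target;	/* x86 for brevity */
	rcu_read_lock();
	c->retval = bpf_prog_run(c->prog, &regs);
	rcu_read_unlock();
}

static int bpf_test_run_on_cpu(struct bpf_prog *prog, int cpu)
{
	struct test_run_ctx c = { .prog = prog };

	if (cpu < 0)
		__test_run_one(&c);	/* local: bpf syscall in the stack */
	else
		smp_call_function_single(cpu, __test_run_one, &c, 1);
	return c.retval;
}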
On Wed, Aug 5, 2020 at 10:16 AM Alexei Starovoitov
<[email protected]> wrote:
>
> On Wed, Aug 05, 2020 at 04:47:30AM +0000, Song Liu wrote:
> >
> > Being able to trigger BPF program on a different CPU could enable many
> > use cases and optimizations. The use case I am looking at is to access
> > perf_event and percpu maps on the target CPU. For example:
> > 0. trigger the program
> > 1. read perf_event on cpu x;
> > 2. (optional) check which process is running on cpu x;
> > 3. add perf_event value to percpu map(s) on cpu x.
>
> If the whole thing is about doing the above then I don't understand why new
> prog type is needed. Can prog_test_run support existing BPF_PROG_TYPE_KPROBE?
> "enable many use cases" sounds vague. I don't think folks reading
> the patches can guess those "use cases".
> "Testing existing kprobe bpf progs" would sound more convincing to me.
Was just about to propose the same :) I wonder if generic test_run()
capability to trigger test programs of whatever supported type on a
specified CPU through IPI can be added. That way you can even use the
XDP program to do what Song seems to need.
TRACEPOINTs might also be a good fit here, given it seems simpler to
let users specify custom tracepoint data for test_run(). Having the
ability to unit-test KPROBE and TRACEPOINT, however rudimentary, is
already a big win.
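Hypothetical usage, if test_run learned to accept a KPROBE prog with a
synthetic pt_regs as ctx_in (test_run does not take KPROBE progs today;
skel/handle_kprobe are made-up names):

struct pt_regs regs = {};
struct bpf_prog_test_run_attr tattr = {
	.prog_fd = bpf_program__fd(skel->progs.handle_kprobe),
	.ctx_in = &regs,
	.ctx_size_in = sizeof(regs),
};
err = bpf_prog_test_run_xattr(&tattr);
/* then assert on whatever state handle_kprobe updated */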
> If the test_run framework can be extended to trigger kprobe with correct pt_regs.
> As part of it test_run would trigger on a given cpu with $ip pointing
> to some test fuction in test_run.c. For local test_run the stack trace
> would include bpf syscall chain. For IPI the stack trace would include
> the corresponding kernel pieces where top is our special test function.
> Sort of like pseudo kprobe where there is no actual kprobe logic,
> since kprobe prog doesn't care about mechanism. It needs correct
> pt_regs only as input context.
> The kprobe prog output (return value) has special meaning though,
> so may be kprobe prog type is not a good fit.
It does? I don't remember returning 1 from KPROBE changing anything. I
thought it's only the special bpf_override_return() that can influence
the kernel function return result.
> Something like fentry/fexit may be better, since verifier check_return_code()
> enforces 'return 0'. So their return value is effectively "void".
> Then prog_test_run would need to gain an ability to trigger
> fentry/fexit prog on a given cpu.
On Wed, Aug 5, 2020 at 12:01 AM Song Liu <[email protected]> wrote:
[...]
> >> I don't see why user space signal plus fentry would be faster than IPI.
> >> If the target cpu is running something, this gonna add two context
> >> switches.
> >>
> >
> > I didn't say faster, did I? I said it would be comparable and wouldn't
> > require a new program type.
>
> Well, I don't think adding program type is that big a deal. If that is
> really a problem, we can use a new attach type instead. The goal is to
> trigger it with sys_bpf() on a different cpu. So we can call it kprobe
> attach to nothing and hack that way. I add the new type because it makes
> sense. The user just wants to trigger a BPF program from user space.
I thought we already concluded that it's not really "trigger a BPF
program from user space", because for that you have many existing and
even faster options. After a few rounds of emails, it seems it's more
about triggering the BPF program on another CPU without preempting
whatever is running on that CPU. It would be helpful to be clear and
upfront about the requirements.
>
> > But then again, without knowing all the
> > details, it's a bit hard to discuss this. E.g., if you need to trigger
> > that BPF program periodically, you can sleep in those per-CPU threads,
> > or epoll, or whatever. Or maybe you can set up a per-CPU perf event
> > that would trigger your program on the desired CPU, etc. My point is
> > that I and others shouldn't be guessing this, I'd expect someone who's
> > proposing an entire new BPF program type to motivate why this new
> > program type is necessary and what problem it's solving that can't be
> > solved with existing means.
>
> Yes, there are other options. But they all come with non-trivial cost.
> Per-CPU-per-process threads and/or per-CPU perf event are cost we have
> to pay in production. IMO, these cost are much higher than a new program
> type (or attach type).
>
So for threads I know the costs (a bit of memory for the thread stack,
plus some internal bookkeeping in the kernel), and it's arguable how big
a deal that is if those threads do pretty much nothing most of the time.
But what's the exact cost of perf events, and why is it unacceptably
high?
The reason I'm asking is that it seems to me that one alternative,
which is more generic (and thus potentially more useful) would be to
have a manually-triggerable perf event. Some sort of software event,
that's triggered from ioctl or some other syscall, that's appropriate
for perf subsystem. You'd pre-create a perf_event for each CPU,
remember their FDs, then would trigger the one you need (corresponding
to desired CPU). From the BPF side, you'd just use a normal perf_event
program to handle perf_event activation. But it could be used even
outside of the BPF ecosystem, which is a good sign for me, because it
allows more flexible composition of building blocks.
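Very roughly, something like this (sketch only; PERF_EVENT_IOC_TRIGGER is
made up -- that's the piece the perf subsystem would have to grow -- and it
assumes such a software event would accept PERF_EVENT_IOC_SET_BPF the way
sampling events do):

/* Hedged sketch of a "manually-triggerable perf event". The trigger
 * ioctl does not exist today; everything else uses existing interfaces.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_event_on_cpu(int cpu)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.size = sizeof(attr),
		.config = PERF_COUNT_SW_DUMMY,
	};

	/* pid == -1, cpu == cpu: one per-CPU event, not tied to a task */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

int setup(int *fds, int nr_cpus, int perf_event_prog_fd)
{
	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		fds[cpu] = open_event_on_cpu(cpu);
		if (fds[cpu] < 0)
			return -1;
		/* attach a normal BPF_PROG_TYPE_PERF_EVENT program */
		if (ioctl(fds[cpu], PERF_EVENT_IOC_SET_BPF, perf_event_prog_fd))
			return -1;
	}
	return 0;
}

void trigger_on_cpu(int *fds, int cpu)
{
	/* hypothetical: fire the event, and thus the BPF prog, on that CPU */
	ioctl(fds[cpu], PERF_EVENT_IOC_TRIGGER, 0);
}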
> >
> > BTW, how frequently do you need to trigger the BPF program? Seems very
> > frequently, if 2 vs 1 context switches might be a problem?
>
> The whole solution requires two BPF programs. One on each context switch,
> the other is the user program. The user program will not trigger very
> often.
Ok, so performance was never an objective, I wonder why it is put as
the main reason for this new type of BPF program?
>
> >
> >>> I bet in this case the performance would be
> >>> really nice for a lot of practical use cases. But then again, I don't
> >>> know details of the intended use case, so please provide some more
> >>> details.
> >>
> >> Being able to trigger BPF program on a different CPU could enable many
> >> use cases and optimizations. The use case I am looking at is to access
> >> perf_event and percpu maps on the target CPU. For example:
> >> 0. trigger the program
> >> 1. read perf_event on cpu x;
> >> 2. (optional) check which process is running on cpu x;
> >> 3. add perf_event value to percpu map(s) on cpu x.
> >>
> >> If we do these steps in a BPF program on cpu x, the cost is:
> >> A.0) trigger BPF via IPI;
> >> A.1) read perf_event locally;
> >> A.2) local access current;
> >> A.3) local access of percpu map(s).
> >>
> >> If we can only do these on a different CPU, the cost will be:
> >> B.0) trigger BPF locally;
> >> B.1) read perf_event via IPI;
> >> B.2) remote access current on cpu x;
> >> B.3) remote access percpu map(s), or use non-percpu map(s).
> >>
> >> Cost of (A.0 + A.1) is about same as (B.0 + B.1), maybe a little higher
> >> (sys_bpf(), vs. sys_getpgid()). But A.2 and A.3 will be significantly
> >> cheaper than B.2 and B.3.
> >>
> >> Does this make sense?
> >
> > It does, thanks. But what I was describing is still A, no? BPF program
> > will be triggered on your desired cpu X, wouldn't it?
>
> Well, that would be option C, but C could not do step 2, because we context
> switch to the dedicated thread.
>
So I think *this* is a real requirement. No preemption of the running
process on a different CPU. That does sound like what perf event does,
doesn't it? See above.
> On Aug 5, 2020, at 10:16 AM, Alexei Starovoitov <[email protected]> wrote:
>
> On Wed, Aug 05, 2020 at 04:47:30AM +0000, Song Liu wrote:
>>
>> Being able to trigger BPF program on a different CPU could enable many
>> use cases and optimizations. The use case I am looking at is to access
>> perf_event and percpu maps on the target CPU. For example:
>> 0. trigger the program
>> 1. read perf_event on cpu x;
>> 2. (optional) check which process is running on cpu x;
>> 3. add perf_event value to percpu map(s) on cpu x.
>
> If the whole thing is about doing the above then I don't understand why new
> prog type is needed.
I was under the (probably wrong) impression that adding prog type is not
that big a deal.
> Can prog_test_run support existing BPF_PROG_TYPE_KPROBE?
I haven't looked into all the details, but I bet this is possible.
> "enable many use cases" sounds vague. I don't think folks reading
> the patches can guess those "use cases".
> "Testing existing kprobe bpf progs" would sound more convincing to me.
> If the test_run framework can be extended to trigger kprobe with correct pt_regs.
> As part of it test_run would trigger on a given cpu with $ip pointing
> to some test function in test_run.c. For local test_run the stack trace
> would include bpf syscall chain. For IPI the stack trace would include
> the corresponding kernel pieces where top is our special test function.
> Sort of like pseudo kprobe where there is no actual kprobe logic,
> since kprobe prog doesn't care about mechanism. It needs correct
> pt_regs only as input context.
> The kprobe prog output (return value) has special meaning though,
> so may be kprobe prog type is not a good fit.
> Something like fentry/fexit may be better, since verifier check_return_code()
> enforces 'return 0'. So their return value is effectively "void".
> Then prog_test_run would need to gain an ability to trigger
> fentry/fexit prog on a given cpu.
Maybe we add a new attach type for BPF_PROG_TYPE_TRACING, in parallel
with BPF_TRACE_FENTRY and BPF_TRACE_FEXIT? Say BPF_TRACE_USER?
(Just realized I like this name :-D, it matches USDT...). Then we can
enable test_run for most (if not all) tracing programs, including
fentry/fexit.
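As a rough sketch of what I mean (nothing below exists today; the section
name and attach type are made up for illustration):

/* Hypothetical BPF_TRACE_USER program; SEC("tracing/user") is a made-up
 * section name, used here only to illustrate the idea. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tracing/user")
int handle_user_trigger(void *ctx)
{
	/* only runs when user space triggers it via BPF_PROG_TEST_RUN,
	 * optionally via IPI on a chosen CPU; no kernel attach point */
	return 0;
}

char _license[] SEC("license") = "GPL";

User space would then trigger it with a plain BPF_PROG_TEST_RUN, optionally
naming the target CPU.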
Does this sound like a good plan?
Thanks,
Song
On Wed, Aug 5, 2020 at 10:45 AM Alexei Starovoitov
<[email protected]> wrote:
>
> On Wed, Aug 05, 2020 at 10:27:28AM -0700, Andrii Nakryiko wrote:
> > On Wed, Aug 5, 2020 at 10:16 AM Alexei Starovoitov
> > <[email protected]> wrote:
> > >
> > > On Wed, Aug 05, 2020 at 04:47:30AM +0000, Song Liu wrote:
> > > >
> > > > Being able to trigger BPF program on a different CPU could enable many
> > > > use cases and optimizations. The use case I am looking at is to access
> > > > perf_event and percpu maps on the target CPU. For example:
> > > > 0. trigger the program
> > > > 1. read perf_event on cpu x;
> > > > 2. (optional) check which process is running on cpu x;
> > > > 3. add perf_event value to percpu map(s) on cpu x.
> > >
> > > If the whole thing is about doing the above then I don't understand why new
> > > prog type is needed. Can prog_test_run support existing BPF_PROG_TYPE_KPROBE?
> > > "enable many use cases" sounds vague. I don't think folks reading
> > > the patches can guess those "use cases".
> > > "Testing existing kprobe bpf progs" would sound more convincing to me.
> >
> > Was just about to propose the same :) I wonder if generic test_run()
> > capability to trigger test programs of whatever supported type on a
> > specified CPU through IPI can be added. That way you can even use the
> > XDP program to do what Song seems to need.
> >
> > TRACEPOINTs might also be a good fit here, given it seems simpler to
> > let users specify custom tracepoint data for test_run(). Having the
> > ability to unit-test KPROBE and TRACEPOINT, however rudimentary, is
> > already a big win.
> >
> > > If the test_run framework can be extended to trigger kprobe with correct pt_regs.
> > > As part of it test_run would trigger on a given cpu with $ip pointing
> > > to some test function in test_run.c. For local test_run the stack trace
> > > would include bpf syscall chain. For IPI the stack trace would include
> > > the corresponding kernel pieces where top is our special test function.
> > > Sort of like pseudo kprobe where there is no actual kprobe logic,
> > > since kprobe prog doesn't care about mechanism. It needs correct
> > > pt_regs only as input context.
> > > The kprobe prog output (return value) has special meaning though,
> > > so may be kprobe prog type is not a good fit.
> >
> > It does? I don't remember returning 1 from KPROBE changing anything. I
> > thought it's only the special bpf_override_return() that can influence
> > the kernel function return result.
>
> See comment in trace_call_bpf().
> And logic to handle it in kprobe_perf_func() for kprobes.
> and in perf_trace_run_bpf_submit() for tracepoints.
> It's historical and Song actually discovered an issue with such behavior.
> I don't remember whether we've concluded on the solution.
Oh, thanks for the pointers. I never realized there was more going on with
those. I guess return 1; is not advised then, as it causes extra
overhead.
On Wed, Aug 05, 2020 at 10:27:28AM -0700, Andrii Nakryiko wrote:
> On Wed, Aug 5, 2020 at 10:16 AM Alexei Starovoitov
> <[email protected]> wrote:
> >
> > On Wed, Aug 05, 2020 at 04:47:30AM +0000, Song Liu wrote:
> > >
> > > Being able to trigger BPF program on a different CPU could enable many
> > > use cases and optimizations. The use case I am looking at is to access
> > > perf_event and percpu maps on the target CPU. For example:
> > > 0. trigger the program
> > > 1. read perf_event on cpu x;
> > > 2. (optional) check which process is running on cpu x;
> > > 3. add perf_event value to percpu map(s) on cpu x.
> >
> > If the whole thing is about doing the above then I don't understand why new
> > prog type is needed. Can prog_test_run support existing BPF_PROG_TYPE_KPROBE?
> > "enable many use cases" sounds vague. I don't think folks reading
> > the patches can guess those "use cases".
> > "Testing existing kprobe bpf progs" would sound more convincing to me.
>
> Was just about to propose the same :) I wonder if generic test_run()
> capability to trigger test programs of whatever supported type on a
> specified CPU through IPI can be added. That way you can even use the
> XDP program to do what Song seems to need.
>
> TRACEPOINTs might also be a good fit here, given it seems simpler to
> let users specify custom tracepoint data for test_run(). Having the
> ability to unit-test KPROBE and TRACEPOINT, however rudimentary, is
> already a big win.
>
> > If the test_run framework can be extended to trigger kprobe with correct pt_regs.
> > As part of it test_run would trigger on a given cpu with $ip pointing
> > to some test function in test_run.c. For local test_run the stack trace
> > would include bpf syscall chain. For IPI the stack trace would include
> > the corresponding kernel pieces where top is our special test function.
> > Sort of like pseudo kprobe where there is no actual kprobe logic,
> > since kprobe prog doesn't care about mechanism. It needs correct
> > pt_regs only as input context.
> > The kprobe prog output (return value) has special meaning though,
> > so may be kprobe prog type is not a good fit.
>
> It does? I don't remember returning 1 from KPROBE changing anything. I
> thought it's only the special bpf_override_return() that can influence
> the kernel function return result.
See comment in trace_call_bpf().
And logic to handle it in kprobe_perf_func() for kprobes.
and in perf_trace_run_bpf_submit() for tracepoints.
It's historical and Song actually discovered an issue with such behavior.
I don't remember whether we've concluded on the solution.
> On Aug 5, 2020, at 10:39 AM, Andrii Nakryiko <[email protected]> wrote:
>
> On Wed, Aug 5, 2020 at 12:01 AM Song Liu <[email protected]> wrote:
>>
>>
>>
>>> On Aug 4, 2020, at 10:47 PM, Andrii Nakryiko <[email protected]> wrote:
>>>
>>> On Tue, Aug 4, 2020 at 9:47 PM Song Liu <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>>> On Aug 4, 2020, at 6:52 PM, Andrii Nakryiko <[email protected]> wrote:
>>>>>
>>>>> On Tue, Aug 4, 2020 at 2:01 PM Song Liu <[email protected]> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>> On Aug 2, 2020, at 10:10 PM, Andrii Nakryiko <[email protected]> wrote:
>>>>>>>
>>>>>>> On Sun, Aug 2, 2020 at 9:47 PM Song Liu <[email protected]> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> On Aug 2, 2020, at 6:51 PM, Andrii Nakryiko <[email protected]> wrote:
>>>>>>>>>
>>>>>>>>> On Sat, Aug 1, 2020 at 1:50 AM Song Liu <[email protected]> wrote:
>>>>>>>>>>
>>>>>>>>>> Add a benchmark to compare performance of
>>>>>>>>>> 1) uprobe;
>>>>>>>>>> 2) user program w/o args;
>>>>>>>>>> 3) user program w/ args;
>>>>>>>>>> 4) user program w/ args on random cpu.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Can you please add it to the existing benchmark runner instead, e.g.,
>>>>>>>>> along the other bench_trigger benchmarks? No need to re-implement
>>>>>>>>> benchmark setup. And also that would also allow to compare existing
>>>>>>>>> ways of cheaply triggering a program vs this new _USER program?
>>>>>>>>
>>>>>>>> Will try.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> If the performance is not significantly better than other ways, do you
>>>>>>>>> think it still makes sense to add a new BPF program type? I think
>>>>>>>>> triggering KPROBE/TRACEPOINT from bpf_prog_test_run() would be very
>>>>>>>>> nice, maybe it's possible to add that instead of a new program type?
>>>>>>>>> Either way, let's see comparison with other program triggering
>>>>>>>>> mechanisms first.
>>>>>>>>
>>>>>>>> Triggering KPROBE and TRACEPOINT from bpf_prog_test_run() will be useful.
>>>>>>>> But I don't think they can be used instead of user program, for a couple
>>>>>>>> reasons. First, KPROBE/TRACEPOINT may be triggered by other programs
>>>>>>>> running in the system, so user will have to filter those noise out in
>>>>>>>> each program. Second, it is not easy to specify CPU for KPROBE/TRACEPOINT,
>>>>>>>> while this feature could be useful in many cases, e.g. get stack trace
>>>>>>>> on a given CPU.
>>>>>>>>
>>>>>>>
>>>>>>> Right, it's not as convenient with KPROBE/TRACEPOINT as with the USER
>>>>>>> program you've added specifically with that feature in mind. But if
>>>>>>> you pin user-space thread on the needed CPU and trigger kprobe/tp,
>>>>>>> then you'll get what you want. As for the "noise", see how
>>>>>>> bench_trigger() deals with that: it records thread ID and filters
>>>>>>> everything not matching. You can do the same with CPU ID. It's not as
>>>>>>> automatic as with a special BPF program type, but still pretty simple,
>>>>>>> which is why I'm still deciding (for myself) whether USER program type
>>>>>>> is necessary :)
>>>>>>
>>>>>> Here are some bench_trigger numbers:
>>>>>>
>>>>>> base : 1.698 ± 0.001M/s
>>>>>> tp : 1.477 ± 0.001M/s
>>>>>> rawtp : 1.567 ± 0.001M/s
>>>>>> kprobe : 1.431 ± 0.000M/s
>>>>>> fentry : 1.691 ± 0.000M/s
>>>>>> fmodret : 1.654 ± 0.000M/s
>>>>>> user : 1.253 ± 0.000M/s
>>>>>> fentry-on-cpu: 0.022 ± 0.011M/s
>>>>>> user-on-cpu: 0.315 ± 0.001M/s
>>>>>>
>>>>>
>>>>> Ok, so basically all of raw_tp,tp,kprobe,fentry/fexit are
>>>>> significantly faster than USER programs. Sure, when compared to
>>>>> uprobe, they are faster, but not when doing on-specific-CPU run, it
>>>>> seems (judging from this patch's description, if I'm reading it
>>>>> right). Anyways, speed argument shouldn't be a reason for doing this,
>>>>> IMO.
>>>>>
>>>>>> The two "on-cpu" tests run the program on a different CPU (see the patch
>>>>>> at the end).
>>>>>>
>>>>>> "user" is about 25% slower than "fentry". I think this is mostly because
>>>>>> getpgid() is a faster syscall than bpf(BPF_TEST_RUN).
>>>>>
>>>>> Yes, probably.
>>>>>
>>>>>>
>>>>>> "user-on-cpu" is more than 10x faster than "fentry-on-cpu", because IPI
>>>>>> is way faster than moving the process (via sched_setaffinity).
>>>>>
>>>>> I don't think that's a good comparison, because you are actually
>>>>> testing sched_setaffinity performance on each iteration vs IPI in the
>>>>> kernel, not a BPF overhead.
>>>>>
>>>>> I think the fair comparison for this would be to create a thread and
>>>>> pin it on necessary CPU, and only then BPF program calls in a loop.
>>>>> But I bet any of existing program types would beat USER program.
>>>>>
>>>>>>
>>>>>> For use cases that we would like to call BPF program on specific CPU,
>>>>>> triggering it via IPI is a lot faster.
>>>>>
>>>>> So these use cases would be nice to expand on in the motivational part
>>>>> of the patch set. It's not really emphasized and it's not at all clear
>>>>> what you are trying to achieve. It also seems, depending on latency
>>>>> requirements, it's totally possible to achieve comparable results by
>>>>> pre-creating a thread for each CPU, pinning each one to its designated
>>>>> CPU and then using any suitable user-space signaling mechanism (a
>>>>> queue, condvar, etc) to ask a thread to trigger BPF program (fentry on
>>>>> getpgid(), for instance).
>>>>
>>>> I don't see why user space signal plus fentry would be faster than IPI.
>>>> If the target cpu is running something, this gonna add two context
>>>> switches.
>>>>
>>>
>>> I didn't say faster, did I? I said it would be comparable and wouldn't
>>> require a new program type.
>>
>> Well, I don't think adding program type is that big a deal. If that is
>> really a problem, we can use a new attach type instead. The goal is to
>> trigger it with sys_bpf() on a different cpu. So we can call it kprobe
>> attach to nothing and hack that way. I add the new type because it makes
>> sense. The user just wants to trigger a BPF program from user space.
>
> I thought we already concluded that it's not really "trigger a BPF
> program from user space", because for that you have many existing and
> even faster options. After a few rounds of emails, it seems it's more
> about triggering the BPF program on another CPU without preempting
> whatever is running on that CPU. It would be helpful to be clear and
> upfront about the requirements.
>
>>
>>> But then again, without knowing all the
>>> details, it's a bit hard to discuss this. E.g., if you need to trigger
>>> that BPF program periodically, you can sleep in those per-CPU threads,
>>> or epoll, or whatever. Or maybe you can set up a per-CPU perf event
>>> that would trigger your program on the desired CPU, etc. My point is
>>> that I and others shouldn't be guessing this, I'd expect someone who's
>>> proposing an entire new BPF program type to motivate why this new
>>> program type is necessary and what problem it's solving that can't be
>>> solved with existing means.
>>
>> Yes, there are other options. But they all come with non-trivial cost.
>> Per-CPU-per-process threads and/or per-CPU perf event are cost we have
>> to pay in production. IMO, these cost are much higher than a new program
>> type (or attach type).
>>
>
> So for threads I know the costs (a bit of memory for thread stack,
> plus some internal book keeping stuff in kernel), which is arguable
> how big of a deal is that if those threads do pretty much nothing most
> of the time. But what's the exact cost of perf events and why it's
> unacceptably high?
perf_events use a limited set of hardware counters. We had real production
issues in the past when we ran out of hardware counters.
>
> The reason I'm asking is that it seems to me that one alternative,
> which is more generic (and thus potentially more useful) would be to
> have a manually-triggerable perf event. Some sort of software event,
> that's triggered from ioctl or some other syscall, that's appropriate
> for perf subsystem. You'd pre-create a perf_event for each CPU,
> remember their FDs, then would trigger the one you need (corresponding
> to desired CPU). From the BPF side, you'd just use a normal perf_event
> program to handle perf_event activation. But it could be used even
> outside of the BPF ecosystem, which is a good sign for me, because it
> allows more flexible composition of building blocks.
This is an interesting idea. But I don't really see use cases outside of
the BPF ecosystem. It is not easy to pass extra arguments into the perf_event.
Well, we can probably pass the arguments via maps, but that is not ideal.
Also, each perf_event is 1kB+ of memory. For a system with 100 cores, the
perf_events may use 25x more memory (100kB) than the bpf program (4kB).
>
>>>
>>> BTW, how frequently do you need to trigger the BPF program? Seems very
>>> frequently, if 2 vs 1 context switches might be a problem?
>>
>> The whole solution requires two BPF programs. One on each context switch,
>> the other is the user program. The user program will not trigger very
>> often.
>
> Ok, so performance was never an objective, I wonder why it is put as
> the main reason for this new type of BPF program?
User program (or user-program-triggered BPF program) could also bring
free performance savings to USDT with semaphores. Basically, we can replace

	if (semaphore) {
		dummy_func(arg1, arg2, ...);
	}

with

	if (semaphore) {
		bpf_test_run(arg1, arg2, ...);
	}

The first one uses a uprobe, which is hundreds of nanoseconds slower than
the latter.
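Spelled out a bit more (sketch only; my_sema, struct my_args and
user_prog_fd are illustrative names), the gated call is just an ordinary
BPF_PROG_TEST_RUN:

/* Sketch of the semaphore-gated replacement. The semaphore would be
 * bumped by the tracer exactly like a USDT semaphore is today. */
#include <bpf/bpf.h>

volatile unsigned short my_sema;	/* illustrative semaphore symbol */
int user_prog_fd;			/* fd of the loaded user prog */

struct my_args {			/* layout agreed with the BPF prog */
	long arg1;
	long arg2;
};

static void probe_site(long arg1, long arg2)
{
	if (my_sema) {
		struct my_args args = { .arg1 = arg1, .arg2 = arg2 };
		struct bpf_prog_test_run_attr tattr = {
			.prog_fd = user_prog_fd,
			.data_in = &args,
			.data_size_in = sizeof(args),
		};

		bpf_prog_test_run_xattr(&tattr);
	}
}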
>
>>>
>>>>> I bet in this case the performance would be
>>>>> really nice for a lot of practical use cases. But then again, I don't
>>>>> know details of the intended use case, so please provide some more
>>>>> details.
>>>>
>>>> Being able to trigger BPF program on a different CPU could enable many
>>>> use cases and optimizations. The use case I am looking at is to access
>>>> perf_event and percpu maps on the target CPU. For example:
>>>> 0. trigger the program
>>>> 1. read perf_event on cpu x;
>>>> 2. (optional) check which process is running on cpu x;
>>>> 3. add perf_event value to percpu map(s) on cpu x.
>>>>
>>>> If we do these steps in a BPF program on cpu x, the cost is:
>>>> A.0) trigger BPF via IPI;
>>>> A.1) read perf_event locally;
>>>> A.2) local access current;
>>>> A.3) local access of percpu map(s).
>>>>
>>>> If we can only do these on a different CPU, the cost will be:
>>>> B.0) trigger BPF locally;
>>>> B.1) read perf_event via IPI;
>>>> B.2) remote access current on cpu x;
>>>> B.3) remote access percpu map(s), or use non-percpu map(s).
>>>>
>>>> Cost of (A.0 + A.1) is about same as (B.0 + B.1), maybe a little higher
>>>> (sys_bpf(), vs. sys_getpgid()). But A.2 and A.3 will be significantly
>>>> cheaper than B.2 and B.3.
>>>>
>>>> Does this make sense?
>>>
>>> It does, thanks. But what I was describing is still A, no? BPF program
>>> will be triggered on your desired cpu X, wouldn't it?
>>
>> Well, that would be option C, but C could not do step 2, because we context
>> switch to the dedicated thread.
>>
>
> So I think *this* is a real requirement. No preemption of the running
> process on a different CPU. That does sound like what perf event does,
> doesn't it? See above.
On Wed, Aug 05, 2020 at 06:56:26PM +0000, Song Liu wrote:
>
>
> > On Aug 5, 2020, at 10:16 AM, Alexei Starovoitov <[email protected]> wrote:
> >
> > On Wed, Aug 05, 2020 at 04:47:30AM +0000, Song Liu wrote:
> >>
> >> Being able to trigger BPF program on a different CPU could enable many
> >> use cases and optimizations. The use case I am looking at is to access
> >> perf_event and percpu maps on the target CPU. For example:
> >> 0. trigger the program
> >> 1. read perf_event on cpu x;
> >> 2. (optional) check which process is running on cpu x;
> >> 3. add perf_event value to percpu map(s) on cpu x.
> >
> > If the whole thing is about doing the above then I don't understand why new
> > prog type is needed.
>
> I was under the (probably wrong) impression that adding prog type is not
> that big a deal.
Not a big deal when it's necessary.
> > Can prog_test_run support existing BPF_PROG_TYPE_KPROBE?
>
> I haven't looked into all the details, but I bet this is possible.
>
> > "enable many use cases" sounds vague. I don't think folks reading
> > the patches can guess those "use cases".
> > "Testing existing kprobe bpf progs" would sound more convincing to me.
> > If the test_run framework can be extended to trigger kprobe with correct pt_regs.
> > As part of it test_run would trigger on a given cpu with $ip pointing
> > to some test function in test_run.c. For local test_run the stack trace
> > would include bpf syscall chain. For IPI the stack trace would include
> > the corresponding kernel pieces where top is our special test function.
> > Sort of like pseudo kprobe where there is no actual kprobe logic,
> > since kprobe prog doesn't care about mechanism. It needs correct
> > pt_regs only as input context.
> > The kprobe prog output (return value) has special meaning though,
> > so may be kprobe prog type is not a good fit.
> > Something like fentry/fexit may be better, since verifier check_return_code()
> > enforces 'return 0'. So their return value is effectively "void".
> > Then prog_test_run would need to gain an ability to trigger
> > fentry/fexit prog on a given cpu.
>
> Maybe we add a new attach type for BPF_PROG_TYPE_TRACING, which is in
> parallel with BPF_TRACE_FENTRY and BPF_TRACE_EXIT? Say BPF_TRACE_USER?
> (Just realized I like this name :-D, it matches USDT...). Then we can
> enable test_run for most (if not all) tracing programs, including
> fentry/fexit.
Why a new hook? Why can't the prog_test_run cmd be made to work with
BPF_PROG_TYPE_TRACING when it's loaded as BPF_TRACE_FENTRY and attach_btf_id
points to a special test function?
The test_run cmd would trigger execution of that special function.
The devil is in the details, of course: how attach, the trampoline, etc. are
going to work all needs to be figured out. Parallel test_run cmds ideally
shouldn't affect each other, etc.
> On Aug 5, 2020, at 3:50 PM, Alexei Starovoitov <[email protected]> wrote:
>
> On Wed, Aug 05, 2020 at 06:56:26PM +0000, Song Liu wrote:
>>
>>
>>> On Aug 5, 2020, at 10:16 AM, Alexei Starovoitov <[email protected]> wrote:
>>>
>>> On Wed, Aug 05, 2020 at 04:47:30AM +0000, Song Liu wrote:
>>>>
>>>> Being able to trigger BPF program on a different CPU could enable many
>>>> use cases and optimizations. The use case I am looking at is to access
>>>> perf_event and percpu maps on the target CPU. For example:
>>>> 0. trigger the program
>>>> 1. read perf_event on cpu x;
>>>> 2. (optional) check which process is running on cpu x;
>>>> 3. add perf_event value to percpu map(s) on cpu x.
>>>
>>> If the whole thing is about doing the above then I don't understand why new
>>> prog type is needed.
>>
>> I was under the (probably wrong) impression that adding prog type is not
>> that big a deal.
>
> Not a big deal when it's necessary.
>
>>> Can prog_test_run support existing BPF_PROG_TYPE_KPROBE?
>>
>> I haven't looked into all the details, but I bet this is possible.
>>
>>> "enable many use cases" sounds vague. I don't think folks reading
>>> the patches can guess those "use cases".
>>> "Testing existing kprobe bpf progs" would sound more convincing to me.
>>> If the test_run framework can be extended to trigger kprobe with correct pt_regs.
>>> As part of it test_run would trigger on a given cpu with $ip pointing
>>> to some test function in test_run.c. For local test_run the stack trace
>>> would include bpf syscall chain. For IPI the stack trace would include
>>> the corresponding kernel pieces where top is our special test function.
>>> Sort of like pseudo kprobe where there is no actual kprobe logic,
>>> since kprobe prog doesn't care about mechanism. It needs correct
>>> pt_regs only as input context.
>>> The kprobe prog output (return value) has special meaning though,
>>> so may be kprobe prog type is not a good fit.
>>> Something like fentry/fexit may be better, since verifier check_return_code()
>>> enforces 'return 0'. So their return value is effectively "void".
>>> Then prog_test_run would need to gain an ability to trigger
>>> fentry/fexit prog on a given cpu.
>>
>> Maybe we add a new attach type for BPF_PROG_TYPE_TRACING, which is in
>> parallel with BPF_TRACE_FENTRY and BPF_TRACE_EXIT? Say BPF_TRACE_USER?
>> (Just realized I like this name :-D, it matches USDT...). Then we can
>> enable test_run for most (if not all) tracing programs, including
>> fentry/fexit.
>
> Why a new hook? Why can't the prog_test_run cmd be made to work with
> BPF_PROG_TYPE_TRACING when it's loaded as BPF_TRACE_FENTRY and attach_btf_id
> points to a special test function?
> The test_run cmd would trigger execution of that special function.
I am not sure I am following 100%. IIUC, the special test function is a
kernel function, and we attach an fentry program to it. When multiple fentry
programs attach to the function, these programs will need proper filter
logic.
Alternatively, if test_run just prepares the ctx and calls BPF_PROG_RUN(),
like in bpf_test_run(), we don't need the special test function.
So I do think the new attach type requires a new hook. It is just like
BPF_TRACE_FENTRY without a valid attach_btf_id. Of course, we can reserve
a test function and use it for attach_btf_id. If test_run just calls
BPF_PROG_RUN(), we will probably never touch the test function.
IMO, we are choosing between two options:
1. FENTRY on a special function. The user specifies attach_btf_id for the
special function.
2. a new attach type (BPF_TRACE_USER) that does not require attach_btf_id;
there is no need for a special function.
I personally think #2 is the cleaner API, but I have no objection if #1 is
better in other ways.
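For completeness, a rough sketch of what option 1 would look like, reusing
the existing fentry test_run path on bpf_fentry_test1(); the per-CPU
targeting is the part that would still need to be added:

/* BPF side: attach to a dedicated kernel test function, e.g.
 * bpf_fentry_test1() in net/bpf/test_run.c. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(on_trigger, int a)
{
	/* if several users attach here, each program would need to filter
	 * out the other users' test_run invocations */
	return 0;
}

char _license[] SEC("license") = "GPL";

/* user side, e.g. in the test binary (#include <bpf/bpf.h>): each
 * BPF_PROG_TEST_RUN on this prog's fd calls bpf_fentry_test1() in the
 * kernel, which in turn runs the fentry program above. */
static void trigger(int prog_fd)
{
	struct bpf_prog_test_run_attr tattr = { .prog_fd = prog_fd };

	bpf_prog_test_run_xattr(&tattr);
}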
Thanks,
Song