From: Feng Zhou <[email protected]>
We hit a bad case on a big system with 96 CPUs where alloc_htab_elem()
could take as long as 1ms. The reason is that once the prealloc hashtab
has no free elems, every update attempt still grabs the spin_locks of
all CPUs' freelists, one by one. With multiple concurrent updaters the
lock contention becomes severe.
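For reference, here is an abridged sketch of the pre-patch
___pcpu_freelist_pop() hot path (simplified from
kernel/bpf/percpu_freelist.c); it shows how, once all freelists are
drained, a single pop ends up locking every CPU's list plus the extralist:

	static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
	{
		struct pcpu_freelist_head *head;
		struct pcpu_freelist_node *node;
		int cpu, orig_cpu;

		orig_cpu = cpu = raw_smp_processor_id();
		while (1) {
			head = per_cpu_ptr(s->freelist, cpu);
			/* lock is taken even if this cpu's list is empty */
			raw_spin_lock(&head->lock);
			node = head->first;
			if (node) {
				head->first = node->next;
				raw_spin_unlock(&head->lock);
				return node;
			}
			raw_spin_unlock(&head->lock);
			/* nothing here, walk to the next cpu and lock again */
			cpu = cpumask_next(cpu, cpu_possible_mask);
			if (cpu >= nr_cpu_ids)
				cpu = 0;
			if (cpu == orig_cpu)
				break;
		}
		/* all per-cpu lists were empty, finally lock the extralist too */
		raw_spin_lock(&s->extralist.lock);
		node = s->extralist.first;
		if (node)
			s->extralist.first = node->next;
		raw_spin_unlock(&s->extralist.lock);
		return node;
	}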
0001: Add is_empty to check whether the free list is empty before taking
the lock.
0002: Add benchmark to reproduce this worst case.
Changelog:
v1->v2: Addressed comments from Alexei Starovoitov.
- Add a benchmark to reproduce the issue.
- Adjust the code format to avoid adding extra indentation.
Some details here:
https://lore.kernel.org/all/[email protected]/T/
Feng Zhou (2):
bpf: avoid grabbing spin_locks of all cpus when no free elems
selftest/bpf/benchs: Add bpf_map benchmark
kernel/bpf/percpu_freelist.c | 28 ++++++-
kernel/bpf/percpu_freelist.h | 1 +
tools/testing/selftests/bpf/Makefile | 4 +-
tools/testing/selftests/bpf/bench.c | 2 +
.../selftests/bpf/benchs/bench_bpf_map.c | 78 +++++++++++++++++++
.../selftests/bpf/benchs/run_bench_bpf_map.sh | 10 +++
.../selftests/bpf/progs/bpf_map_bench.c | 27 +++++++
7 files changed, 146 insertions(+), 4 deletions(-)
create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_map.c
create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
create mode 100644 tools/testing/selftests/bpf/progs/bpf_map_bench.c
--
2.20.1
From: Feng Zhou <[email protected]>
Add a benchmark for hash_map to reproduce the worst case of
non-stop updates when the map has no free elements.
Signed-off-by: Feng Zhou <[email protected]>
---
tools/testing/selftests/bpf/Makefile | 4 +-
tools/testing/selftests/bpf/bench.c | 2 +
.../selftests/bpf/benchs/bench_bpf_map.c | 78 +++++++++++++++++++
.../selftests/bpf/benchs/run_bench_bpf_map.sh | 10 +++
.../selftests/bpf/progs/bpf_map_bench.c | 27 +++++++
5 files changed, 120 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_map.c
create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
create mode 100644 tools/testing/selftests/bpf/progs/bpf_map_bench.c
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3820608faf57..cd2fada21ed7 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -549,6 +549,7 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
$(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
$(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
$(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
+$(OUTPUT)/bench_bpf_map.o: $(OUTPUT)/bpf_map_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -560,7 +561,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
$(OUTPUT)/bench_ringbufs.o \
$(OUTPUT)/bench_bloom_filter_map.o \
$(OUTPUT)/bench_bpf_loop.o \
- $(OUTPUT)/bench_strncmp.o
+ $(OUTPUT)/bench_strncmp.o \
+ $(OUTPUT)/bench_bpf_map.o
$(call msg,BINARY,,$@)
$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index f973320e6dbf..32644c4adc84 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -397,6 +397,7 @@ extern const struct bench bench_hashmap_with_bloom;
extern const struct bench bench_bpf_loop;
extern const struct bench bench_strncmp_no_helper;
extern const struct bench bench_strncmp_helper;
+extern const struct bench bench_bpf_map;
static const struct bench *benchs[] = {
&bench_count_global,
@@ -431,6 +432,7 @@ static const struct bench *benchs[] = {
&bench_bpf_loop,
&bench_strncmp_no_helper,
&bench_strncmp_helper,
+ &bench_bpf_map,
};
static void setup_benchmark()
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_map.c b/tools/testing/selftests/bpf/benchs/bench_bpf_map.c
new file mode 100644
index 000000000000..4db08ed23f1f
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_map.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Bytedance */
+
+#include <argp.h>
+#include "bench.h"
+#include "bpf_map_bench.skel.h"
+
+/* BPF triggering benchmarks */
+static struct ctx {
+ struct bpf_map_bench *skel;
+ struct counter hits;
+} ctx;
+
+static void validate(void)
+{
+ if (env.consumer_cnt != 1) {
+ fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+ exit(1);
+ }
+}
+
+static void *producer(void *input)
+{
+ while (true) {
+ /* trigger the bpf program */
+ syscall(__NR_getpgid);
+ atomic_inc(&ctx.hits.value);
+ }
+
+ return NULL;
+}
+
+static void *consumer(void *input)
+{
+ return NULL;
+}
+
+static void measure(struct bench_res *res)
+{
+ res->hits = atomic_swap(&ctx.hits.value, 0);
+}
+
+static void setup(void)
+{
+ struct bpf_link *link;
+ int map_fd, i, max_entries;
+
+ setup_libbpf();
+
+ ctx.skel = bpf_map_bench__open_and_load();
+ if (!ctx.skel) {
+ fprintf(stderr, "failed to open skeleton\n");
+ exit(1);
+ }
+
+ link = bpf_program__attach(ctx.skel->progs.benchmark);
+ if (!link) {
+ fprintf(stderr, "failed to attach program!\n");
+ exit(1);
+ }
+
+ //fill hash_map
+ map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench);
+ max_entries = bpf_map__max_entries(ctx.skel->maps.hash_map_bench);
+ for (i = 0; i < max_entries; i++)
+ bpf_map_update_elem(map_fd, &i, &i, BPF_ANY);
+}
+
+const struct bench bench_bpf_map = {
+ .name = "bpf-map",
+ .validate = validate,
+ .setup = setup,
+ .producer_thread = producer,
+ .consumer_thread = consumer,
+ .measure = measure,
+ .report_progress = ops_report_progress,
+ .report_final = ops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh b/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
new file mode 100755
index 000000000000..d7cc969e4f85
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./benchs/run_common.sh
+
+set -eufo pipefail
+
+nr_threads=`expr $(cat /proc/cpuinfo | grep "processor"| wc -l) - 1`
+summary=$($RUN_BENCH -p $nr_threads bpf-map)
+printf "$summary"
diff --git a/tools/testing/selftests/bpf/progs/bpf_map_bench.c b/tools/testing/selftests/bpf/progs/bpf_map_bench.c
new file mode 100644
index 000000000000..655366e6e0f4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_map_bench.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Bytedance */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define MAX_ENTRIES 1000
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, MAX_ENTRIES);
+} hash_map_bench SEC(".maps");
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int benchmark(void *ctx)
+{
+ u32 key = bpf_get_prandom_u32();
+ u64 init_val = 1;
+
+ bpf_map_update_elem(&hash_map_bench, &key, &init_val, BPF_ANY);
+ return 0;
+}
--
2.20.1
From: Feng Zhou <[email protected]>
This patch adds is_empty to pcpu_freelist_head to indicate whether the
freelist has any free elements. If it does, grab the spin_lock; otherwise
skip it and check the next cpu's freelist.
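In short, the pop loop becomes (abridged sketch; see the diff below for
the complete change):

	while (1) {
		head = per_cpu_ptr(s->freelist, cpu);
		if (head->is_empty)	/* lockless peek, skip drained cpus */
			goto next_cpu;
		raw_spin_lock(&head->lock);
		node = head->first;
		if (node) {
			head->first = node->next;
			if (!head->first)	/* popped the last elem */
				head->is_empty = true;
			raw_spin_unlock(&head->lock);
			return node;
		}
		raw_spin_unlock(&head->lock);
	next_cpu:
		cpu = cpumask_next(cpu, cpu_possible_mask);
		if (cpu >= nr_cpu_ids)
			cpu = 0;
		if (cpu == orig_cpu)
			break;
	}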
Before patch: hash_map performance
./map_perf_test 1
0:hash_map_perf pre-alloc 975345 events per sec
4:hash_map_perf pre-alloc 855367 events per sec
12:hash_map_perf pre-alloc 860862 events per sec
8:hash_map_perf pre-alloc 849561 events per sec
3:hash_map_perf pre-alloc 849074 events per sec
6:hash_map_perf pre-alloc 847120 events per sec
10:hash_map_perf pre-alloc 845047 events per sec
5:hash_map_perf pre-alloc 841266 events per sec
14:hash_map_perf pre-alloc 849740 events per sec
2:hash_map_perf pre-alloc 839598 events per sec
9:hash_map_perf pre-alloc 838695 events per sec
11:hash_map_perf pre-alloc 845390 events per sec
7:hash_map_perf pre-alloc 834865 events per sec
13:hash_map_perf pre-alloc 842619 events per sec
1:hash_map_perf pre-alloc 804231 events per sec
15:hash_map_perf pre-alloc 795314 events per sec
hash_map worst case: no free elems
./map_perf_test 2048
6:worse hash_map_perf pre-alloc 28628 events per sec
5:worse hash_map_perf pre-alloc 28553 events per sec
11:worse hash_map_perf pre-alloc 28543 events per sec
3:worse hash_map_perf pre-alloc 28444 events per sec
1:worse hash_map_perf pre-alloc 28418 events per sec
7:worse hash_map_perf pre-alloc 28427 events per sec
13:worse hash_map_perf pre-alloc 28330 events per sec
14:worse hash_map_perf pre-alloc 28263 events per sec
9:worse hash_map_perf pre-alloc 28211 events per sec
15:worse hash_map_perf pre-alloc 28193 events per sec
12:worse hash_map_perf pre-alloc 28190 events per sec
10:worse hash_map_perf pre-alloc 28129 events per sec
8:worse hash_map_perf pre-alloc 28116 events per sec
4:worse hash_map_perf pre-alloc 27906 events per sec
2:worse hash_map_perf pre-alloc 27801 events per sec
0:worse hash_map_perf pre-alloc 27416 events per sec
3:worse hash_map_perf pre-alloc 28188 events per sec
ftrace trace
0) | htab_map_update_elem() {
0) 0.198 us | migrate_disable();
0) | _raw_spin_lock_irqsave() {
0) 0.157 us | preempt_count_add();
0) 0.538 us | }
0) 0.260 us | lookup_elem_raw();
0) | alloc_htab_elem() {
0) | __pcpu_freelist_pop() {
0) | _raw_spin_lock() {
0) 0.152 us | preempt_count_add();
0) 0.352 us | native_queued_spin_lock_slowpath();
0) 1.065 us | }
| ...
0) | _raw_spin_unlock() {
0) 0.254 us | preempt_count_sub();
0) 0.555 us | }
0) + 25.188 us | }
0) + 25.486 us | }
0) | _raw_spin_unlock_irqrestore() {
0) 0.155 us | preempt_count_sub();
0) 0.454 us | }
0) 0.148 us | migrate_enable();
0) + 28.439 us | }
The test machine has 16 CPUs, so a single pop can try to grab the
spin_lock 17 times: once per cpu freelist, plus once for the extralist.
After patch: hash_map performance
./map_perf_test 1
0:hash_map_perf pre-alloc 969348 events per sec
10:hash_map_perf pre-alloc 906526 events per sec
11:hash_map_perf pre-alloc 904557 events per sec
9:hash_map_perf pre-alloc 902384 events per sec
15:hash_map_perf pre-alloc 912287 events per sec
14:hash_map_perf pre-alloc 905689 events per sec
12:hash_map_perf pre-alloc 903680 events per sec
13:hash_map_perf pre-alloc 902631 events per sec
8:hash_map_perf pre-alloc 875369 events per sec
4:hash_map_perf pre-alloc 862808 events per sec
1:hash_map_perf pre-alloc 857218 events per sec
2:hash_map_perf pre-alloc 852875 events per sec
5:hash_map_perf pre-alloc 846497 events per sec
6:hash_map_perf pre-alloc 828467 events per sec
3:hash_map_perf pre-alloc 812542 events per sec
7:hash_map_perf pre-alloc 805336 events per sec
hash_map worst case: no free elems
./map_perf_test 2048
7:worse hash_map_perf pre-alloc 391104 events per sec
4:worse hash_map_perf pre-alloc 388073 events per sec
5:worse hash_map_perf pre-alloc 387038 events per sec
1:worse hash_map_perf pre-alloc 386546 events per sec
0:worse hash_map_perf pre-alloc 384590 events per sec
11:worse hash_map_perf pre-alloc 379378 events per sec
10:worse hash_map_perf pre-alloc 375480 events per sec
12:worse hash_map_perf pre-alloc 372394 events per sec
6:worse hash_map_perf pre-alloc 367692 events per sec
3:worse hash_map_perf pre-alloc 363970 events per sec
9:worse hash_map_perf pre-alloc 364008 events per sec
8:worse hash_map_perf pre-alloc 363759 events per sec
2:worse hash_map_perf pre-alloc 360743 events per sec
14:worse hash_map_perf pre-alloc 361195 events per sec
13:worse hash_map_perf pre-alloc 360276 events per sec
15:worse hash_map_perf pre-alloc 360057 events per sec
0:worse hash_map_perf pre-alloc 378177 events per sec
ftrace trace
0) | htab_map_update_elem() {
0) 0.317 us | migrate_disable();
0) | _raw_spin_lock_irqsave() {
0) 0.260 us | preempt_count_add();
0) 1.803 us | }
0) 0.276 us | lookup_elem_raw();
0) | alloc_htab_elem() {
0) 0.586 us | __pcpu_freelist_pop();
0) 0.945 us | }
0) | _raw_spin_unlock_irqrestore() {
0) 0.160 us | preempt_count_sub();
0) 0.972 us | }
0) 0.657 us | migrate_enable();
0) 8.669 us | }
It can be seen that with this patch the normal-case map performance is
almost unchanged, and when there are no free elems we check is_empty
first instead of directly acquiring the spin_lock.
As for why is_empty is added instead of directly checking head->first:
head->first is modified on every map update, which keeps invalidating the
cache line on other cpus, whereas is_empty only changes when the freelist
transitions between empty and non-empty, so the performance is better.
Co-developed-by: Chengming Zhou <[email protected]>
Signed-off-by: Chengming Zhou <[email protected]>
Signed-off-by: Feng Zhou <[email protected]>
---
kernel/bpf/percpu_freelist.c | 28 +++++++++++++++++++++++++---
kernel/bpf/percpu_freelist.h | 1 +
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 3d897de89061..f83eb63720d4 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -16,9 +16,11 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
raw_spin_lock_init(&head->lock);
head->first = NULL;
+ head->is_empty = true;
}
raw_spin_lock_init(&s->extralist.lock);
s->extralist.first = NULL;
+ s->extralist.is_empty = true;
return 0;
}
@@ -32,6 +34,8 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
{
node->next = head->first;
head->first = node;
+ if (head->is_empty)
+ head->is_empty = false;
}
static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -130,14 +134,19 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (head->is_empty)
+ goto next_cpu;
raw_spin_lock(&head->lock);
node = head->first;
if (node) {
head->first = node->next;
+ if (!head->first)
+ head->is_empty = true;
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
+next_cpu:
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
@@ -146,10 +155,15 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
}
/* per cpu lists are all empty, try extralist */
+ if (s->extralist.is_empty)
+ return NULL;
raw_spin_lock(&s->extralist.lock);
node = s->extralist.first;
- if (node)
+ if (node) {
s->extralist.first = node->next;
+ if (!s->extralist.first)
+ s->extralist.is_empty = true;
+ }
raw_spin_unlock(&s->extralist.lock);
return node;
}
@@ -164,15 +178,20 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (head->is_empty)
+ goto next_cpu;
if (raw_spin_trylock(&head->lock)) {
node = head->first;
if (node) {
head->first = node->next;
+ if (!head->first)
+ head->is_empty = true;
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
}
+next_cpu:
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
@@ -181,11 +200,14 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
}
/* cannot pop from per cpu lists, try extralist */
- if (!raw_spin_trylock(&s->extralist.lock))
+ if (s->extralist.is_empty || !raw_spin_trylock(&s->extralist.lock))
return NULL;
node = s->extralist.first;
- if (node)
+ if (node) {
s->extralist.first = node->next;
+ if (!s->extralist.first)
+ s->extralist.is_empty = true;
+ }
raw_spin_unlock(&s->extralist.lock);
return node;
}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3c76553cfe57..9e4545631ed5 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -9,6 +9,7 @@
struct pcpu_freelist_head {
struct pcpu_freelist_node *first;
raw_spinlock_t lock;
+ bool is_empty;
};
struct pcpu_freelist {
--
2.20.1
On Tue, May 24, 2022 at 12:53 AM Feng zhou <[email protected]> wrote:
>
> From: Feng Zhou <[email protected]>
>
> Add a benchmark for hash_map to reproduce the worst case of
> non-stop updates when the map has no free elements.
>
> Signed-off-by: Feng Zhou <[email protected]>
> ---
> tools/testing/selftests/bpf/Makefile | 4 +-
> tools/testing/selftests/bpf/bench.c | 2 +
> .../selftests/bpf/benchs/bench_bpf_map.c | 78 +++++++++++++++++++
> .../selftests/bpf/benchs/run_bench_bpf_map.sh | 10 +++
> .../selftests/bpf/progs/bpf_map_bench.c | 27 +++++++
> 5 files changed, 120 insertions(+), 1 deletion(-)
> create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_map.c
> create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
> create mode 100644 tools/testing/selftests/bpf/progs/bpf_map_bench.c
>
[...]
> +static void setup(void)
> +{
> + struct bpf_link *link;
> + int map_fd, i, max_entries;
> +
> + setup_libbpf();
> +
> + ctx.skel = bpf_map_bench__open_and_load();
> + if (!ctx.skel) {
> + fprintf(stderr, "failed to open skeleton\n");
> + exit(1);
> + }
> +
> + link = bpf_program__attach(ctx.skel->progs.benchmark);
> + if (!link) {
> + fprintf(stderr, "failed to attach program!\n");
> + exit(1);
> + }
> +
> + //fill hash_map
don't use C++ comments
> + map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench);
> + max_entries = bpf_map__max_entries(ctx.skel->maps.hash_map_bench);
> + for (i = 0; i < max_entries; i++)
> + bpf_map_update_elem(map_fd, &i, &i, BPF_ANY);
> +}
> +
> +const struct bench bench_bpf_map = {
> + .name = "bpf-map",
this is too generic name, it's testing one particular scenario, so
call this out in the name. bpf-hashmap-full-update or something (same
for all the relevant function and file names)
> + .validate = validate,
> + .setup = setup,
> + .producer_thread = producer,
> + .consumer_thread = consumer,
> + .measure = measure,
> + .report_progress = ops_report_progress,
> + .report_final = ops_report_final,
> +};
> diff --git a/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh b/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
> new file mode 100755
> index 000000000000..d7cc969e4f85
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
> @@ -0,0 +1,10 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +
> +source ./benchs/run_common.sh
> +
> +set -eufo pipefail
> +
> +nr_threads=`expr $(cat /proc/cpuinfo | grep "processor"| wc -l) - 1`
> +summary=$($RUN_BENCH -p $nr_threads bpf-map)
> +printf "$summary"
> diff --git a/tools/testing/selftests/bpf/progs/bpf_map_bench.c b/tools/testing/selftests/bpf/progs/bpf_map_bench.c
> new file mode 100644
> index 000000000000..655366e6e0f4
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_map_bench.c
> @@ -0,0 +1,27 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2022 Bytedance */
> +
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include "bpf_misc.h"
> +
> +char _license[] SEC("license") = "GPL";
> +
> +#define MAX_ENTRIES 1000
> +
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __type(key, u32);
> + __type(value, u64);
> + __uint(max_entries, MAX_ENTRIES);
> +} hash_map_bench SEC(".maps");
> +
> +SEC("fentry/" SYS_PREFIX "sys_getpgid")
> +int benchmark(void *ctx)
> +{
> + u32 key = bpf_get_prandom_u32();
> + u64 init_val = 1;
> +
> + bpf_map_update_elem(&hash_map_bench, &key, &init_val, BPF_ANY);
> + return 0;
> +}
> --
> 2.20.1
>
On Tue, May 24, 2022 at 12:53 AM Feng zhou <[email protected]> wrote:
> +static void setup(void)
> +{
> + struct bpf_link *link;
> + int map_fd, i, max_entries;
> +
> + setup_libbpf();
> +
> + ctx.skel = bpf_map_bench__open_and_load();
> + if (!ctx.skel) {
> + fprintf(stderr, "failed to open skeleton\n");
> + exit(1);
> + }
> +
> + link = bpf_program__attach(ctx.skel->progs.benchmark);
> + if (!link) {
> + fprintf(stderr, "failed to attach program!\n");
> + exit(1);
> + }
> +
> + //fill hash_map
> + map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench);
> + max_entries = bpf_map__max_entries(ctx.skel->maps.hash_map_bench);
> + for (i = 0; i < max_entries; i++)
> + bpf_map_update_elem(map_fd, &i, &i, BPF_ANY);
> +}
...
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
> +int benchmark(void *ctx)
> +{
> + u32 key = bpf_get_prandom_u32();
> + u64 init_val = 1;
> +
> + bpf_map_update_elem(&hash_map_bench, &key, &init_val, BPF_ANY);
> + return 0;
> +}
This benchmark is artificial at its extreme.
First it populates the map till max_entries and then
constantly bounces off the max_entries limit in a bpf prog.
Sometimes random_u32 will be less than max_entries
and map_update_elem will hit a fast path,
but most of the time it will fail in alloc_htab_elem()
and map_update_elem will fail.
It does demonstrate that percpu_free_list is inefficient
when it's empty, but there is no way such a microbenchmark
justifies optimizing this corner case.
If there is a production use case please code it up in
a benchmark.
Also there is a lot of other overhead: syscall and atomic-s.
To stress map_update_elem please use a for() loop inside bpf prog.
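For illustration, a loop-based variant of the benchmark prog might look
roughly like this (the loop count is an arbitrary assumption, and it reuses
the hash_map_bench map and includes from the posted bpf_map_bench.c; a
sketch only, not a tested replacement):

	SEC("fentry/" SYS_PREFIX "sys_getpgid")
	int benchmark(void *ctx)
	{
		u64 init_val = 1;
		u32 key;
		int i;

		/* amortize the syscall and userspace atomic overhead:
		 * do many updates per trigger
		 */
		for (i = 0; i < 1024; i++) {
			key = bpf_get_prandom_u32();
			bpf_map_update_elem(&hash_map_bench, &key, &init_val,
					    BPF_ANY);
		}
		return 0;
	}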
On 2022/5/25 7:29 AM, Andrii Nakryiko wrote:
> On Tue, May 24, 2022 at 12:53 AM Feng zhou <[email protected]> wrote:
>> From: Feng Zhou <[email protected]>
>>
>> Add a benchmark for hash_map to reproduce the worst case of
>> non-stop updates when the map has no free elements.
>>
>> Signed-off-by: Feng Zhou <[email protected]>
>> ---
>> tools/testing/selftests/bpf/Makefile | 4 +-
>> tools/testing/selftests/bpf/bench.c | 2 +
>> .../selftests/bpf/benchs/bench_bpf_map.c | 78 +++++++++++++++++++
>> .../selftests/bpf/benchs/run_bench_bpf_map.sh | 10 +++
>> .../selftests/bpf/progs/bpf_map_bench.c | 27 +++++++
>> 5 files changed, 120 insertions(+), 1 deletion(-)
>> create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_map.c
>> create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
>> create mode 100644 tools/testing/selftests/bpf/progs/bpf_map_bench.c
>>
> [...]
>
>> +static void setup(void)
>> +{
>> + struct bpf_link *link;
>> + int map_fd, i, max_entries;
>> +
>> + setup_libbpf();
>> +
>> + ctx.skel = bpf_map_bench__open_and_load();
>> + if (!ctx.skel) {
>> + fprintf(stderr, "failed to open skeleton\n");
>> + exit(1);
>> + }
>> +
>> + link = bpf_program__attach(ctx.skel->progs.benchmark);
>> + if (!link) {
>> + fprintf(stderr, "failed to attach program!\n");
>> + exit(1);
>> + }
>> +
>> + //fill hash_map
> don't use C++ comments
Ok, will do. Thanks.
>
>> + map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench);
>> + max_entries = bpf_map__max_entries(ctx.skel->maps.hash_map_bench);
>> + for (i = 0; i < max_entries; i++)
>> + bpf_map_update_elem(map_fd, &i, &i, BPF_ANY);
>> +}
>> +
>> +const struct bench bench_bpf_map = {
>> + .name = "bpf-map",
> this is too generic name, it's testing one particular scenario, so
> call this out in the name. bpf-hashmap-full-update or something (same
> for all the relevant function and file names)
>
Ok, will do. Thanks.
>> + .validate = validate,
>> + .setup = setup,
>> + .producer_thread = producer,
>> + .consumer_thread = consumer,
>> + .measure = measure,
>> + .report_progress = ops_report_progress,
>> + .report_final = ops_report_final,
>> +};
>> diff --git a/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh b/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
>> new file mode 100755
>> index 000000000000..d7cc969e4f85
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/benchs/run_bench_bpf_map.sh
>> @@ -0,0 +1,10 @@
>> +#!/bin/bash
>> +# SPDX-License-Identifier: GPL-2.0
>> +
>> +source ./benchs/run_common.sh
>> +
>> +set -eufo pipefail
>> +
>> +nr_threads=`expr $(cat /proc/cpuinfo | grep "processor"| wc -l) - 1`
>> +summary=$($RUN_BENCH -p $nr_threads bpf-map)
>> +printf "$summary"
>> diff --git a/tools/testing/selftests/bpf/progs/bpf_map_bench.c b/tools/testing/selftests/bpf/progs/bpf_map_bench.c
>> new file mode 100644
>> index 000000000000..655366e6e0f4
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/bpf_map_bench.c
>> @@ -0,0 +1,27 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2022 Bytedance */
>> +
>> +#include "vmlinux.h"
>> +#include <bpf/bpf_helpers.h>
>> +#include "bpf_misc.h"
>> +
>> +char _license[] SEC("license") = "GPL";
>> +
>> +#define MAX_ENTRIES 1000
>> +
>> +struct {
>> + __uint(type, BPF_MAP_TYPE_HASH);
>> + __type(key, u32);
>> + __type(value, u64);
>> + __uint(max_entries, MAX_ENTRIES);
>> +} hash_map_bench SEC(".maps");
>> +
>> +SEC("fentry/" SYS_PREFIX "sys_getpgid")
>> +int benchmark(void *ctx)
>> +{
>> + u32 key = bpf_get_prandom_u32();
>> + u64 init_val = 1;
>> +
>> + bpf_map_update_elem(&hash_map_bench, &key, &init_val, BPF_ANY);
>> + return 0;
>> +}
>> --
>> 2.20.1
>>