2022-08-10 17:37:09

by Francis Laniel

Subject: [RFC PATCH v1 0/3] Make BPF ring buffer overwritable

Hi.


First, I hope you and your relatives are doing well.

Normally, when a BPF ring buffer is full, producers cannot write anymore and
need to wait for the consumer to read some data.
As a consequence, calling bpf_ringbuf_reserve() from eBPF code returns NULL.

This contribution adds a new flag to make BPF ring buffers overwritable.
When the buffer is full, the producer overwrites the oldest data.
So, calling bpf_ringbuf_reserve() on an overwritable BPF ring buffer never
returns NULL, but the consumer will lose some data.
This flag can be used to monitor lots of events, like all the syscalls done on
a given machine.
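
For illustration, here is a minimal sketch of what declaring an overwritable
ring buffer could look like from BPF code (using the raw flag value, as the
toy program below does, since the constant is only added to the UAPI header
by the first patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
        __uint(max_entries, 4096);   /* power of two, multiple of page size */
        __uint(map_flags, 1U << 13); /* BFP_F_RB_OVER_WRITABLE from patch 1 */
} buffer SEC(".maps");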

I tested it within a VM with the fourth patch, which creates a "toy" eBPF
program:
you@home$ cd /path/to/iovisor/bcc
you@home$ git apply 0001-for-test-purpose-only-Add-toy-to-play-with-BPF-ring-.patch
you@home$ cd /path/to/linux/tools/lib/bpf
you@home$ make -j$(nproc)
you@home$ cp libbpf.a /path/to/iovisor/bcc/libbpf-tools/.output
you@home$ cd /path/to/iovisor/bcc/libbpf-tools/
you@home$ make -j toy
# Start your VM and copy the toy executable into it.
you@vm# ./share/toy
Press any key to begin consuming!
^Z
you@vm# for i in {1..16}; do true; done
you@vm# fg # Please press any key

8
9
10
11
12
13
14
15
16

^Z
you@vm# true && true
you@vm# fg
17
18

As you can see, the oldest events have been overwritten.
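
(For reference: each record in this test is sizeof(struct event) = 512 bytes,
as defined in toy.h, plus the 8-byte ring buffer header, i.e. the 520-byte
record length used as an example in the first patch, so the 4096-byte buffer
only keeps the most recent records.)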

If you see any way to improve this contribution, feel free to share it.

Francis Laniel (3):
bpf: Make ring buffer overwritable.
do not merge: Temporary fix for is_power_of_2.
libbpf: Make bpf ring buffer overwritable.

include/uapi/linux/bpf.h | 3 ++
kernel/bpf/ringbuf.c | 51 ++++++++++++++++++++++++++--------
tools/include/uapi/linux/bpf.h | 3 ++
tools/lib/bpf/libbpf.c | 2 +-
tools/lib/bpf/ringbuf.c | 35 ++++++++++++++++++++++-
5 files changed, 81 insertions(+), 13 deletions(-)


Best regards and thank you in advance.
--
2.25.1


2022-08-10 17:38:42

by Francis Laniel

Subject: [RFC PATCH v1 1/3] bpf: Make ring buffer overwritable.

By default, BPF ring buffers are size bounded: once producers have filled the
buffer, they need to wait for the consumer to read that data before adding new
entries.
In terms of API, bpf_ringbuf_reserve() returns NULL if the buffer is full.

This patch permits making a BPF ring buffer overwritable.
Once producers have written as much data as the buffer size, they begin to
overwrite existing data, so the oldest entries are replaced.
As a result, bpf_ringbuf_reserve() never returns NULL.

Signed-off-by: Francis Laniel <[email protected]>
---
include/uapi/linux/bpf.h | 3 +++
kernel/bpf/ringbuf.c | 51 +++++++++++++++++++++++++++++++---------
2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ef78e0e1a754..19c7039265d8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1226,6 +1226,9 @@ enum {

/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create an overwritable BPF_RINGBUF */
+ BFP_F_RB_OVER_WRITABLE = (1U << 13),
};

/* Flags for BPF_PROG_QUERY. */
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index ded4faeca192..e2d907df4989 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -12,7 +12,7 @@
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

-#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BFP_F_RB_OVER_WRITABLE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
@@ -37,6 +37,8 @@ struct bpf_ringbuf {
u64 mask;
struct page **pages;
int nr_pages;
+ __u8 over_writable: 1,
+ __reserved: 7;
spinlock_t spinlock ____cacheline_aligned_in_smp;
/* Consumer and producer counters are put into separate pages to allow
* mapping consumer page as r/w, but restrict producer page to r/o.
@@ -127,7 +129,12 @@ static void bpf_ringbuf_notify(struct irq_work *work)
wake_up_all(&rb->waitq);
}

-static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+static inline bool is_over_writable(struct bpf_ringbuf *rb)
+{
+ return !!rb->over_writable;
+}
+
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, __u32 flags)
{
struct bpf_ringbuf *rb;

@@ -142,6 +149,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
rb->mask = data_sz - 1;
rb->consumer_pos = 0;
rb->producer_pos = 0;
+ rb->over_writable = !!(flags & BFP_F_RB_OVER_WRITABLE);

return rb;
}
@@ -170,7 +178,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)

bpf_map_init_from_attr(&rb_map->map, attr);

- rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+ rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, attr->map_flags);
if (!rb_map->rb) {
kfree(rb_map);
return ERR_PTR(-ENOMEM);
@@ -244,11 +252,15 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)

static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
- unsigned long cons_pos, prod_pos;
+ unsigned long cons_pos, prod_pos, diff;

cons_pos = smp_load_acquire(&rb->consumer_pos);
prod_pos = smp_load_acquire(&rb->producer_pos);
- return prod_pos - cons_pos;
+ diff = prod_pos - cons_pos;
+
+ if (is_over_writable(rb) && diff > rb->mask)
+ return rb->mask;
+ return diff;
}

static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
@@ -327,12 +339,29 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
prod_pos = rb->producer_pos;
new_prod_pos = prod_pos + len;

- /* check for out of ringbuf space by ensuring producer position
- * doesn't advance more than (ringbuf_size - 1) ahead
- */
- if (new_prod_pos - cons_pos > rb->mask) {
- spin_unlock_irqrestore(&rb->spinlock, flags);
- return NULL;
+ if (!is_over_writable(rb)) {
+ /* check for out of ringbuf space by ensuring producer position
+ * doesn't advance more than (ringbuf_size - 1) ahead
+ */
+ if (new_prod_pos - cons_pos > rb->mask) {
+ spin_unlock_irqrestore(&rb->spinlock, flags);
+ return NULL;
+ }
+ } else {
+ /*
+ * Data length is already rounded up to a multiple of 8, but for
+ * an overwritable buffer we need to round it again.
+ * Indeed, when the producer position crosses the buffer size,
+ * the new position may not be a multiple of the buffer
+ * size.
+ * For example, if len is 520 and the buffer size is 4096, then
+ * the next position after 4096 is 4160.
+ * This is a problem as it prevents us from overwriting data
+ * (4160 & 4095 = 64, which is different from 0).
+ * So, by subtracting (new_prod_pos & mask) % len, we are able
+ * to overwrite existing data from the start of the buffer.
+ */
+ new_prod_pos -= (new_prod_pos & rb->mask) % len;
}

hdr = (void *)rb->data + (prod_pos & rb->mask);
--
2.25.1
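
To make the arithmetic in the comment above concrete, here is a small
standalone sketch (not part of the patch) of the producer-position
correction, using the 520-byte record / 4096-byte buffer example:

#include <stdio.h>

int main(void)
{
        unsigned long mask = 4096 - 1;                 /* 4096-byte data area */
        unsigned long len = 520;                       /* 512-byte sample + 8-byte header */
        unsigned long prod_pos = 7 * len;              /* 3640: seven records written */
        unsigned long new_prod_pos = prod_pos + len;   /* 4160: crosses the buffer end */

        /* Same correction as in __bpf_ringbuf_reserve() above. */
        new_prod_pos -= (new_prod_pos & mask) % len;

        /* Prints 4096 (offset 0): the 8th record overwrites the 1st one. */
        printf("corrected producer position: %lu (offset %lu)\n",
               new_prod_pos, new_prod_pos & mask);
        return 0;
}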

2022-08-10 17:42:01

by Francis Laniel

Subject: [RFC PATCH v1 2/3] do not merge: Temporary fix for is_power_of_2.

Signed-off-by: Francis Laniel <[email protected]>
---
tools/lib/bpf/libbpf.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e89cc9c885b3..5d0e997c85ea 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4945,7 +4945,7 @@ static void bpf_map__destroy(struct bpf_map *map);

static bool is_pow_of_2(size_t x)
{
- return x && (x & (x - 1));
+ return x && (x & (x - 1)) == 0;
}

static size_t adjust_ringbuf_sz(size_t sz)
--
2.25.1
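
For context: for non-zero x, the original expression "x && (x & (x - 1))" is
true exactly when x is NOT a power of two (e.g. 8 & 7 == 0 but 12 & 11 == 8),
so the helper returned the inverted result; appending "== 0" restores the
intended test.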

2022-08-10 17:48:43

by Francis Laniel

Subject: [PATCH] for test purpose only: Add toy to play with BPF ring buffer.

This patch should be applied on iovisor/bcc.

Signed-off-by: Francis Laniel <[email protected]>
---
libbpf-tools/Makefile | 1 +
libbpf-tools/toy.bpf.c | 32 ++++++++++++++++++++
libbpf-tools/toy.c | 67 ++++++++++++++++++++++++++++++++++++++++++
libbpf-tools/toy.h | 4 +++
4 files changed, 104 insertions(+)
create mode 100644 libbpf-tools/toy.bpf.c
create mode 100644 libbpf-tools/toy.c
create mode 100644 libbpf-tools/toy.h

diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile
index c3bbac27..904e7712 100644
--- a/libbpf-tools/Makefile
+++ b/libbpf-tools/Makefile
@@ -62,6 +62,7 @@ APPS = \
tcplife \
tcprtt \
tcpsynbl \
+ toy \
vfsstat \
#

diff --git a/libbpf-tools/toy.bpf.c b/libbpf-tools/toy.bpf.c
new file mode 100644
index 00000000..b6b8f92b
--- /dev/null
+++ b/libbpf-tools/toy.bpf.c
@@ -0,0 +1,32 @@
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+#include <linux/bpf.h>
+#include "toy.h"
+
+
+struct {
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+ __uint(max_entries, 4096);
+ __uint(map_flags, 1U << 13);
+} buffer SEC(".maps");
+
+static __u32 count = 0;
+
+SEC("tracepoint/syscalls/sys_enter_execve")
+int sys_enter_execve(void) {
+ count++;
+ struct event *event = bpf_ringbuf_reserve(&buffer, sizeof(struct event), 0);
+ if (!event) {
+ return 1;
+ }
+
+ event->count = count;
+ bpf_ringbuf_submit(event, 0);
+
+ bpf_printk("addr: %p; count: %u\n", event, count);
+ bpf_printk("available: %lu; cons pos: %lu; prod pos: %lu\n", bpf_ringbuf_query(&buffer, 0), bpf_ringbuf_query(&buffer, BPF_RB_CONS_POS), bpf_ringbuf_query(&buffer, BPF_RB_PROD_POS));
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
\ No newline at end of file
diff --git a/libbpf-tools/toy.c b/libbpf-tools/toy.c
new file mode 100644
index 00000000..7e4f7fdf
--- /dev/null
+++ b/libbpf-tools/toy.c
@@ -0,0 +1,67 @@
+#include <bpf/libbpf.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "toy.h"
+#include "toy.skel.h"
+#include "btf_helpers.h"
+
+
+static int buf_process_sample(void *ctx, void *data, size_t len) {
+ struct event *evt = (struct event *)data;
+ printf("%d\n", evt->count);
+
+ return 0;
+}
+
+int main(void) {
+ LIBBPF_OPTS(bpf_object_open_opts, open_opts);
+ int buffer_map_fd = -1;
+ struct toy_bpf *obj;
+ int err;
+
+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+ err = ensure_core_btf(&open_opts);
+ if (err) {
+ fprintf(stderr, "failed to fetch necessary BTF for CO-RE: %s\n", strerror(-err));
+ return 1;
+ }
+
+ obj = toy_bpf__open_opts(&open_opts);
+ if (!obj) {
+ fprintf(stderr, "failed to open BPF object\n");
+ return 1;
+ }
+
+ err = toy_bpf__load(obj);
+ if (err) {
+ fprintf(stderr, "failed to load BPF object: %d\n", err);
+ return 1;
+ }
+
+ struct ring_buffer *ring_buffer;
+
+ buffer_map_fd = bpf_object__find_map_fd_by_name(obj->obj, "buffer");
+ ring_buffer = ring_buffer__new(buffer_map_fd, buf_process_sample, NULL, NULL);
+
+ if(!ring_buffer) {
+ fprintf(stderr, "failed to create ring buffer\n");
+ return 1;
+ }
+
+ err = toy_bpf__attach(obj);
+ if (err) {
+ fprintf(stderr, "failed to attach BPF programs\n");
+ return 1;
+ }
+
+ puts("Press any key to begin consuming!");
+ getchar();
+
+ while(1) {
+ ring_buffer__consume(ring_buffer);
+ sleep(1);
+ }
+
+ return 0;
+}
diff --git a/libbpf-tools/toy.h b/libbpf-tools/toy.h
new file mode 100644
index 00000000..36998170
--- /dev/null
+++ b/libbpf-tools/toy.h
@@ -0,0 +1,4 @@
+struct event {
+ __u32 count;
+ char filler[4096 / 8 - sizeof(__u32)];
+};
\ No newline at end of file
--
2.25.1

2022-08-10 18:31:28

by Francis Laniel

Subject: [RFC PATCH v1 3/3] libbpf: Make bpf ring buffer overwritable.

This patch permits using the overwritable feature of BPF ring buffers from
userspace.

Signed-off-by: Francis Laniel <[email protected]>
---
tools/include/uapi/linux/bpf.h | 3 +++
tools/lib/bpf/ringbuf.c | 35 +++++++++++++++++++++++++++++++++-
2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ef78e0e1a754..19c7039265d8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1226,6 +1226,9 @@ enum {

/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create an overwritable BPF_RINGBUF */
+ BFP_F_RB_OVER_WRITABLE = (1U << 13),
};

/* Flags for BPF_PROG_QUERY. */
diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
index 8bc117bcc7bc..2bd584f7250b 100644
--- a/tools/lib/bpf/ringbuf.c
+++ b/tools/lib/bpf/ringbuf.c
@@ -23,6 +23,8 @@

struct ring {
ring_buffer_sample_fn sample_cb;
+ __u8 over_writable: 1,
+ __reserved: 7;
void *ctx;
void *data;
unsigned long *consumer_pos;
@@ -95,6 +97,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
r->sample_cb = sample_cb;
r->ctx = ctx;
r->mask = info.max_entries - 1;
+ r->over_writable = !!(info.map_flags & BFP_F_RB_OVER_WRITABLE);

/* Map writable consumer page */
tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
@@ -202,6 +205,11 @@ static inline int roundup_len(__u32 len)
return (len + 7) / 8 * 8;
}

+static inline bool is_over_writable(struct ring *r)
+{
+ return !!r->over_writable;
+}
+
static int64_t ringbuf_process_ring(struct ring* r)
{
int *len_ptr, len, err;
@@ -209,12 +217,25 @@ static int64_t ringbuf_process_ring(struct ring* r)
int64_t cnt = 0;
unsigned long cons_pos, prod_pos;
bool got_new_data;
+ int rounded_len;
void *sample;

cons_pos = smp_load_acquire(r->consumer_pos);
do {
got_new_data = false;
prod_pos = smp_load_acquire(r->producer_pos);
+
+ /*
+ * If the difference between the producer position and that of
+ * the consumer is higher than the buffer size, it means the
+ * producer already looped over the buffer.
+ * So, the data at the consumer position was already overwritten.
+ * We can then bump the consumer position to that of the
+ * producer minus the buffer size.
+ */
+ if (is_over_writable(r) && prod_pos - cons_pos > r->mask)
+ cons_pos = prod_pos - (r->mask + 1);
+
while (cons_pos < prod_pos) {
len_ptr = r->data + (cons_pos & r->mask);
len = smp_load_acquire(len_ptr);
@@ -224,7 +245,19 @@ static int64_t ringbuf_process_ring(struct ring* r)
goto done;

got_new_data = true;
- cons_pos += roundup_len(len);
+ rounded_len = roundup_len(len);
+ cons_pos += rounded_len;
+
+ /*
+ * rounded_len is a multiple of 8, but a length divisible
+ * by 8 is not necessarily divisible by the buffer size.
+ * So, we need to round again to avoid reading at the
+ * wrong places.
+ * See the kernel implementation for more details.
+ */
+ if (is_over_writable(r)) {
+ cons_pos -= (cons_pos & r->mask) % rounded_len;
+ }

if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
--
2.25.1
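
To illustrate the consumer-side catch-up above in isolation, here is a small
standalone sketch (not part of the patch); the positions are illustrative and
not tied to real record boundaries:

#include <stdio.h>

int main(void)
{
        unsigned long mask = 4096 - 1;      /* 4096-byte data area */
        unsigned long cons_pos = 1040;      /* consumer stopped after two 520-byte records */
        unsigned long prod_pos = 9360;      /* producer kept writing and wrapped around */

        /* Same catch-up as in ringbuf_process_ring() above: everything older
         * than one full buffer was overwritten, so skip over it.
         */
        if (prod_pos - cons_pos > mask)
                cons_pos = prod_pos - (mask + 1);

        /* Prints 5264: exactly one buffer size behind the producer. */
        printf("consumer restarts at %lu\n", cons_pos);
        return 0;
}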

2022-08-16 02:54:29

by Andrii Nakryiko

Subject: Re: [RFC PATCH v1 1/3] bpf: Make ring buffer overwritable.

On Wed, Aug 10, 2022 at 10:18 AM Francis Laniel
<[email protected]> wrote:
>
> By default, BPF ring buffers are size bounded: once producers have filled the
> buffer, they need to wait for the consumer to read that data before adding new
> entries.
> In terms of API, bpf_ringbuf_reserve() returns NULL if the buffer is full.
>
> This patch permits making a BPF ring buffer overwritable.
> Once producers have written as much data as the buffer size, they begin to
> overwrite existing data, so the oldest entries are replaced.
> As a result, bpf_ringbuf_reserve() never returns NULL.
>

Part of BPF ringbuf record (first 8 bytes) stores information like
record size and offset in pages to the beginning of ringbuf map
metadata. This is used by consumer to know how much data belongs to
data record, but also for making sure that
bpf_ringbuf_reserve()/bpf_ringbuf_submit() work correctly and don't
corrupt kernel memory.

If we simply allow overwriting this information (and no, spinlock
doesn't protect from that, you can have multiple producers writing to
different parts of ringbuf data area in parallel after "reserving"
their respective records), it completely breaks any sort of
correctness, both for user-space consumer and kernel-side producers.
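
For reference, the 8-byte record header in question is, roughly, the
following structure from kernel/bpf/ringbuf.c (paraphrased from the kernel
source; the top bits of len also carry the BPF_RINGBUF_BUSY_BIT and
BPF_RINGBUF_DISCARD_BIT flags that consumers check):

struct bpf_ringbuf_hdr {
        u32 len;      /* record length, plus busy/discard flag bits */
        u32 pg_off;   /* page offset back to the struct bpf_ringbuf metadata */
};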

> Signed-off-by: Francis Laniel <[email protected]>
> ---
> include/uapi/linux/bpf.h | 3 +++
> kernel/bpf/ringbuf.c | 51 +++++++++++++++++++++++++++++++---------
> 2 files changed, 43 insertions(+), 11 deletions(-)
>

[...]

2022-08-16 10:58:17

by Francis Laniel

Subject: Re: [RFC PATCH v1 1/3] bpf: Make ring buffer overwritable.

Hi.


On Monday, 15 August 2022 at 23:52:22 CEST, Andrii Nakryiko wrote:
> On Wed, Aug 10, 2022 at 10:18 AM Francis Laniel
>
> <[email protected]> wrote:
> > By default, BPF ring buffers are size bounded: once producers have
> > filled the buffer, they need to wait for the consumer to read that data
> > before adding new entries.
> > In terms of API, bpf_ringbuf_reserve() returns NULL if the buffer is full.
> >
> > This patch permits making a BPF ring buffer overwritable.
> > Once producers have written as much data as the buffer size, they begin
> > to overwrite existing data, so the oldest entries are replaced.
> > As a result, bpf_ringbuf_reserve() never returns NULL.
>
> Part of BPF ringbuf record (first 8 bytes) stores information like
> record size and offset in pages to the beginning of ringbuf map
> metadata. This is used by consumer to know how much data belongs to
> data record, but also for making sure that
> bpf_ringbuf_reserve()/bpf_ringbuf_submit() work correctly and don't
> corrupt kernel memory.
>
> If we simply allow overwriting this information (and no, spinlock
> doesn't protect from that, you can have multiple producers writing to
> different parts of ringbuf data area in parallel after "reserving"
> their respective records), it completely breaks any sort of
> correctness, both for user-space consumer and kernel-side producers.

Thank you for your answer.
My current implementation is indeed wrong, as I based it on the incorrect
assumption that BPF ring buffers could only store data of the same size...
With data of different sizes, we can run into the problems you described.

I will rework my patches and send a new version once polished but I
cannot give an ETA.

> > Signed-off-by: Francis Laniel <[email protected]>
> > ---
> >
> > include/uapi/linux/bpf.h | 3 +++
> > kernel/bpf/ringbuf.c | 51 +++++++++++++++++++++++++++++++---------
> > 2 files changed, 43 insertions(+), 11 deletions(-)
>
> [...]


Best regards.


2022-08-16 12:44:11

by Alban Crequy

Subject: Re: [RFC PATCH v1 1/3] bpf: Make ring buffer overwritable.

On Tue, 16 Aug 2022 at 04:22, Andrii Nakryiko <[email protected]> wrote:
>
> On Wed, Aug 10, 2022 at 10:18 AM Francis Laniel
> <[email protected]> wrote:
> >
> > By default, BPF ring buffers are size bounded: once producers have filled the
> > buffer, they need to wait for the consumer to read that data before adding new
> > entries.
> > In terms of API, bpf_ringbuf_reserve() returns NULL if the buffer is full.
> >
> > This patch permits making a BPF ring buffer overwritable.
> > Once producers have written as much data as the buffer size, they begin to
> > overwrite existing data, so the oldest entries are replaced.
> > As a result, bpf_ringbuf_reserve() never returns NULL.
> >
>
> Part of BPF ringbuf record (first 8 bytes) stores information like
> record size and offset in pages to the beginning of ringbuf map
> metadata. This is used by consumer to know how much data belongs to
> data record, but also for making sure that
> bpf_ringbuf_reserve()/bpf_ringbuf_submit() work correctly and don't
> corrupt kernel memory.
>
> If we simply allow overwriting this information (and no, spinlock
> doesn't protect from that, you can have multiple producers writing to
> different parts of ringbuf data area in parallel after "reserving"
> their respective records), it completely breaks any sort of
> correctness, both for user-space consumer and kernel-side producers.

The perf ring buffer solved this issue by adding an option to write
data backward with commit 9ecda41acb97 ("perf/core: Add
::write_backward attribute to perf event"). I'd like to see the BPF
ring buffer have a backward option as well to make overwrites work
without corruption. It's not completely clear to me if that will work
but I'd like to explore this with Francis. (Francis and I work in the
same team and we would like to use this for
https://github.com/kinvolk/traceloop).
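
For context, the perf option mentioned above is the write_backward bit of
struct perf_event_attr; a rough sketch (simplified, error handling omitted,
and not specific to traceloop) of opening a per-CPU buffer in that mode:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_backward_rb(int cpu)
{
        struct perf_event_attr attr = {
                .size = sizeof(attr),
                .type = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_BPF_OUTPUT,
                .sample_type = PERF_SAMPLE_RAW,
                .sample_period = 1,
                .write_backward = 1,  /* kernel overwrites the oldest records */
        };

        /* The mmap'ed ring is then read from the end backwards. */
        return syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu,
                       -1 /* group_fd */, 0 /* flags */);
}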

Best regards,
Alban