The tracing ring-buffers can be stored on disk or sent to network without any
copy via splice. However, the latter doesn't allow real-time processing of the
traces. A solution is to give access to userspace to the ring-buffer pages
directly via a mapping. A piece of software can now become a reader of the
ring-buffer, and drive a consuming or non-consuming read in a similar fashion to
what trace and trace_pipe offer.
Attached to this cover letter is an example of a consuming read of a
ring-buffer, using libtracefs.
Vincent
--
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <signal.h>
#include <errno.h>
#include <unistd.h>
#include <tracefs.h>
#include <kbuffer.h>
#include <event-parse.h>
#include <asm/types.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
/*
 * ioctl issued on the mapped trace_pipe_raw file descriptor to request a new
 * reader page. Same value as the definition the kernel patch adds to
 * include/uapi/linux/trace_mmap.h.
 */
#define TRACE_MMAP_IOCTL_GET_READER_PAGE _IO('T', 0x1)
/*
 * Local copy of the meta-page layout found at offset 0 of the mapping.
 *
 * NOTE(review): entries/overrun are unconditionally __u64 here, whereas the
 * uapi header declares them __u32 when __BITS_PER_LONG != 64, so this copy
 * only matches the kernel layout on 64-bit builds.
 */
struct ring_buffer_meta_page {
__u64 entries; /* event count, mirrored by the writer on each commit */
__u64 overrun; /* overrun count, mirrored by the writer */
__u32 pages_touched; /* pages touched, mirrored by the writer */
__u32 reader_page; /* ID of the page currently owned by the reader */
__u32 nr_data_pages; /* number of data pages, reader page excluded */
__u32 data_page_head; /* index in data_pages[] where the ring starts */
__u32 data_pages[]; /* ordered list of data page IDs */
};
/*
 * Redeclaration of the kbuffer library's private struct, needed so main() can
 * save and restore the read position (curr, index, next, timestamp) across
 * reloads of the same page.
 *
 * NOTE(review): presumably this must stay field-for-field identical to the
 * definition inside the installed library -- verify against that version.
 */
struct kbuffer {
unsigned long long timestamp; /* saved/restored by main() */
long long lost_events;
unsigned long flags;
void *subbuffer;
void *data;
unsigned int index; /* saved/restored by main() */
unsigned int curr; /* saved/restored by main() */
unsigned int next; /* saved/restored by main() */
unsigned int size;
unsigned int start;
unsigned int first;
unsigned int (*read_4)(void *ptr);
unsigned long long (*read_8)(void *ptr);
unsigned long long (*read_long)(struct kbuffer *kbuf, void *ptr);
int (*next_event)(struct kbuffer *kbuf);
};
/* argv[0] as received by main(); source of the program name in error output. */
static char *argv0;
/*
 * Termination request, set by the SIGINT handler and polled by main().
 * volatile sig_atomic_t is the object type the C standard guarantees can be
 * written from a signal handler and read from the interrupted code.
 */
static volatile sig_atomic_t need_exit;
static char *get_this_name(void)
{
static char *this_name;
char *arg;
char *p;
if (this_name)
return this_name;
arg = argv0;
p = arg+strlen(arg);
while (p >= arg && *p != '/')
p--;
p++;
this_name = p;
return p;
}
/*
 * __vdie - print an error message on stderr and terminate the program
 * @fmt: printf-style format of the message
 * @ap:  arguments consumed by @fmt
 * @err: non-zero to report errno through perror() before the message
 *
 * The exit status is the errno value sampled on entry when @err is set and
 * errno is non-zero, and -1 otherwise. This function does not return.
 */
static void __vdie(const char *fmt, va_list ap, int err)
{
	int status = errno;	/* sampled first, before anything can clobber it */
	char *name = get_this_name();

	if (!err || !errno)
		status = -1;
	else
		perror(name);

	fputs(" ", stderr);
	vfprintf(stderr, fmt, ap);
	fputc('\n', stderr);

	exit(status);
}
/*
 * pdie - report a failure, including the current errno, and exit
 * @fmt: printf-style format, followed by its arguments
 *
 * Forwards to __vdie() with errno reporting enabled.
 */
void pdie(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	__vdie(fmt, args, 1);	/* calls exit() */
	va_end(args);		/* kept for form; not reached */
}
static void read_page(struct tep_handle *tep, struct kbuffer *kbuf,
void *data, int page)
{
static struct trace_seq seq;
struct tep_record record;
if (seq.buffer)
trace_seq_reset(&seq);
else
trace_seq_init(&seq);
while ((record.data = kbuffer_read_event(kbuf, &record.ts))) {
kbuffer_next_event(kbuf, NULL);
tep_print_event(tep, &seq, &record,
"%s-%d %9d\t%s\n", TEP_PRINT_COMM,
TEP_PRINT_PID, TEP_PRINT_TIME, TEP_PRINT_NAME);
trace_seq_do_printf(&seq);
trace_seq_reset(&seq);
}
}
/*
 * next_reader_page - ask the kernel for a new reader page
 * @fd:   file descriptor of the mapped trace_pipe_raw file
 * @meta: mapped meta-page
 * @kbuf: unused
 *
 * Issues TRACE_MMAP_IOCTL_GET_READER_PAGE and exits through pdie() if the
 * ioctl fails.
 *
 * Returns the reader page ID read from the meta-page after the ioctl.
 */
static int next_reader_page(int fd, struct ring_buffer_meta_page *meta,
			    struct kbuffer *kbuf)
{
	(void)kbuf;

	if (ioctl(fd, TRACE_MMAP_IOCTL_GET_READER_PAGE) < 0)
		pdie("ioctl");

	return meta->reader_page;
}
/*
 * signal_handler - SIGINT handler: announce the exit and request termination
 * @unused: signal number, ignored
 *
 * Only async-signal-safe operations are used: write(2) instead of printf(3),
 * and errno is preserved for the interrupted code.
 */
static void signal_handler(int unused)
{
	static const char msg[] = "Exit!\n";
	int saved_errno = errno;

	(void)unused;

	if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) < 0) {
		/* Nothing useful can be done about a failed write here. */
	}

	need_exit = true;
	errno = saved_errno;
}
int main(int argc, char **argv)
{
int page_size, data_len, page, fd, start = -1;
struct ring_buffer_meta_page *map;
struct kbuffer *kbuf, prev_kbuf;
struct tep_handle *tep;
__u64 prev_entries;
void *meta, *data;
char *buf, path[32];
int cpu;
argv0 = argv[0];
cpu = atoi(argv[1]);
snprintf(path, 32, "per_cpu/cpu%d/trace_pipe_raw", cpu);
tep = tracefs_local_events(NULL);
kbuf = tep_kbuffer(tep);
page_size = getpagesize();
fd = tracefs_instance_file_open(NULL, path, O_RDONLY);
if (fd < 0)
pdie("raw");
meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
if (meta == MAP_FAILED)
pdie("mmap");
map = meta;
printf("entries: %llu\n", map->entries);
printf("overrun: %llu\n", map->overrun);
printf("pages_touched: %u\n", map->pages_touched);
printf("reader_page: %u\n", map->reader_page);
printf("nr_data_pages: %u\n\n", map->nr_data_pages);
data_len = page_size * (map->nr_data_pages + 1);
data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, fd, page_size);
if (data == MAP_FAILED)
pdie("mmap data");
signal(SIGINT, signal_handler);
page = ((struct ring_buffer_meta_page*)meta)->reader_page;
again:
do {
kbuffer_load_subbuffer(kbuf, data + page_size * page);
if (page != start) {
printf("READER PAGE: %d\n", map->reader_page);
} else {
kbuf->curr = prev_kbuf.curr;
kbuf->index = prev_kbuf.index;
kbuf->next = prev_kbuf.next;
kbuf->timestamp = prev_kbuf.timestamp;
kbuffer_next_event(kbuf, NULL);
}
prev_entries = map->entries;
start = page;
read_page(tep, kbuf, data, page);
} while ((page = next_reader_page(fd, meta, kbuf)) != start);
prev_kbuf.curr = kbuf->curr;
prev_kbuf.index = kbuf->index;
prev_kbuf.next = kbuf->next;
prev_kbuf.timestamp = kbuf->timestamp;
while (prev_entries == *(volatile __u64 *)&map->entries && !need_exit)
usleep(100000);
if (!need_exit)
goto again;
munmap(data, data_len);
munmap(meta, page_size);
close(fd);
return 0;
}
Vincent Donnefort (2):
ring-buffer: Introducing ring-buffer mapping functions
tracing: Allow user-space mapping of the ring-buffer
include/linux/ring_buffer.h | 8 +
include/uapi/linux/trace_mmap.h | 27 +++
kernel/trace/ring_buffer.c | 334 +++++++++++++++++++++++++++++++-
kernel/trace/trace.c | 73 ++++++-
4 files changed, 436 insertions(+), 6 deletions(-)
create mode 100644 include/uapi/linux/trace_mmap.h
--
2.40.0.rc1.284.g88254d51c5-goog
In preparation for allowing the user-space to map a ring-buffer, add
a set of mapping functions:
ring_buffer_{map,unmap}()
ring_buffer_map_fault()
And controls on the ring-buffer:
ring_buffer_get_reader_page() /* swap reader and head */
ring_buffer_update_meta_page()
Mapping the ring-buffer also involves:
A unique ID for each page of the ring-buffer, as currently the pages
are only identified through their in-kernel VA.
A meta-page, which stores statistics about the ring-buffer and an
ordered list of page IDs. One field identifies the reader page and
another gives where the ring-buffer starts in the list of data
pages.
The linear mapping exposes the meta-page, and each page of the
ring-buffer, ordered following their unique ID, assigned during the
first mapping.
Once mapped, no page can get in or out of the ring-buffer: the buffer
size will remain unmodified and the splice enabling functions will in
reality simply memcpy the data instead of swapping pages.
Also, the meta-page being... a single page, this limits at the moment the
number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
system.
Signed-off-by: Vincent Donnefort <[email protected]>
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 782e14f62201..4897e17ebdde 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -6,6 +6,8 @@
#include <linux/seq_file.h>
#include <linux/poll.h>
+#include <uapi/linux/trace_mmap.h>
+
struct trace_buffer;
struct ring_buffer_iter;
@@ -211,4 +213,10 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
#define trace_rb_cpu_prepare NULL
#endif
+int ring_buffer_map(struct trace_buffer *buffer, int cpu);
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
+struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
+ unsigned long pgoff);
+int ring_buffer_get_reader_page(struct trace_buffer *buffer, int cpu);
+int ring_buffer_update_meta_page(struct trace_buffer *buffer, int cpu);
#endif /* _LINUX_RING_BUFFER_H */
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
new file mode 100644
index 000000000000..b5caed17a066
--- /dev/null
+++ b/include/uapi/linux/trace_mmap.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_TRACE_MMAP_H_
+#define _UAPI_TRACE_MMAP_H_
+
+#include <asm/bitsperlong.h>
+
+#include <linux/types.h>
+
+struct ring_buffer_meta_page {
+#if __BITS_PER_LONG == 64
+ __u64 entries;
+ __u64 overrun;
+#else
+ __u32 entries;
+ __u32 overrun;
+#endif
+ __u32 pages_touched;
+ __u32 reader_page;
+ __u32 nr_data_pages; /* doesn't take into account the reader_page */
+ __u32 data_page_head; /* index of data_pages[] */
+ __u32 data_pages[];
+};
+
+#endif /* _UAPI_TRACE_MMAP_H_ */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index af50d931b020..08765310380b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -332,6 +332,7 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
struct buffer_data_page *page; /* Actual data page */
+ u32 id; /* ID for external mapping */
};
/*
@@ -529,6 +530,12 @@ struct ring_buffer_per_cpu {
rb_time_t before_stamp;
u64 event_stamp[MAX_NEST];
u64 read_stamp;
+
+ int mapped;
+ struct mutex mapping_lock;
+ unsigned long *page_ids; /* ID to addr */
+ struct ring_buffer_meta_page *meta_page;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -1452,12 +1459,37 @@ static inline void rb_inc_page(struct buffer_page **bpage)
*bpage = list_entry(p, struct buffer_page, list);
}
+static inline void
+rb_meta_page_head_move(struct ring_buffer_per_cpu *cpu_buffer, unsigned long num)
+{
+ unsigned long head_id;
+
+ if (!READ_ONCE(cpu_buffer->mapped))
+ return;
+
+ head_id = cpu_buffer->meta_page->data_page_head;
+ cpu_buffer->meta_page->data_page_head = (head_id + num) % cpu_buffer->nr_pages;
+}
+
+static inline void
+rb_meta_page_head_swap(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
+
+ if (!READ_ONCE(cpu_buffer->mapped))
+ return;
+
+ meta->reader_page = cpu_buffer->head_page->id;
+ meta->data_pages[meta->data_page_head] = cpu_buffer->reader_page->id;
+}
+
static struct buffer_page *
rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *head;
struct buffer_page *page;
struct list_head *list;
+ unsigned long cnt = 0;
int i;
if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
@@ -1479,9 +1511,12 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
do {
if (rb_is_head_page(page, page->list.prev)) {
cpu_buffer->head_page = page;
+ rb_meta_page_head_move(cpu_buffer, cnt);
+
return page;
}
rb_inc_page(&page);
+ cnt++;
} while (page != head);
}
@@ -1567,6 +1602,13 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
/* Again, either we update tail_page or an interrupt does */
(void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
}
+
+ if (READ_ONCE(cpu_buffer->mapped)) {
+ /* Ensure the meta_page is ready */
+ smp_rmb();
+ WRITE_ONCE(cpu_buffer->meta_page->pages_touched,
+ local_read(&cpu_buffer->pages_touched));
+ }
}
static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1735,6 +1777,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+ mutex_init(&cpu_buffer->mapping_lock);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -2173,7 +2216,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
-
if (cpu_id == RING_BUFFER_ALL_CPUS) {
/*
* Don't succeed if resizing is disabled, as a reader might be
@@ -2523,6 +2565,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
+ if (READ_ONCE(cpu_buffer->mapped)) {
+ /* Ensure the meta_page is ready */
+ smp_rmb();
+ WRITE_ONCE(cpu_buffer->meta_page->overrun,
+ local_read(&cpu_buffer->overrun));
+ }
+
/*
* The entries will be zeroed out when we move the
* tail page.
@@ -3179,6 +3228,14 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->entries);
+
+ if (READ_ONCE(cpu_buffer->mapped)) {
+ /* Ensure the meta_page is ready */
+ smp_rmb();
+ WRITE_ONCE(cpu_buffer->meta_page->entries,
+ local_read(&cpu_buffer->entries));
+ }
+
rb_end_commit(cpu_buffer);
}
@@ -3482,7 +3539,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
return;
/*
- * If this interrupted another event,
+ * If this interrupted another event,
*/
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
@@ -4643,7 +4700,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* Now make the new head point back to the reader page.
*/
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
+ rb_meta_page_head_swap(cpu_buffer);
rb_inc_page(&cpu_buffer->head_page);
+ rb_meta_page_head_move(cpu_buffer, 1);
local_inc(&cpu_buffer->pages_read);
@@ -5285,6 +5344,12 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
+ if (READ_ONCE(cpu_buffer->mapped)) {
+ WRITE_ONCE(cpu_buffer->meta_page->entries, 0);
+ WRITE_ONCE(cpu_buffer->meta_page->pages_touched, 0);
+ WRITE_ONCE(cpu_buffer->meta_page->overrun, 0);
+ }
+
rb_head_page_activate(cpu_buffer);
}
@@ -5489,6 +5554,11 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
+ if (READ_ONCE(cpu_buffer_a->mapped) || READ_ONCE(cpu_buffer_b->mapped)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
@@ -5722,7 +5792,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* Otherwise, we can simply swap the page with the one passed in.
*/
if (read || (len < (commit - read)) ||
- cpu_buffer->reader_page == cpu_buffer->commit_page) {
+ cpu_buffer->reader_page == cpu_buffer->commit_page ||
+ READ_ONCE(cpu_buffer->mapped)) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -5839,6 +5910,263 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+#define META_PAGE_MAX_PAGES \
+ ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+
+static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ kfree(cpu_buffer->page_ids);
+ cpu_buffer->page_ids = NULL;
+}
+
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ if (cpu_buffer->meta_page)
+ return 0;
+
+ if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
+ return -E2BIG;
+
+ cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
+ if (!cpu_buffer->meta_page)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ free_page((unsigned long)cpu_buffer->meta_page);
+ cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long *page_ids)
+{
+ struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
+ struct buffer_page *first_page, *bpage;
+ int id = 0;
+
+ page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
+ cpu_buffer->reader_page->id = id++;
+
+ first_page = bpage = rb_set_head_page(cpu_buffer);
+ do {
+ if (id > META_PAGE_MAX_PAGES) {
+ WARN_ON(1);
+ break;
+ }
+
+ page_ids[id] = (unsigned long)bpage->page;
+ bpage->id = id;
+ meta->data_pages[id - 1] = id;
+
+ rb_inc_page(&bpage);
+ id++;
+ } while (bpage != first_page);
+
+ /* install page ID to kern VA translation */
+ cpu_buffer->page_ids = page_ids;
+
+ meta->entries = 0;
+ meta->overrun = 0;
+ meta->pages_touched = 0;
+ meta->reader_page = cpu_buffer->reader_page->id;
+ meta->nr_data_pages = cpu_buffer->nr_pages;
+ meta->data_page_head = 0;
+}
+
+static inline struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-EINVAL);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->mapped) {
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return ERR_PTR(-ENODEV);
+ }
+
+ return cpu_buffer;
+}
+
+static inline void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags, *page_ids;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->mapped) {
+ WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped + 1);
+ goto unlock;
+ }
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&cpu_buffer->resize_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ err = rb_alloc_meta_page(cpu_buffer);
+ if (err) {
+ atomic_dec(&cpu_buffer->resize_disabled);
+ goto unlock;
+ }
+
+ /* page_ids include the reader page while nr_pages does not */
+ page_ids = kzalloc(sizeof(*page_ids) * (cpu_buffer->nr_pages + 1),
+ GFP_KERNEL);
+ if (!page_ids) {
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ /*
+ * Lock all readers to block any page swap until the page IDs are
+ * assigned.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ rb_setup_ids_meta_page(cpu_buffer, page_ids);
+ /*
+ * Ensure the writer will observe the meta-page before
+ * cpu_buffer->mapped.
+ */
+ smp_wmb();
+ WRITE_ONCE(cpu_buffer->mapped, 1);
+
+ /* Init meta_page values unless the writer did it already */
+ cmpxchg(&cpu_buffer->meta_page->entries, 0,
+ local_read(&cpu_buffer->entries));
+ cmpxchg(&cpu_buffer->meta_page->overrun, 0,
+ local_read(&cpu_buffer->overrun));
+ cmpxchg(&cpu_buffer->meta_page->pages_touched, 0,
+ local_read(&cpu_buffer->pages_touched));
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+unlock:
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->mapped) {
+ err = -ENODEV;
+ goto unlock;
+ }
+
+ WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped - 1);
+ if (!cpu_buffer->mapped) {
+ /* Wait the writer and readers to observe !mapped */
+ synchronize_rcu();
+
+ rb_free_page_ids(cpu_buffer);
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
+
+unlock:
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+/*
+ * +--------------+
+ * | meta page | pgoff=0
+ * +--------------+
+ * | data page1 | pgoff=1 page_ids=0
+ * +--------------+
+ * | data page2 | pgoff=2 page_ids=1
+ * ...
+ */
+struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
+ unsigned long pgoff)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+
+ if (!pgoff)
+ return virt_to_page(cpu_buffer->meta_page);
+
+ pgoff--;
+ if (pgoff > cpu_buffer->nr_pages)
+ return NULL;
+
+ return virt_to_page(cpu_buffer->page_ids[pgoff]);
+}
+
+int ring_buffer_get_reader_page(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *reader;
+ unsigned long flags;
+
+ cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+ if (IS_ERR(cpu_buffer))
+ return (int)PTR_ERR(cpu_buffer);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ reader = cpu_buffer->reader_page;
+ reader->read = rb_page_size(reader);
+ if (!rb_per_cpu_empty(cpu_buffer))
+ WARN_ON(!rb_get_reader_page(cpu_buffer));
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ rb_put_mapped_buffer(cpu_buffer);
+
+ return 0;
+}
+
+int ring_buffer_update_meta_page(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+ if (IS_ERR(cpu_buffer))
+ return PTR_ERR(cpu_buffer);
+
+ /* Update the head page if the writer moved it */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_set_head_page(cpu_buffer);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ rb_put_mapped_buffer(cpu_buffer);
+
+ return 0;
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
--
2.40.0.rc1.284.g88254d51c5-goog
Currently, user-space extracts data from the ring-buffer via splice,
which is handy for storage or network sharing. However, due to splice
limitations, it is impossible to do real-time analysis without a copy.
A solution for that problem is to let the user-space map the ring-buffer
directly.
The mapping is exposed via the per-CPU file trace_pipe_raw. The first page
is the meta-page and is followed by each page of the ring-buffer,
ordered by their unique page ID. It is therefore easy to translate a
page-ID to an offset in the mapping.
* Meta-page -- include/uapi/linux/trace_mmap.h for a description
* Page ID 0
* Page ID 1
...
The mapper must then do what used to be the kernel's job: swap the reader
with the head. This is done with a newly introduced ioctl:
TRACE_MMAP_IOCTL_GET_READER_PAGE.
Entries, pages_touched and overrun fields are automatically updated by
the writer. Only readers keep the head page field updated. An additional
ioctl TRACE_MMAP_IOCTL_UPDATE_META_PAGE allows to query that update,
enabling non-consuming read from userspace.
Signed-off-by: Vincent Donnefort <[email protected]>
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index b5caed17a066..24bcec754a35 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -21,4 +21,7 @@ struct ring_buffer_meta_page {
__u32 data_pages[];
};
+#define TRACE_MMAP_IOCTL_GET_READER_PAGE _IO('T', 0x1)
+#define TRACE_MMAP_IOCTL_UPDATE_META_PAGE _IO('T', 0x2)
+
#endif /* _UAPI_TRACE_MMAP_H_ */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45551c7b4c36..51d06a2a7545 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6399,7 +6399,7 @@ static void tracing_set_nop(struct trace_array *tr)
{
if (tr->current_trace == &nop_trace)
return;
-
+
tr->current_trace->enabled--;
if (tr->current_trace->reset)
@@ -8432,15 +8432,27 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return ret;
}
-/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
+ switch (cmd) {
+ case TRACE_MMAP_IOCTL_GET_READER_PAGE:
+ return ring_buffer_get_reader_page(iter->array_buffer->buffer,
+ iter->cpu_file);
+ case TRACE_MMAP_IOCTL_UPDATE_META_PAGE:
+ return ring_buffer_update_meta_page(iter->array_buffer->buffer,
+ iter->cpu_file);
+ }
+
if (cmd)
- return -ENOIOCTLCMD;
+ return -ENOTTY;
+ /*
+ * An ioctl call with cmd 0 to the ring buffer file will wake up all
+ * waiters
+ */
mutex_lock(&trace_types_lock);
iter->wait_index++;
@@ -8453,6 +8465,60 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0;
}
+static vm_fault_t tracing_buffers_mmap_fault(struct vm_fault *vmf)
+{
+ struct ftrace_buffer_info *info = vmf->vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
+ struct page *page;
+
+ page = ring_buffer_map_fault(iter->array_buffer->buffer, iter->cpu_file,
+ vmf->pgoff);
+ if (!page)
+ return ret;
+
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file);
+}
+
+static void tracing_buffers_mmap_open(struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ WARN_ON(ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file));
+}
+
+static const struct vm_operations_struct tracing_buffers_vmops = {
+ .open = tracing_buffers_mmap_open,
+ .close = tracing_buffers_mmap_close,
+ .fault = tracing_buffers_mmap_fault,
+};
+
+static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ if (vma->vm_flags & VM_WRITE)
+ return -EPERM;
+
+ vm_flags_mod(vma, VM_DONTCOPY | VM_DONTDUMP, VM_MAYWRITE);
+ vma->vm_ops = &tracing_buffers_vmops;
+
+ return ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file);
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
@@ -8461,6 +8527,7 @@ static const struct file_operations tracing_buffers_fops = {
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
+ .mmap = tracing_buffers_mmap,
};
static ssize_t
--
2.40.0.rc1.284.g88254d51c5-goog
On Fri, 17 Mar 2023 14:33:09 +0000
Vincent Donnefort <[email protected]> wrote:
> Also, the meta-page being... a single page, this limits at the moment the
> number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> system.
I hate this limitation, so I fixed it ;-)
I added a meta_page_size field to the meta page, and user space can do:
meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
if (meta == MAP_FAILED)
pdie("mmap");
map = meta;
meta_len = map->meta_page_size;
if (meta_len > page_size) {
munmap(meta, page_size);
meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
if (meta == MAP_FAILED)
pdie("mmap");
map = meta;
}
This appears to work (but I'm still testing it).
-- Steve
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index 24bcec754a35..12f3f7ee33d9 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
__u32 reader_page;
__u32 nr_data_pages; /* doesn't take into account the reader_page */
__u32 data_page_head; /* index of data_pages[] */
+ __u32 meta_page_size; /* size of the meta page */
__u32 data_pages[];
};
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 10a17e78cfe6..77c92e4a7adc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
u64 read_stamp;
int mapped;
+ int meta_order;
struct mutex mapping_lock;
unsigned long *page_ids; /* ID to addr */
struct ring_buffer_meta_page *meta_page;
@@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
#define META_PAGE_MAX_PAGES \
- ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+ ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -5908,22 +5909,34 @@ static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
+ struct page *meta_pages;
+ int pages;
+ int order = 0;
+
if (cpu_buffer->meta_page)
return 0;
- if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
- return -E2BIG;
-
- cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
- if (!cpu_buffer->meta_page)
+ if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES) {
+ /* Calculate how many more pages we need to hold indexes */
+ pages = DIV_ROUND_UP(cpu_buffer->nr_pages - META_PAGE_MAX_PAGES,
+ PAGE_SIZE / sizeof(u32));
+ /* Add back the meta_page itself */
+ pages++;
+ order = fls(pages - 1); /* round up: fls(pages) - 1 rounds down (3 pages -> order 1) */
+ }
+ meta_pages = alloc_pages(GFP_USER | __GFP_ZERO | __GFP_COMP, order); /* mapped to user: zeroed, refcounted as one unit */
+ if (!meta_pages)
return -ENOMEM;
+ cpu_buffer->meta_page = page_to_virt(meta_pages);
+ cpu_buffer->meta_order = order;
+
return 0;
}
static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
- free_page((unsigned long)cpu_buffer->meta_page);
+ free_pages((unsigned long)cpu_buffer->meta_page, cpu_buffer->meta_order);
cpu_buffer->meta_page = NULL;
}
@@ -5932,14 +5945,20 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
{
struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
struct buffer_page *first_page, *bpage;
+ int data_page_end;
int id = 0;
page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
cpu_buffer->reader_page->id = id++;
+ /* Calculate the last index of data_pages[] */
+ data_page_end = (1 << (cpu_buffer->meta_order + PAGE_SHIFT)) -
+ offsetof(struct ring_buffer_meta_page, data_pages);
+ data_page_end /= sizeof(u32);
+
first_page = bpage = rb_set_head_page(cpu_buffer);
do {
- if (id > META_PAGE_MAX_PAGES) {
+ if (id > data_page_end) {
WARN_ON(1);
break;
}
@@ -5960,6 +5979,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
meta->pages_touched = 0;
meta->reader_page = cpu_buffer->reader_page->id;
meta->nr_data_pages = cpu_buffer->nr_pages;
+ meta->meta_page_size = 1 << (cpu_buffer->meta_order + PAGE_SHIFT);
meta->data_page_head = 0;
}
@@ -6092,10 +6112,12 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
/*
* +--------------+
* | meta page | pgoff=0
+ * | ... |
+ * | | pgoff=(1<<cpu_buffer->meta_order) - 1
* +--------------+
- * | data page1 | pgoff=1 page_ids=0
+ * | data page1 | page_ids=0
* +--------------+
- * | data page2 | pgoff=2 page_ids=1
+ * | data page2 | page_ids=1
* ...
*/
struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
@@ -6103,10 +6125,11 @@ struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- if (!pgoff)
- return virt_to_page(cpu_buffer->meta_page);
+ if (pgoff < (1 << cpu_buffer->meta_order)) /* meta pages are pgoff 0 .. (1 << order) - 1 */
+ return virt_to_page((void *)cpu_buffer->meta_page + (pgoff << PAGE_SHIFT));
+
+ pgoff -= (1 << cpu_buffer->meta_order);
- pgoff--;
if (pgoff > cpu_buffer->nr_pages)
return NULL;
On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> On Fri, 17 Mar 2023 14:33:09 +0000
> Vincent Donnefort <[email protected]> wrote:
>
> > Also, the meta-page being... a single page, this limits at the moment the
> > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > system.
>
> I hate this limitation, so I fixed it ;-)
Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
>
> I added a meta_page_size field to the meta page, and user space can do:
>
> meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> if (meta == MAP_FAILED)
> pdie("mmap");
>
> map = meta;
> meta_len = map->meta_page_size;
>
> if (meta_len > page_size) {
> munmap(meta, page_size);
> meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> if (meta == MAP_FAILED)
> pdie("mmap");
> map = meta;
> }
>
> This appears to work (but I'm still testing it).
>
> -- Steve
>
> diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> index 24bcec754a35..12f3f7ee33d9 100644
> --- a/include/uapi/linux/trace_mmap.h
> +++ b/include/uapi/linux/trace_mmap.h
> @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
> __u32 reader_page;
> __u32 nr_data_pages; /* doesn't take into account the reader_page */
> __u32 data_page_head; /* index of data_pages[] */
> + __u32 meta_page_size; /* size of the meta page */
Do we want a specific field here? Couldn't that be deduced quite easily from
nr_data_pages?
> __u32 data_pages[];
> };
>
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 10a17e78cfe6..77c92e4a7adc 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
> u64 read_stamp;
>
> int mapped;
> + int meta_order;
> struct mutex mapping_lock;
> unsigned long *page_ids; /* ID to addr */
> struct ring_buffer_meta_page *meta_page;
> @@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> EXPORT_SYMBOL_GPL(ring_buffer_read_page);
>
> #define META_PAGE_MAX_PAGES \
> - ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
> + ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
>
[...]
On Tue, 21 Mar 2023 15:17:15 +0000
Vincent Donnefort <[email protected]> wrote:
> On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> > On Fri, 17 Mar 2023 14:33:09 +0000
> > Vincent Donnefort <[email protected]> wrote:
> >
> > > Also, the meta-page being... a single page, this limits at the moment the
> > > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > > system.
> >
> > I hate this limitation, so I fixed it ;-)
>
> Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
Hold off, I found some bugs that I'm fixing ;-)
>
> >
> > I added a meta_page_size field to the meta page, and user space can do:
> >
> > meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > if (meta == MAP_FAILED)
> > pdie("mmap");
> >
> > map = meta;
> > meta_len = map->meta_page_size;
> >
> > if (meta_len > page_size) {
> > munmap(meta, page_size);
> > meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > if (meta == MAP_FAILED)
> > pdie("mmap");
> > map = meta;
> > }
> >
> > This appears to work (but I'm still testing it).
> >
> > -- Steve
> >
> > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> > index 24bcec754a35..12f3f7ee33d9 100644
> > --- a/include/uapi/linux/trace_mmap.h
> > +++ b/include/uapi/linux/trace_mmap.h
> > @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
> > __u32 reader_page;
> > __u32 nr_data_pages; /* doesn't take into account the reader_page */
> > __u32 data_page_head; /* index of data_pages[] */
> > + __u32 meta_page_size; /* size of the meta page */
>
> Do we want a specific field here? That could be deduced from nr_data_pages()
> quite easily?
I'd rather not have too much implementation detail knowledge in user space.
It only removes a single entry, and it makes user space easier. In fact,
I'm thinking we should not include "__u32 data_pages[]" but instead add a:
"__u32 data_start" where user space does:
__u32 *data_pages = (__u32 *)meta_page + meta_page->data_start;
That way we could extend the data provided by the meta_page in the future.
-- Steve
>
>
> > __u32 data_pages[];
> > };
> >
> > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > index 10a17e78cfe6..77c92e4a7adc 100644
> > --- a/kernel/trace/ring_buffer.c
> > +++ b/kernel/trace/ring_buffer.c
> > @@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
> > u64 read_stamp;
> >
> > int mapped;
> > + int meta_order;
> > struct mutex mapping_lock;
> > unsigned long *page_ids; /* ID to addr */
> > struct ring_buffer_meta_page *meta_page;
> > @@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> > EXPORT_SYMBOL_GPL(ring_buffer_read_page);
> >
> > #define META_PAGE_MAX_PAGES \
> > - ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
> > + ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
> >
>
> [...]
On Tue, Mar 21, 2023 at 11:40:47AM -0400, Steven Rostedt wrote:
> On Tue, 21 Mar 2023 15:17:15 +0000
> Vincent Donnefort <[email protected]> wrote:
>
> > On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> > > On Fri, 17 Mar 2023 14:33:09 +0000
> > > Vincent Donnefort <[email protected]> wrote:
> > >
> > > > Also, the meta-page being... a single page, this limits at the moment the
> > > > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > > > system.
> > >
> > > I hate this limitation, so I fixed it ;-)
> >
> > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
>
> Hold off, I found some bugs that I'm fixing ;-)
>
> >
> > >
> > > I added a meta_page_size field to the meta page, and user space can do:
> > >
> > > meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > > if (meta == MAP_FAILED)
> > > pdie("mmap");
> > >
> > > map = meta;
> > > meta_len = map->meta_page_size;
> > >
> > > if (meta_len > page_size) {
> > > munmap(meta, page_size);
> > > meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > > if (meta == MAP_FAILED)
> > > pdie("mmap");
> > > map = meta;
> > > }
> > >
> > > This appears to work (but I'm still testing it).
> > >
> > > -- Steve
> > >
> > > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> > > index 24bcec754a35..12f3f7ee33d9 100644
> > > --- a/include/uapi/linux/trace_mmap.h
> > > +++ b/include/uapi/linux/trace_mmap.h
> > > @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
> > > __u32 reader_page;
> > > __u32 nr_data_pages; /* doesn't take into account the reader_page */
> > > __u32 data_page_head; /* index of data_pages[] */
> > > + __u32 meta_page_size; /* size of the meta page */
> >
> > Do we want a specific field here? That could be deduced from nr_data_pages()
> > quite easily?
>
> I rather not have too much implementation detail knowledge in user space.
> It only removes a single entry, and it makes user space easier. In fact,
Ack.
> I'm thinking we should not include "__u32 data_pages[]" but instead add a:
> "__u32 data_start" where user space does:
>
> __u32 *data_pages = (_u32 *)meta_page + meta_page->data_start;
>
> That way we could extend the data provided by the meta_page in the future.
That'd be nice. Couldn't we keep both to simplify the code for the kernel side?
>
> -- Steve
>
On Tue, 21 Mar 2023 11:40:47 -0400
Steven Rostedt <[email protected]> wrote:
> >
> > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
>
> Hold off, I found some bugs that I'm fixing ;-)
OK, you can fold this in. I also fixed an issue with your patch where it
was missing setting a page->mapping and also clearing it.
I haven't yet replaced "__u32 data_pages[]" with a "__u32 data_start",
but I think that should still be done.
-- Steve
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index 24bcec754a35..12f3f7ee33d9 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
__u32 reader_page;
__u32 nr_data_pages; /* doesn't take into account the reader_page */
__u32 data_page_head; /* index of data_pages[] */
+ __u32 meta_page_size; /* size of the meta page */
__u32 data_pages[];
};
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 10a17e78cfe6..d546fdd14fc3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
u64 read_stamp;
int mapped;
+ int meta_order;
struct mutex mapping_lock;
unsigned long *page_ids; /* ID to addr */
struct ring_buffer_meta_page *meta_page;
@@ -5898,32 +5899,63 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
#define META_PAGE_MAX_PAGES \
- ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+ ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
+
+static void unmap_page(unsigned long addr)
+{
+ struct page *page = virt_to_page(addr);
+
+ page->mapping = NULL;
+}
static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
{
+ int i;
+
+ for (i = 0; i < cpu_buffer->nr_pages; i++)
+ unmap_page(cpu_buffer->page_ids[i]);
+
kfree(cpu_buffer->page_ids);
cpu_buffer->page_ids = NULL;
}
static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
+ struct page *meta_pages;
+ int pages;
+ int order = 0;
+
if (cpu_buffer->meta_page)
return 0;
- if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
- return -E2BIG;
-
- cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
- if (!cpu_buffer->meta_page)
+ if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES) {
+ /* Calculate how many more pages we need to hold indexes */
+ pages = DIV_ROUND_UP(cpu_buffer->nr_pages - META_PAGE_MAX_PAGES,
+ PAGE_SIZE / sizeof(u32));
+ /* Add back the meta_page itself */
+ pages++;
+ order = fls(pages) - 1;
+ }
+ meta_pages = alloc_pages(GFP_USER, order);
+ if (!meta_pages)
return -ENOMEM;
+ cpu_buffer->meta_page = page_to_virt(meta_pages);
+ cpu_buffer->meta_order = order;
+
return 0;
}
static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
- free_page((unsigned long)cpu_buffer->meta_page);
+ unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+ int i;
+
+ for (i = 0; i < (1 << cpu_buffer->meta_order); i++) {
+ unmap_page(addr);
+ addr += PAGE_SIZE;
+ }
+ free_pages((unsigned long)cpu_buffer->meta_page, cpu_buffer->meta_order);
cpu_buffer->meta_page = NULL;
}
@@ -5932,14 +5964,20 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
{
struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
struct buffer_page *first_page, *bpage;
+ int data_page_end;
int id = 0;
page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
cpu_buffer->reader_page->id = id++;
+ /* Calculate the last index of data_pages[] */
+ data_page_end = (1 << (cpu_buffer->meta_order + PAGE_SHIFT)) -
+ offsetof(struct ring_buffer_meta_page, data_pages);
+ data_page_end /= sizeof(u32);
+
first_page = bpage = rb_set_head_page(cpu_buffer);
do {
- if (id > META_PAGE_MAX_PAGES) {
+ if (id > data_page_end) {
WARN_ON(1);
break;
}
@@ -5960,6 +5998,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
meta->pages_touched = 0;
meta->reader_page = cpu_buffer->reader_page->id;
meta->nr_data_pages = cpu_buffer->nr_pages;
+ meta->meta_page_size = 1 << (cpu_buffer->meta_order + PAGE_SHIFT);
meta->data_page_head = 0;
}
@@ -6092,10 +6131,12 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
/*
* +--------------+
* | meta page | pgoff=0
+ * | ... |
+ * | | pgoff=((1 << cpu_buffer->meta_order) - 1)
* +--------------+
- * | data page1 | pgoff=1 page_ids=0
+ * | data page1 | page_ids=0
* +--------------+
- * | data page2 | pgoff=2 page_ids=1
+ * | data page2 | page_ids=1
* ...
*/
struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
@@ -6103,10 +6144,11 @@ struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- if (!pgoff)
- return virt_to_page(cpu_buffer->meta_page);
+ if (pgoff < (1 << cpu_buffer->meta_order))
+ return virt_to_page((void *)cpu_buffer->meta_page + (pgoff << PAGE_SHIFT));
+
+ pgoff -= (1 << cpu_buffer->meta_order);
- pgoff--;
if (pgoff > cpu_buffer->nr_pages)
return NULL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ea48eabce7b7..2f43e4a842e7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8479,9 +8479,12 @@ static vm_fault_t tracing_buffers_mmap_fault(struct vm_fault *vmf)
if (!page)
return ret;
- get_page(page);
vmf->page = page;
+ get_page(vmf->page);
+ vmf->page->mapping = vmf->vma->vm_file->f_mapping;
+ vmf->page->index = vmf->pgoff;
+
return 0;
}
On Tue, Mar 21, 2023 at 12:44:25PM -0400, Steven Rostedt wrote:
> On Tue, 21 Mar 2023 11:40:47 -0400
> Steven Rostedt <[email protected]> wrote:
>
> > >
> > > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
> >
> > Hold off, I found some bugs that I'm fixing ;-)
>
> OK, you can fold this in. I also fixed an issue with your patch where it
> was missing setting a page->mapping and also clearing it.
>
> I haven't updated to replace "__u32 *data_pages[]" with an "__u32 data_start"
> But I think that should still be done.
>
> -- Steve
>
[...]
Thanks! I'll prepare a v2 with all that!
On Tue, 21 Mar 2023 16:20:42 +0000
Vincent Donnefort <[email protected]> wrote:
> > > Do we want a specific field here? That could be deduced from nr_data_pages()
> > > quite easily?
> >
> > I rather not have too much implementation detail knowledge in user space.
> > It only removes a single entry, and it makes user space easier. In fact,
>
> Ack.
>
> > I'm thinking we should not include "__u32 data_pages[]" but instead add a:
> > "__u32 data_start" where user space does:
> >
> > __u32 *data_pages = (_u32 *)meta_page + meta_page->data_start;
> >
> > That way we could extend the data provided by the meta_page in the future.
>
> That'd be nice. Couldn't we keep both to simplify the code for the kernel side?
I would not expose the data_pages[] to user space, because then they'll use
it, and that *will* become an API.
But we could expose it to the kernel side with:
include/uapi/linux/trace_mmap.h:
struct ring_buffer_meta_page {
#if __BITS_PER_LONG == 64
__u64 entries;
__u64 overrun;
#else
__u32 entries;
__u32 overrun;
#endif
__u32 pages_touched;
__u32 reader_page;
__u32 nr_data_pages; /* doesn't take into account the reader_page */
__u32 data_page_head; /* index of data_pages[] */
__u32 meta_page_size; /* size of the meta page */
__u32 data_start; /* offset to where data_pages are */
};
kernel/trace/ring_buffer.c:
struct ring_buffer_meta {
struct ring_buffer_meta_page meta;
u32 data_pages[];
};
Then we can start each function with:
struct ring_buffer_meta_page *meta = &cpu_buffer->meta_page->meta;
u32 *data_pages = cpu_buffer->meta_page->data_pages;
-- Steve