2019-06-07 16:25:56

by John Ogness

Subject: [RFC PATCH v2 0/2] printk: new ringbuffer implementation

Hello,

This is a follow-up RFC on the work to reimplement much of
the core of printk. The original thread can be seen here[0].

One of the results of that thread was that the work needs to
be broken up into several pieces. A roadmap was laid out[1]
and this RFC is for the base component of the first piece:
a new ringbuffer implementation for printk.

This series does not touch any existing printk code. It is
only the ringbuffer implementation. I am particularly
interested in feedback relating to the design of the
ringbuffer and the use of memory barriers.

The series also includes a test module that performs some
heavy writer stress testing. I have successfully run these
tests on a 16-core ARM64 platform.

John Ogness

[0] https://lkml.kernel.org/r/[email protected]
[1] https://lkml.kernel.org/r/[email protected]

John Ogness (2):
printk-rb: add a new printk ringbuffer implementation
printk-rb: add test module

Documentation/core-api/index.rst | 1 +
Documentation/core-api/printk-ringbuffer.rst | 104 +++
include/linux/printk_ringbuffer.h | 238 +++++++
lib/Makefile | 2 +
lib/printk_ringbuffer.c | 924 +++++++++++++++++++++++++++
lib/test_prb.c | 237 +++++++
6 files changed, 1506 insertions(+)
create mode 100644 Documentation/core-api/printk-ringbuffer.rst
create mode 100644 include/linux/printk_ringbuffer.h
create mode 100644 lib/printk_ringbuffer.c
create mode 100644 lib/test_prb.c

--
2.11.0


2019-06-07 16:26:04

by John Ogness

Subject: [RFC PATCH v2 2/2] printk-rb: add test module

This module does some heavy write stress testing on the ringbuffer
with a reader that is checking for integrity.

Signed-off-by: John Ogness <[email protected]>
---
lib/Makefile | 2 +
lib/test_prb.c | 237 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 239 insertions(+)
create mode 100644 lib/test_prb.c

diff --git a/lib/Makefile b/lib/Makefile
index fb7697031a79..9a485274b6ba 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -292,3 +292,5 @@ obj-$(CONFIG_GENERIC_LIB_MULDI3) += muldi3.o
obj-$(CONFIG_GENERIC_LIB_CMPDI2) += cmpdi2.o
obj-$(CONFIG_GENERIC_LIB_UCMPDI2) += ucmpdi2.o
obj-$(CONFIG_OBJAGG) += objagg.o
+
+obj-m += test_prb.o
diff --git a/lib/test_prb.c b/lib/test_prb.c
new file mode 100644
index 000000000000..2c365028f4e4
--- /dev/null
+++ b/lib/test_prb.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/printk_ringbuffer.h>
+
+/*
+ * This is a test module that starts "num_online_cpus() - 1" writer threads
+ * and 1 reader thread. The writer threads each write strings of varying
+ * length. They do this as fast as they can.
+ *
+ * The reader thread reads as fast as it can and performs sanity checks on
+ * the data.
+ *
+ * Because the threads are running in such tight loops, they will call
+ * schedule() from time to time so the system stays alive.
+ *
+ * If either the writers or the reader encounter an error, the test is
+ * aborted. Test results are recorded to the ftrace buffers. The test can
+ * be aborted manually by removing the module. (Ideally the test should
+ * never abort on its own.)
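+ *
+ * A typical session might look like this (a sketch; the trace output
+ * path assumes debugfs is mounted at the usual location):
+ *
+ *   insmod test_prb.ko
+ *   sleep 10
+ *   rmmod test_prb
+ *   cat /sys/kernel/debug/tracing/trace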
+ */
+
+struct rbdata {
+ int len;
+ char text[0];
+};
+
+static char *test_running;
+static int halt_test;
+
+static void dump_rb(struct printk_ringbuffer *rb)
+{
+ DECLARE_PRINTKRB_ENTRY(entry, 140);
+ DECLARE_PRINTKRB_ITER(iter, rb, &entry);
+ struct rbdata *dat;
+ u64 last_seq = 0;
+ int len;
+
+ trace_printk("BEGIN full dump\n");
+
+ prb_for_each_entry(&iter, len) {
+ if (entry.seq - last_seq != 1) {
+ trace_printk("LOST %llu\n",
+ entry.seq - (last_seq + 1));
+ }
+ last_seq = entry.seq;
+
+ dat = (struct rbdata *)&entry.buffer[0];
+
+ trace_printk("seq=%llu len=%d textlen=%d dataval=%s\n",
+ entry.seq, len, dat->len, dat->text);
+ }
+
+ trace_printk("END full dump\n");
+}
+
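+/* 2^5 = 32 descriptors backed by a 2^(7+5) = 4096-byte data array */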
+DECLARE_PRINTKRB(test_rb, 7, 5);
+
+static int prbtest_writer(void *data)
+{
+ unsigned long num = (unsigned long)data;
+ struct prb_reserved_entry e;
+ char id = 'A' + num;
+ struct rbdata *dat;
+ int count = 0;
+ int len;
+
+ pr_err("prbtest: start thread %lu (writer)\n", num);
+
+ for (;;) {
+ len = sizeof(struct rbdata) + (prandom_u32() & 0x7f) + 1;
+
+ dat = (struct rbdata *)prb_reserve(&e, &test_rb, len);
+ if (dat) {
+ len -= sizeof(struct rbdata) + 1;
+ memset(&dat->text[0], id, len);
+ dat->text[len] = 0;
+ dat->len = len;
+ prb_commit(&e);
+ } else {
+ WRITE_ONCE(halt_test, 1);
+ trace_printk("writer%lu (%c) failed to reserve\n",
+ num, id);
+ }
+
+ if ((count++ & 0x3fff) == 0)
+ schedule();
+
+ if (READ_ONCE(halt_test) == 1)
+ break;
+ }
+
+ pr_err("prbtest: end thread %lu (writer)\n", num);
+
+ test_running[num] = 0;
+
+ return 0;
+}
+
+static int prbtest_reader(void *data)
+{
+ unsigned long num = (unsigned long)data;
+ DECLARE_PRINTKRB_ENTRY(entry, 140);
+ DECLARE_PRINTKRB_ITER(iter, &test_rb, &entry);
+ unsigned long total_lost = 0;
+ unsigned long max_lost = 0;
+ struct rbdata *dat;
+ int did_sched = 1;
+ u64 last_seq = 0;
+ int count = 0;
+ int len;
+
+ pr_err("prbtest: start thread %lu (reader)\n", num);
+
+ for (;;) {
+ prb_for_each_entry(&iter, len) {
+ if (entry.seq - last_seq != 1 && !did_sched) {
+ total_lost += entry.seq - (last_seq + 1);
+ if (max_lost < entry.seq - (last_seq + 1))
+ max_lost = entry.seq - (last_seq + 1);
+ }
+ last_seq = entry.seq;
+ did_sched = 0;
+
+ dat = (struct rbdata *)&entry.buffer[0];
+
+ len = strlen(dat->text);
+ if (len != dat->len) {
+ WRITE_ONCE(halt_test, 1);
+ trace_printk("reader%lu invalid length\n",
+ num);
+ }
+ while (len) {
+ len--;
+ if (dat->text[len] != dat->text[0]) {
+ WRITE_ONCE(halt_test, 1);
+ trace_printk("reader%lu invalid data\n",
+ num);
+ }
+ }
+
+ if ((count++ & 0x3fff) == 0) {
+ did_sched = 1;
+ schedule();
+ }
+
+ if (READ_ONCE(halt_test) == 1)
+ goto out;
+ }
+ }
+out:
+ pr_err("reader%lu: total_lost=%lu max_lost=%lu seq=%llu\n",
+ num, total_lost, max_lost, entry.seq);
+ pr_err("prbtest: end thread %lu (reader)\n", num);
+
+ test_running[num] = 0;
+
+ return 0;
+}
+
+static int module_test_running;
+
+static int start_test(void *arg)
+{
+ struct task_struct *thread;
+ unsigned long i;
+ int num_cpus;
+
+ num_cpus = num_online_cpus();
+ test_running = kzalloc(num_cpus, GFP_KERNEL);
+ if (!test_running)
+ return -ENOMEM;
+
+ module_test_running = 1;
+
+ pr_err("prbtest: starting test\n");
+
+ for (i = 0; i < num_cpus; i++) {
+ test_running[i] = 1;
+ if (i < num_cpus - 1) {
+ thread = kthread_run(prbtest_writer, (void *)i,
+ "prbtest writer");
+ } else {
+ thread = kthread_run(prbtest_reader, (void *)i,
+ "prbtest reader");
+ }
+ if (IS_ERR(thread)) {
+ pr_err("prbtest: unable to create thread %lu\n", i);
+ test_running[i] = 0;
+ }
+ }
+
+ for (;;) {
+ for (i = 0; i < num_cpus; i++) {
+ if (test_running[i] == 1)
+ break;
+ }
+ if (i == num_cpus)
+ break;
+ msleep(1000);
+ }
+
+ pr_err("prbtest: completed test\n");
+
+ dump_rb(&test_rb);
+
+ module_test_running = 0;
+
+ kfree(test_running);
+
+ return 0;
+}
+
+static int prbtest_init(void)
+{
+ kthread_run(start_test, NULL, "prbtest");
+ return 0;
+}
+
+static void prbtest_exit(void)
+{
+ WRITE_ONCE(halt_test, 1);
+
+ while (module_test_running)
+ msleep(1000);
+}
+
+module_init(prbtest_init);
+module_exit(prbtest_exit);
+
+MODULE_AUTHOR("John Ogness <[email protected]>");
+MODULE_DESCRIPTION("printk ringbuffer test");
+MODULE_LICENSE("GPL v2");
--
2.11.0

2019-06-07 19:20:39

by John Ogness

Subject: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

See documentation for details.

Signed-off-by: John Ogness <[email protected]>
---
Documentation/core-api/index.rst | 1 +
Documentation/core-api/printk-ringbuffer.rst | 104 +++
include/linux/printk_ringbuffer.h | 238 +++++++
lib/printk_ringbuffer.c | 924 +++++++++++++++++++++++++++
4 files changed, 1267 insertions(+)
create mode 100644 Documentation/core-api/printk-ringbuffer.rst
create mode 100644 include/linux/printk_ringbuffer.h
create mode 100644 lib/printk_ringbuffer.c

diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index ee1bb8983a88..0ab649134577 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -27,6 +27,7 @@ Core utilities
errseq
printk-formats
circular-buffers
+ printk-ringbuffer
generic-radix-tree
memory-allocation
mm-api
diff --git a/Documentation/core-api/printk-ringbuffer.rst b/Documentation/core-api/printk-ringbuffer.rst
new file mode 100644
index 000000000000..5634c2010ed8
--- /dev/null
+++ b/Documentation/core-api/printk-ringbuffer.rst
@@ -0,0 +1,104 @@
+=====================
+The printk Ringbuffer
+=====================
+
+:Author: John Ogness <[email protected]>
+
+
+.. Contents:
+
+ (*) Overview
+ - Features
+ - Terminology
+ - Behavior
+ - Data Blocks
+ - Descriptors
+ - Why Descriptors?
+
+ (*) Memory Barriers
+ - Writers
+ - Readers
+
+ (*) Structures, Macros, Functions
+
+ (*) Examples
+ - Writer
+ - Reader
+
+
+Overview
+========
+
+.. kernel-doc:: lib/printk_ringbuffer.c
+ :doc: prb overview
+
+
+Memory Barriers
+===============
+
+.. kernel-doc:: lib/printk_ringbuffer.c
+ :doc: memory barriers
+
+
+Examples
+========
+
+Here are some simple examples demonstrating writers and readers. For the
+examples it is assumed that a global ringbuffer is available::
+
+ DECLARE_PRINTKRB(rb, 7, 5);
+
+This expects an average data size of 128 bytes (2^7) and allows up to
+32 (2^5) descriptors, for a data array of 2^(7+5) = 4096 bytes.
+
+
+Writer
+------
+
+Sample writer code::
+
+ struct prb_reserved_entry e;
+ char *s;
+
+ s = prb_reserve(&e, &rb, 32);
+ if (s) {
+ sprintf(s, "Hello, world!");
+ prb_commit(&e);
+ }
+
+
+Reader
+------
+
+Sample reader code::
+
+ DECLARE_PRINTKRB_ENTRY(entry, 128);
+ DECLARE_PRINTKRB_ITER(iter, &test_rb, &entry);
+ u64 last_seq = 0;
+ int len;
+ char *s;
+
+ prb_for_each_entry(&iter, len) {
+ if (entry.seq - last_seq != 1) {
+ printf("LOST %llu ENTRIES\n",
+ entry.seq - (last_seq + 1));
+ }
+ last_seq = entry.seq;
+
+ s = (char *)&entry.buffer[0];
+ if (len >= 128)
+ s[128 - 1] = 0;
+ printf("data: %s\n", s);
+ }
+
+
+Structures, Macros, Functions
+=============================
+
+Following is a description of all the printk ringbuffer data structures,
+macros, and functions, most of which are private. Public interfaces are
+explicitly marked as such.
+
+.. kernel-doc:: include/linux/printk_ringbuffer.h
+.. kernel-doc:: lib/printk_ringbuffer.c
+ :functions:
diff --git a/include/linux/printk_ringbuffer.h b/include/linux/printk_ringbuffer.h
new file mode 100644
index 000000000000..569980a61c0a
--- /dev/null
+++ b/include/linux/printk_ringbuffer.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PRINTK_RINGBUFFER_H
+#define _LINUX_PRINTK_RINGBUFFER_H
+
+#include <linux/atomic.h>
+
+/**
+ * struct prb_list - An abstract linked list of items.
+ * @oldest: The oldest item on the list.
+ * @newest: The newest item on the list.
+ *
+ * Items are represented as integers (logical array indexes) into an external
+ * array. For the data array they represent the beginning of the data for that
+ * item. For the descriptor array they represent the array element of the
+ * descriptor.
+ *
+ * Traversing the list requires that each item also store the integer
+ * index of the next item in the list. Note that only the descriptor list
+ * is ever traversed.
+ */
+struct prb_list {
+ /* private */
+ unsigned long oldest;
+ unsigned long newest;
+};
+
+/**
+ * struct prb_descr - A descriptor representing an entry in the ringbuffer.
+ * @seq: The sequence number of the entry.
+ * @id: The descriptor id.
+ * The location of the descriptor within the descriptor array can be
+ * determined from this value.
+ * @data: The logical position of the data for this entry.
+ * The location of the beginning of the data within the data array
+ * can be determined from this value.
+ * @data_next: The logical position of the data next to this entry.
+ * This is used to determine the length of the data as well as
+ * identify where the next data begins.
+ * @next: The id of the next (newer) descriptor in the linked list.
+ * A value of EOL means it is the last descriptor in the list.
+ *
+ * Descriptors are used to identify where the data for each entry is and
+ * also provide an ordering for readers. Entry ordering is based on the
+ * descriptor linked list (not the ordering of data in the data array).
+ */
+struct prb_descr {
+ /* private */
+ u64 seq;
+ unsigned long id;
+ unsigned long data;
+ unsigned long data_next;
+ unsigned long next;
+};
+
+/**
+ * struct printk_ringbuffer - The ringbuffer structure.
+ * @data_array_size_bits: The size of the data array as a power-of-2.
+ * @data_array: A pointer to the data array.
+ * @data_list: A list of entry data.
+ * Since the data list is not traversed, this list is only used to
+ * mark the contiguous section of the data array that is in use.
+ * @descr_max_count: The maximum number of descriptors allowed.
+ * @descr_array: A pointer to the descriptor array.
+ * @descr_list: A list of entry descriptors.
+ * The list can be traversed from oldest to newest.
+ * @descr_next: An index of the next available (never before used) descriptor.
+ * This value only increases until the maximum is reached.
+ * @lost: A counter tracking how often writers failed to write.
+ * This is only provided as a convenience. It does not increment
+ * automatically. Writers must increment it after they have determined
+ * that a write failed.
+ */
+struct printk_ringbuffer {
+ /* private */
+ unsigned int data_array_size_bits;
+ char *data_array;
+ struct prb_list data_list;
+
+ unsigned int descr_max_count;
+ struct prb_descr *descr_array;
+ struct prb_list descr_list;
+
+ atomic_t descr_next;
+
+ atomic_long_t lost;
+};
+
+/**
+ * struct prb_reserved_entry - Used by writers to reserve/commit data.
+ * @rb: The printk ringbuffer used for reserve/commit.
+ * @descr: A pointer to the descriptor assigned to the reserved data.
+ * @id: The descriptor's id value, set on reserve.
+ * @data: The descriptor's data value, set on reserve.
+ * @data_next: The descriptor's future data_next value, set on commit.
+ * @irqflags: Local IRQs are disabled during the reserve/commit window.
+ *
+ * A writer provides this structure when reserving and committing data. The
+ * values of all the members are set on reserve and are only valid until
+ * commit.
+ */
+struct prb_reserved_entry {
+ /* private */
+ struct printk_ringbuffer *rb;
+ struct prb_descr *descr;
+ unsigned long id;
+ unsigned long data;
+ unsigned long data_next;
+ unsigned long irqflags;
+};
+
+/**
+ * struct prb_entry - Used by readers to read a ringbuffer entry.
+ * @seq: The sequence number of the entry descriptor.
+ * @buffer: A pointer to a reader-provided buffer.
+ * When reading an entry, the data is copied to this buffer.
+ * @buffer_size: The size of the reader-provided buffer.
+ *
+ * A reader initializes and provides this structure when traversing/reading
+ * the entries of the ringbuffer.
+ */
+struct prb_entry {
+ /* public */
+ u64 seq;
+ char *buffer;
+ int buffer_size;
+};
+
+/**
+ * struct prb_iterator - Used by readers to traverse a descriptor list.
+ * @rb: The printk ringbuffer being traversed.
+ * @e: A pointer to a reader-provided entry structure.
+ * @id: The id of the descriptor last accessed.
+ * @id_next: The id of the next (newer) descriptor to access.
+ */
+struct prb_iterator {
+ /* private */
+ struct printk_ringbuffer *rb;
+ struct prb_entry *e;
+ unsigned long id;
+ unsigned long id_next;
+};
+
+/**
+ * DECLARE_PRINTKRB() - Declare a printk ringbuffer.
+ * @name: The name for the ringbuffer structure variable.
+ * @avgdatabits: The average size of data as a power-of-2.
+ * If this value is too small, it will not be possible to store
+ * as many entries as desired. If this value is too large, there
+ * will be some wasted space in the data array because there are
+ * not enough descriptors. Generally values that are too large
+ * are preferred over those that are too small.
+ * @descrbits: The number of descriptors (desired entries) as a power-of-2.
+ *
+ * The size of the data array will be the average data size multiplied by the
+ * number of descriptors.
+ */
+#define DECLARE_PRINTKRB(name, avgdatabits, descrbits) \
+char _##name##_data_array[(1 << ((avgdatabits) + (descrbits))) + \
+ sizeof(long)] \
+ __aligned(__alignof__(long)); \
+struct prb_descr _##name##_descr_array[1 << (descrbits)]; \
+struct printk_ringbuffer name = { \
+ .data_array_size_bits = (avgdatabits) + (descrbits), \
+ .data_array = &_##name##_data_array[0], \
+ .data_list.oldest = -111 * sizeof(long), \
+ .data_list.newest = -111 * sizeof(long), \
+ .descr_max_count = 1 << (descrbits), \
+ .descr_array = &_##name##_descr_array[0], \
+ .descr_next = ATOMIC_INIT(0), \
+ .descr_list.oldest = 0, \
+ .descr_list.newest = 0, \
+ .lost = ATOMIC_LONG_INIT(0), \
+}
+
+/**
+ * DECLARE_PRINTKRB_ENTRY() - Declare an entry structure.
+ * @name: The name for the entry structure variable.
+ * @size: The size of the associated reader buffer (also declared).
+ *
+ * This macro is particularly useful for static entry structures that should be
+ * immediately available and initialized. It is an alternative to the reader
+ * manually setting the buffer and buffer_size members of the structure.
+ *
+ * Note that this macro will declare the buffer as well. This could be a
+ * problem if this is used with a large buffer size within a stack frame.
+ */
+#define DECLARE_PRINTKRB_ENTRY(name, size) \
+char _##name##_entry_buf[size]; \
+struct prb_entry name = { \
+ .buffer_size = size, \
+ .buffer = &_##name##_entry_buf[0], \
+}
+
+/**
+ * DECLARE_PRINTKRB_ITER() - Declare an iterator for readers.
+ * @name: The name for the iterator structure variable.
+ * @rbaddr: A pointer to a printk ringbuffer.
+ * @entryaddr: A pointer to an entry structure.
+ *
+ * This macro is particularly useful for static iterators that should be
+ * immediately available and initialized. It is an alternative to
+ * manually initializing an iterator with prb_iter_init().
+ */
+#define DECLARE_PRINTKRB_ITER(name, rbaddr, entryaddr) \
+struct prb_iterator name = { \
+ .rb = rbaddr, \
+ .e = entryaddr, \
+ .id = 0, \
+ .id_next = 0, \
+}
+
+/* writer interface */
+char *prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ unsigned int size);
+void prb_commit(struct prb_reserved_entry *e);
+
+/* reader interface */
+void prb_iter_init(struct prb_iterator *iter, struct printk_ringbuffer *rb,
+ struct prb_entry *e);
+int prb_iter_next_valid_entry(struct prb_iterator *iter);
+bool prb_iter_peek_next_entry(struct prb_iterator *iter);
+
+/**
+ * prb_for_each_entry() - Iterate through all the entries of a ringbuffer.
+ * @i: A pointer to an iterator.
+ * @l: An integer used to identify when the last entry is traversed.
+ *
+ * This macro expects the iterator to be initialized. It also does not reset
+ * the iterator. So if the iterator has already been used for some traversal,
+ * this macro will continue where the iterator left off.
+ */
+#define prb_for_each_entry(i, l) \
+ for (; (l = prb_iter_next_valid_entry(i)) != 0;)
+
+/* utility functions */
+void prb_inc_lost(struct printk_ringbuffer *rb);
+
+#endif /* _LINUX_PRINTK_RINGBUFFER_H */
diff --git a/lib/printk_ringbuffer.c b/lib/printk_ringbuffer.c
new file mode 100644
index 000000000000..d0b2b6a549b0
--- /dev/null
+++ b/lib/printk_ringbuffer.c
@@ -0,0 +1,924 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/printk_ringbuffer.h>
+
+/**
+ * DOC: prb overview
+ *
+ * As the name suggests, this ringbuffer was implemented specifically to
+ * serve the needs of the printk() infrastructure. The ringbuffer itself is
+ * not specific to printk and could be used for other purposes. However, the
+ * requirements and semantics of printk are rather unique. If you intend to use
+ * this ringbuffer for anything other than printk, you need to be very clear on
+ * its features, behavior, and pitfalls.
+ *
+ * Features
+ * --------
+ * * single global buffer
+ * * resides in initialized data section (available at early boot)
+ * * supports multiple concurrent lockless writers
+ * * supports multiple concurrent lockless readers
+ * * safe from any context (including NMI)
+ * * groups bytes into variable length data blocks (referenced by entries)
+ * * entries tagged with sequence numbers
+ *
+ * Terminology
+ * -----------
+ * data block - A contiguous block of data containing an id to an associated
+ * descriptor and the raw data from the writer.
+ * descriptor - Meta data for a data block containing an id, the logical
+ * positions of the associated data block, a unique sequence
+ * number, and a pointer to the next (newer) descriptor.
+ * entry - A high level object used by the readers/writers that contains a
+ * descriptor as well as state information during the reserve/commit
+ * window.
+ *
+ * Behavior
+ * --------
+ * Since the printk ringbuffer is lockless, there exists no synchronization
+ * between readers and writers. Basically writers are the tasks in control and
+ * may overwrite any and all committed data at any time and from any context.
+ * For this reason readers can miss entries if they are overwritten before the
+ * reader was able to access the data. The reader API implementation is such
+ * that reader access to data is atomic, so there is no risk of readers having
+ * to deal with partial or corrupt data blocks. Also, entries include the
+ * sequence number of the associated descriptor so that readers can recognize
+ * if entries were missed.
+ *
+ * Writing to the ringbuffer consists of 2 steps. First a writer must reserve
+ * a data block of desired size. After this step the writer has exclusive
+ * access to the memory region. Once the writer's data has been written to
+ * memory, the entry needs to be committed to the ringbuffer. After this step
+ * the data has been inserted into the ringbuffer and assigned an appropriate
+ * sequence number.
+ *
+ * Once committed, a writer must no longer access the data directly. This is
+ * because the data may have been overwritten and no longer exists. If a
+ * writer must access the data, it should either keep a private copy before
+ * committing or use the reader API to gain access to the data.
+ *
+ * Because of how the data backend is implemented, data blocks that have been
+ * reserved but not yet committed act as barriers, preventing future writers
+ * from filling the ringbuffer beyond the location of the reserved but not
+ * yet committed data block region. For this reason it is important that
+ * writers perform both reserve and commit as quickly as possible. Also, be
+ * aware that local interrupts are disabled during the reserve/commit window.
+ * Writers in NMI contexts can still preempt any other writers, but as long
+ * as these writers do not write a large amount of data with respect to the
+ * ringbuffer size, this should not become an issue.
+ *
+ * Data Blocks
+ * -----------
+ * All ring buffer data is stored within a single static byte array. The reason
+ * for this is to ensure that any pointers to the data (past and present) will
+ * always point to valid memory. This is important because the lockless readers
+ * and writers may preempt for long periods of time and when they resume may be
+ * working with expired pointers.
+ *
+ * Data blocks are specified by start and end indices. (The end index has the
+ * same value as the start index of the neighboring data block.) But indices
+ * are not simply offsets into the byte array. They are logical position
+ * values (lpos) that always increase but map directly to byte array offsets.
+ *
+ * For example, for a byte array of 1000, a data block may have a start
+ * lpos of 100. Another data block may have a start lpos of 1100. And yet
+ * another 2100. All of these data blocks are pointing to the same data, but
+ * only the most recent data block is valid. The other data blocks are pointing
+ * to valid memory, but represent data blocks that have been overwritten.
+ *
+ * Also note that due to overflowing, the most recent data block is not
+ * necessarily the one with the highest lpos. Indeed, the printk ring buffer
+ * initializes its data such that an overflow happens relatively quickly in
+ * order to validate the handling of this situation.
+ *
+ * If a data block starts near the end of the byte array but would extend
+ * beyond it, that data block is handled differently: a special "wrapping data
+ * block" is inserted into the byte array and the real data block is placed at
+ * the beginning of the byte array. This can waste space at the end of the byte
+ * array, but simplifies the implementation by allowing writers to always work
+ * with contiguous buffers. For example, for a 1000 byte array, a descriptor
+ * may show a start lpos of 1950 and an end lpos of 2100. The data block
+ * associated with this descriptor is 100 bytes in size and its data is found
+ * at offset 0 of the byte array.
+ *
+ * Descriptors
+ * -----------
+ * A descriptor is a handle to a data block. Like data blocks, all descriptors
+ * are also stored in their own single static array. The reasoning is the same
+ * as for the data blocks: pointers to descriptors should point to valid
+ * memory at all times, even if the descriptor itself has become invalid.
+ *
+ * Descriptors contain the start (data) and end (data_next) lpos of the data
+ * block they represent. They also have their own id that works like the lpos
+ * for data blocks: values that always increase but map directly to the
+ * descriptor array offset.
+ *
+ * For example, for a descriptor array of 10, a descriptor may have an id of
+ * 1, another of 11, and another of 21. All of these descriptor ids are
+ * pointing to the same descriptor, but only the most recent descriptor id
+ * is valid. The other values represent descriptors that have become invalid.
+ *
+ * Why Descriptors?
+ * ----------------
+ * At first glance it may seem as though descriptors are an unnecessary
+ * abstraction layer added over the data array. After all, couldn't we just put
+ * the fields of a descriptor into a structure at the head of the data block?
+ * The answer is no. The reason is that the printk ring buffer supports
+ * variable length records, which means that data blocks will not always begin
+ * at a predictable offset of the byte array. This is a major problem for
+ * lockless writers that, for example, will need to expire old data blocks
+ * when the ringbuffer is full. A writer has no way of knowing if it is
+ * allowed to push the oldest pointer. Flags could not be used because upon
+ * pushing the newest pointer (reserve), initially random data will be set
+ * that could falsely indicate the flag status. Introducing a third
+ * "committed" pointer also will not help because now the problem is
+ * advancing the committed pointer. (Reserve ordering does not match commit
+ * ordering.) Even using cmpxchg() will not help because random data could
+ * potentially match the metadata to replace.
+ *
+ * Descriptors allow safe and controlled access to data block metadata by
+ * providing predictable offsets for such metadata. This is key to supporting
+ * multiple concurrent lockless writers.
+ */
+
+/**
+ * DOC: memory barriers
+ *
+ * Writers
+ * -------
+ * The main issue with writers is expiring/invalidating old data blocks in
+ * order to create new data blocks. This is performed in 6 steps that must
+ * be observed in order by all writers to allow cooperation. Here is a list
+ * of the 6 steps and the named acquire/release memory barrier pairs that
+ * are used to synchronize them:
+ *
+ * * old data invalidation (MB1): Pushing rb.data_list.oldest forward.
+ * Necessary for identifying if data has been expired.
+ *
+ * * new data reservation (MB2): Pushing rb.data_list.newest forward.
+ * Necessary for validating data.
+ *
+ * * assign the data block to a descriptor (MB3): Setting data block id to
+ * descriptor id. Necessary for finding the descriptor associated with the
+ * data block.
+ *
+ * * commit data (MB4): Setting data block data_next. (Now data block is
+ * valid). Necessary for validating data.
+ *
+ * * make descriptor newest (MB5): Setting rb.descr_list.newest to descriptor.
+ * (Now following new descriptors will be linked to this one.) Necessary for
+ * ensuring the descriptor's next is set to EOL before adding to the list.
+ *
+ * * link descriptor to previous newest (MB6): Setting the next of the
+ * previous descriptor to this one. Necessary for correctly identifying if
+ * a descriptor is the only descriptor on the list.
+ *
+ * Readers
+ * -------
+ * Readers only make use of smp_rmb() to ensure that certain critical load
+ * operations are performed in an order that allows readers to evaluate if
+ * the data they read is really valid.
+ */
+
+/* end of list marker */
+#define EOL 0
+
+/**
+ * struct prb_datablock - A data block.
+ * @id: The descriptor id that is associated with this data block.
+ * @data: The data committed by the writer.
+ */
+struct prb_datablock {
+ unsigned long id;
+ char data[0];
+};
+
+#define DATAARRAY_SIZE(rb) (1 << rb->data_array_size_bits)
+#define DATAARRAY_SIZE_BITMASK(rb) (DATAARRAY_SIZE(rb) - 1)
+
+/**
+ * DATA_INDEX() - Determine the data array index from logical position.
+ * @rb: The associated ringbuffer.
+ * @lpos: The logical position (data/data_next).
+ */
+#define DATA_INDEX(rb, lpos) (lpos & DATAARRAY_SIZE_BITMASK(rb))
+
+/**
+ * DATA_WRAPS() - Determine how many times the data array has wrapped.
+ * @rb: The associated ringbuffer.
+ * @lpos: The logical position (data/data_next).
+ *
+ * The number of wraps is useful when determining if one logical position
+ * is overtaking the data array index of another logical position.
+ */
+#define DATA_WRAPS(rb, lpos) (lpos >> rb->data_array_size_bits)
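+/*
+ * Worked example (assuming data_array_size_bits == 12, a 4096-byte data
+ * array): DATA_INDEX(rb, 5000) == 904 and DATA_WRAPS(rb, 5000) == 1,
+ * i.e. lpos 5000 lies 904 bytes into the second pass over the array.
+ */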
+
+/**
+ * DATA_THIS_WRAP_START_LPOS() - Get the position at the start of the wrap.
+ * @rb: The associated ringbuffer.
+ * @lpos: The logical position (data/data_next).
+ *
+ * Given a logical position, return the logical position as if backed up to
+ * the beginning (data array index 0) of the current wrap. This is used when a
+ * data block wraps and therefore needs to begin at the beginning of the data
+ * array (for the next wrap).
+ */
+#define DATA_THIS_WRAP_START_LPOS(rb, lpos) \
+ (DATA_WRAPS(rb, lpos) << rb->data_array_size_bits)
+
+#define DATA_ALIGN sizeof(long)
+#define DATA_ALIGN_SIZE(sz) \
+ ((sz + (DATA_ALIGN - 1)) & ~(DATA_ALIGN - 1))
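+/*
+ * For example, with sizeof(long) == 8: DATA_ALIGN_SIZE(13) == 16 and
+ * DATA_ALIGN_SIZE(16) == 16 (already-aligned sizes are unchanged).
+ */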
+
+#define DESCR_COUNT_BITMASK(rb) (rb->descr_max_count - 1)
+
+/**
+ * DESCR_INDEX() - Determine the descriptor array index from the id.
+ * @rb: The associated ringbuffer.
+ * @id: The descriptor id.
+ */
+#define DESCR_INDEX(rb, id) (id & DESCR_COUNT_BITMASK(rb))
+
+#define TO_DATABLOCK(rb, lpos) \
+ ((struct prb_datablock *)&rb->data_array[DATA_INDEX(rb, lpos)])
+#define TO_DESCR(rb, id) \
+ (&rb->descr_array[DESCR_INDEX(rb, id)])
+
+/**
+ * data_valid() - Check if a data block is valid.
+ * @rb: The ringbuffer containing the data.
+ * @oldest_data: The oldest data logical position.
+ * @newest_data: The newest data logical position.
+ * @data: The logical position for the data block to check.
+ * @data_next: The logical position for the data block next to this one.
+ * This value is used to identify the end of the data block.
+ *
+ * A data block is considered valid if it satisfies the two conditions:
+ *
+ * * oldest_data <= data < data_next <= newest_data
+ * * oldest_data is at most exactly 1 wrap behind newest_data
+ *
+ * Return: true if the specified data block is valid.
+ */
+static inline bool data_valid(struct printk_ringbuffer *rb,
+ unsigned long oldest_data,
+ unsigned long newest_data,
+ unsigned long data, unsigned long data_next)
+{
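+ /*
+ * All checks rely on unsigned wrap-around arithmetic: a difference
+ * is "small" (less than the data array size) only if the positions
+ * are within one array size of each other in the expected order.
+ * Together the checks encode:
+ * oldest_data <= data < data_next <= newest_data.
+ */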
+ return ((data - oldest_data) < DATAARRAY_SIZE(rb) &&
+ data_next != data &&
+ (data_next - data) < DATAARRAY_SIZE(rb) &&
+ (newest_data - data_next) < DATAARRAY_SIZE(rb) &&
+ (newest_data - oldest_data) <= DATAARRAY_SIZE(rb));
+}
+
+/**
+ * add_descr_list() - Add a descriptor to the descriptor list.
+ * @e: An entry that has already reserved data.
+ *
+ * The provided entry contains a pointer to a descriptor that has already
+ * been reserved for this entry. However, the reserved descriptor is not
+ * yet on the list. Add this descriptor as the newest item.
+ *
+ * A descriptor is added in two steps. The first step is to make this
+ * descriptor the newest. The second step is to update the "next" field of
+ * the former newest item to point to this item.
+ */
+static void add_descr_list(struct prb_reserved_entry *e)
+{
+ struct printk_ringbuffer *rb = e->rb;
+ struct prb_list *l = &rb->descr_list;
+ struct prb_descr *d = e->descr;
+ struct prb_descr *newest_d;
+ unsigned long newest_id;
+
+ /* set as newest */
+ do {
+ /* MB5: synchronize add descr */
+ newest_id = smp_load_acquire(&l->newest);
+ newest_d = TO_DESCR(rb, newest_id);
+
+ if (newest_id == EOL)
+ WRITE_ONCE(d->seq, 1);
+ else
+ WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
+ /*
+ * MB5: synchronize add descr
+ *
+ * In particular: next written before cmpxchg
+ */
+ } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
+
+ if (unlikely(newest_id == EOL)) {
+ /* no previous newest means we *are* the list, set oldest */
+
+ /*
+ * MB UNPAIRED
+ *
+ * In particular: Force cmpxchg _after_ cmpxchg on newest.
+ */
+ WARN_ON_ONCE(cmpxchg_release(&l->oldest, EOL, e->id) != EOL);
+ } else {
+ /* link to previous chain */
+
+ /*
+ * MB6: synchronize link descr
+ *
+ * In particular: Force cmpxchg _after_ cmpxchg on newest.
+ */
+ WARN_ON_ONCE(cmpxchg_release(&newest_d->next,
+ EOL, e->id) != EOL);
+ }
+}
+
+/**
+ * remove_oldest_descr() - Remove the oldest descriptor from the list.
+ * @rb: The ringbuffer from which to remove the oldest descriptor.
+ *
+ * The oldest descriptor can be removed from the descriptor list only if
+ * two conditions are satisfied:
+ *
+ * * The data block for the descriptor is invalid.
+ * * The descriptor is not the only descriptor on the list.
+ *
+ * If, during this function, another task removes the oldest, this function
+ * will try again.
+ *
+ * Return: The removed descriptor or NULL if the oldest descriptor cannot
+ * be removed.
+ */
+static struct prb_descr *remove_oldest_descr(struct printk_ringbuffer *rb)
+{
+ struct prb_list *l = &rb->descr_list;
+ unsigned long oldest_id;
+ struct prb_descr *d;
+ unsigned long next;
+
+ for (;;) {
+ oldest_id = READ_ONCE(l->oldest);
+
+ /* list empty */
+ if (oldest_id == EOL)
+ return NULL;
+
+ d = TO_DESCR(rb, oldest_id);
+
+ /* only descriptors with _invalid_ data can be removed */
+ if (data_valid(rb, READ_ONCE(rb->data_list.oldest),
+ READ_ONCE(rb->data_list.newest),
+ READ_ONCE(d->data),
+ READ_ONCE(d->data_next))) {
+ return NULL;
+ }
+
+ /*
+ * MB6: synchronize link descr
+ *
+ * In particular: l->oldest is loaded as a data dependency so
+ * d->next and the following l->oldest will load afterwards,
+ * respectively.
+ */
+ next = smp_load_acquire(&d->next);
+
+ if (next == EOL && READ_ONCE(l->oldest) == oldest_id) {
+ /*
+ * The oldest has no next, so this is a list of one
+ * descriptor. Lists must always have at least one
+ * descriptor.
+ */
+ return NULL;
+ }
+
+ if (cmpxchg(&l->oldest, oldest_id, next) == oldest_id) {
+ /* removed successfully */
+ break;
+ }
+
+ /* oldest descriptor removed by another task, try again */
+ }
+
+ return d;
+}
+
+/**
+ * expire_oldest_data() - Invalidate the oldest data block.
+ * @rb: The ringbuffer containing the data block.
+ * @oldest_lpos: The logical position of the oldest data block.
+ *
+ * This function expects to "push" the pointer to the oldest data block
+ * forward, thus invalidating the oldest data block. However, before pushing,
+ * the data block is checked for validity. (For example, if the data block
+ * was reserved but not yet committed, it is not permitted to invalidate the
+ * "in use by a writer" data.)
+ *
+ * If the data is valid, it will be associated with a descriptor, which will
+ * then provide the necessary information to validate the data.
+ *
+ * Return: true if the oldest data was invalidated (regardless if this
+ * task was the one that did it or not), otherwise false.
+ */
+static bool expire_oldest_data(struct printk_ringbuffer *rb,
+ unsigned long oldest_lpos)
+{
+ unsigned long newest_lpos;
+ struct prb_datablock *b;
+ unsigned long data_next;
+ struct prb_descr *d;
+ unsigned long data;
+
+ /* MB2: synchronize data reservation */
+ newest_lpos = smp_load_acquire(&rb->data_list.newest);
+
+ b = TO_DATABLOCK(rb, oldest_lpos);
+
+ /* MB3: synchronize descr setup */
+ d = TO_DESCR(rb, smp_load_acquire(&b->id));
+
+ data = READ_ONCE(d->data);
+
+ /* sanity check to see if b->id was correct */
+ if (oldest_lpos != data)
+ goto out;
+
+ /* MB4: synchronize commit */
+ data_next = smp_load_acquire(&d->data_next);
+
+ if (!data_valid(rb, oldest_lpos, newest_lpos, data, data_next))
+ goto out;
+
+ /* MB1: synchronize data invalidation */
+ cmpxchg_release(&rb->data_list.oldest, data, data_next);
+
+ /* Some task (maybe this one) successfully expired the oldest data. */
+ return true;
+out:
+ return (oldest_lpos != READ_ONCE(rb->data_list.oldest));
+}
+
+/**
+ * get_new_lpos() - Determine the logical positions of a new data block.
+ * @rb: The ringbuffer to contain the data.
+ * @size: The size of the new data block.
+ * @data: A pointer to the start logical position value to be set.
+ * This will be the beginning of the data block.
+ * @data_next: A pointer to the end logical position value to be set.
+ * This value is used to identify the end of the data block.
+ *
+ * Based on the logical position where the newest data block will be created,
+ * determine what the data and data_next values will be. If the data block
+ * would overwrite the oldest data block, this function will invalidate the
+ * oldest data block, thus providing itself space for the new data block.
+ *
+ * Return: true if logical positions were determined, otherwise false.
+ *
+ * This will only fail if it was not possible to invalidate the oldest data
+ * block. This can happen if a writer has reserved but not yet committed data
+ * and that reserved data is currently the oldest data.
+ */
+static bool get_new_lpos(struct printk_ringbuffer *rb, unsigned int size,
+ unsigned long *data, unsigned long *data_next)
+{
+ unsigned long oldest_lpos;
+ unsigned long data_begin;
+
+ for (;;) {
+ *data = READ_ONCE(rb->data_list.newest);
+ data_begin = *data;
+
+ for (;;) {
+ *data_next = data_begin + size;
+
+ /* MB1: synchronize data invalidation */
+ oldest_lpos = smp_load_acquire(&rb->data_list.oldest);
+
+ if (*data_next - oldest_lpos > DATAARRAY_SIZE(rb)) {
+ /* would overwrite oldest */
+ if (!expire_oldest_data(rb, oldest_lpos))
+ return false;
+ break;
+ }
+
+ if (DATA_WRAPS(rb, data_begin) ==
+ DATA_WRAPS(rb, *data_next)) {
+ return true;
+ }
+
+ data_begin = DATA_THIS_WRAP_START_LPOS(rb, *data_next);
+ }
+ }
+}
+
+/**
+ * assign_descr() - Assign a descriptor to an entry.
+ * @e: The entry to assign a descriptor to.
+ *
+ * Find an available descriptor to assign to the entry. First it is checked
+ * if the oldest descriptor can be used. If not, perhaps a never-used
+ * descriptor is available.
+ *
+ * If no descriptors are found, data blocks will be invalidated until the
+ * oldest descriptor can be used.
+ *
+ * Return: true if a descriptor was assigned, otherwise false.
+ *
+ * This will only fail if it was not possible to invalidate the oldest data
+ * block. This can happen if a writer has reserved but not yet committed data
+ * and that reserved data is currently the oldest data.
+ */
+static bool assign_descr(struct prb_reserved_entry *e)
+{
+ struct printk_ringbuffer *rb = e->rb;
+ struct prb_descr *d;
+ unsigned long id;
+
+ for (;;) {
+ /* use invalid descriptor at oldest */
+ d = remove_oldest_descr(rb);
+ if (d) {
+ id = READ_ONCE(d->id) + rb->descr_max_count;
+ /*
+ * EOL has special meaning (to represent a terminator
+ * for the list) so no descriptor is allowed to use
+ * it as its id.
+ */
+ if (id == EOL)
+ id += rb->descr_max_count;
+
+ /*
+ * Any readers sitting at this descriptor can still
+ * traverse forward until the new id is assigned.
+ */
+ break;
+ }
+
+ /* fallback to static never-used descriptors */
+ if (atomic_read(&rb->descr_next) < rb->descr_max_count) {
+ id = atomic_fetch_inc(&rb->descr_next);
+ if (id < rb->descr_max_count) {
+ d = &rb->descr_array[id];
+ break;
+ }
+ }
+
+ /* no descriptors, free one */
+ /* MB1: synchronize data invalidation */
+ if (!expire_oldest_data(rb,
+ smp_load_acquire(&rb->data_list.oldest))) {
+ return false;
+ }
+ }
+
+ e->id = id;
+ e->descr = d;
+ return true;
+}
+
+/**
+ * data_reserve() - Reserve data in the data array.
+ * @e: The entry to reserve data for.
+ * @size: The size to reserve.
+ *
+ * This function expects to "push" the pointer to the newest data block
+ * forward. If this would result in overtaking the data array index of the
+ * oldest data, that oldest data will be invalidated.
+ *
+ * Return: true if data was reserved, otherwise false.
+ *
+ * This will only fail if it was not possible to invalidate the oldest data
+ * block. This can happen if a writer has reserved but not yet committed data
+ * and that reserved data is currently the oldest data.
+ */
+static bool data_reserve(struct prb_reserved_entry *e, unsigned int size)
+{
+ struct printk_ringbuffer *rb = e->rb;
+
+ do {
+ if (!get_new_lpos(rb, size, &e->data, &e->data_next))
+ return false;
+ /* MB2: synchronize data reservation */
+ } while (cmpxchg_release(&rb->data_list.newest,
+ e->data, e->data_next) != e->data);
+
+ return true;
+}
+
+/**
+ * prb_reserve() - Reserve data in the ringbuffer.
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to reserve data in.
+ * @size: The size of the data to reserve.
+ *
+ * This is the public function available to writers to reserve data.
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: A pointer to the reserved data or NULL if data could not be
+ * reserved.
+ *
+ * Assuming the provided size is legal, this will only fail if it was not
+ * possible to invalidate the oldest data block. This can happen if a writer
+ * has reserved but not yet committed data and that reserved data is
+ * currently the oldest data.
+ */
+char *prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ unsigned int size)
+{
+ struct prb_datablock *b;
+ struct prb_descr *d;
+ char *buf;
+
+ if (size == 0)
+ return NULL;
+
+ size += sizeof(struct prb_datablock);
+ size = DATA_ALIGN_SIZE(size);
+ if (size > DATAARRAY_SIZE(rb))
+ return NULL;
+
+ e->rb = rb;
+
+ local_irq_save(e->irqflags);
+
+ if (!assign_descr(e))
+ goto err_out;
+
+ d = e->descr;
+ WRITE_ONCE(d->id, e->id);
+
+ if (!data_reserve(e, size)) {
+ /* put invalid descriptor on list, can still be traversed */
+ WRITE_ONCE(d->next, EOL);
+ add_descr_list(e);
+ goto err_out;
+ }
+
+ WRITE_ONCE(d->data, e->data);
+ WRITE_ONCE(d->data_next, e->data);
+
+ if (DATA_WRAPS(rb, e->data) != DATA_WRAPS(rb, e->data_next)) {
+ b = TO_DATABLOCK(rb, 0);
+ WRITE_ONCE(b->id, e->id);
+ } else {
+ b = TO_DATABLOCK(rb, e->data);
+ }
+ buf = &b->data[0];
+
+ b = TO_DATABLOCK(rb, e->data);
+
+ /* MB3: synchronize descr setup */
+ smp_store_release(&b->id, e->id);
+
+ return buf;
+err_out:
+ local_irq_restore(e->irqflags);
+ return NULL;
+}
+EXPORT_SYMBOL(prb_reserve);
+
+/**
+ * prb_commit() - Commit (previously reserved) data to the ringbuffer.
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit data.
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_commit(struct prb_reserved_entry *e)
+{
+ struct prb_descr *d = e->descr;
+
+ WRITE_ONCE(d->next, EOL);
+
+ /* MB4: synchronize commit */
+ smp_store_release(&d->data_next, e->data_next);
+
+ /* from this point on, the data could be expired */
+
+ add_descr_list(e);
+
+ /* now the descriptor is visible to the readers */
+
+ local_irq_restore(e->irqflags);
+}
+EXPORT_SYMBOL(prb_commit);
+
+/**
+ * get_datablock() - Return the data block for the provided logical positions.
+ * @rb: The ringbuffer containing the data block.
+ * @data: The logical position for the beginning of the data block.
+ * @data_next: The logical position for the data block next to this one.
+ * This value is used to identify the end of the data block.
+ * @size: A pointer to an integer to set to the size of the data block.
+ *
+ * Since data blocks always contain contiguous data, a situation can occur
+ * where there is not enough room at the end of the array for a new data
+ * block. In this situation, two data blocks are created:
+ *
+ * * A "wrapping" data block at the end of the data array.
+ * * The real data block at the beginning of the data array.
+ *
+ * The descriptor contains the beginning position of the wrapping data block
+ * and the end position of the real data block. This function is used
+ * determines if a wrapping data block is being used and always returns the
+ * real data block and size. (Note that the descriptor id from the wrapping
+ * data block is used.)
+ *
+ * Return: A pointer to the data block structure. Also, size is set to the
+ * size of the data block.
+ */
+static struct prb_datablock *get_datablock(struct printk_ringbuffer *rb,
+ unsigned long data,
+ unsigned long data_next, int *size)
+{
+ if (DATA_WRAPS(rb, data) == DATA_WRAPS(rb, data_next)) {
+ *size = data_next - data;
+ } else {
+ *size = DATA_INDEX(rb, data_next);
+ data = 0;
+ }
+ *size -= sizeof(struct prb_datablock);
+
+ return TO_DATABLOCK(rb, data);
+}
+
+/**
+ * prb_iter_init() - Initialize an iterator structure.
+ * @iter: The iterator to initialize.
+ * @rb: The ringbuffer to associate with the iterator.
+ * @e: An entry structure to use during iteration.
+ *
+ * This is the public function available to readers to initialize their
+ * iterator structure.
+ *
+ * As an alternative, DECLARE_PRINTKRB_ITER() could be used.
+ *
+ * Context: Any context.
+ */
+void prb_iter_init(struct prb_iterator *iter, struct printk_ringbuffer *rb,
+ struct prb_entry *e)
+{
+ iter->rb = rb;
+ iter->e = e;
+ iter->id = EOL;
+ iter->id_next = EOL;
+}
+EXPORT_SYMBOL(prb_iter_init);
+
+/**
+ * iter_peek_next_id() - Determine the next (newer) descriptor id.
+ * @iter: The iterator used for list traversal.
+ *
+ * If the iterator has not yet been used or has fallen behind and no longer
+ * has a pointer to a valid descriptor, the next descriptor will be the oldest
+ * descriptor in the list.
+ *
+ * Return: The next descriptor id. A value of EOL means there is no next
+ * descriptor.
+ */
+static unsigned long iter_peek_next_id(struct prb_iterator *iter)
+{
+ struct printk_ringbuffer *rb = iter->rb;
+ unsigned long next_id = iter->id_next;
+ struct prb_descr *d;
+
+ if (iter->id == EOL) {
+ next_id = READ_ONCE(rb->descr_list.oldest);
+ } else if (iter->id_next == EOL) {
+ d = TO_DESCR(rb, iter->id);
+ next_id = READ_ONCE(d->next);
+
+ if (READ_ONCE(d->id) != iter->id)
+ next_id = READ_ONCE(rb->descr_list.oldest);
+ }
+
+ return next_id;
+}
+
+/**
+ * prb_iter_peek_next_entry() - Check if there is a next (newer) entry.
+ * @iter: The iterator used for list traversal.
+ *
+ * This is the public function available to readers to check if a newer
+ * entry is available.
+ *
+ * Context: Any context.
+ * Return: true if there is a next entry, otherwise false.
+ */
+bool prb_iter_peek_next_entry(struct prb_iterator *iter)
+{
+ return (iter_peek_next_id(iter) != EOL);
+}
+EXPORT_SYMBOL(prb_iter_peek_next_entry);
+
+/**
+ * prb_iter_next_valid_entry() - Traverse to and read the next (newer) entry.
+ * @iter: The iterator used for list traversal.
+ *
+ * This is the public function available to readers to traverse the entry
+ * list.
+ *
+ * If the iterator has not yet been used or has fallen behind and no longer
+ * has a pointer to a valid descriptor, the next descriptor will be the oldest
+ * descriptor in the list.
+ *
+ * Context: Any context.
+ * Return: The size of the entry data or 0 if there is no next entry.
+ *
+ * The entry data is padded (if necessary) to allow alignment for following
+ * data blocks. Therefore the size value can be larger than the size reserved.
+ * If users want the exact size to be tracked, they should include this
+ * information within their data.
+ */
+int prb_iter_next_valid_entry(struct prb_iterator *iter)
+{
+ struct printk_ringbuffer *rb = iter->rb;
+ struct prb_entry *e = iter->e;
+ struct prb_datablock *b;
+ unsigned long data_next;
+ unsigned long next_id;
+ struct prb_descr *d;
+ unsigned long data;
+ int size;
+
+ iter->id_next = iter_peek_next_id(iter);
+
+ while (iter->id_next != EOL) {
+ d = TO_DESCR(rb, iter->id_next);
+ data = READ_ONCE(d->data);
+ data_next = READ_ONCE(d->data_next);
+
+ /*
+ * Loaded a local copy of the data pointers before
+ * checking for validity of the data.
+ */
+ smp_rmb();
+
+ if (READ_ONCE(d->id) == iter->id_next &&
+ data_valid(rb, READ_ONCE(rb->data_list.oldest),
+ READ_ONCE(rb->data_list.newest),
+ data, data_next)) {
+
+ b = get_datablock(rb, data, data_next, &size);
+
+ memcpy(&e->buffer[0], &b->data[0],
+ size > e->buffer_size ? e->buffer_size : size);
+ e->seq = READ_ONCE(d->seq);
+
+ /*
+ * Loaded a local copy of the data/seq before
+ * rechecking the validity of the data.
+ */
+ smp_rmb();
+
+ if (READ_ONCE(d->id) == iter->id_next &&
+ data_valid(rb,
+ READ_ONCE(rb->data_list.oldest),
+ READ_ONCE(rb->data_list.newest),
+ data,
+ data_next)) {
+
+ iter->id = iter->id_next;
+ iter->id_next = READ_ONCE(d->next);
+ return size;
+ }
+ }
+
+ next_id = READ_ONCE(d->next);
+
+ /*
+ * Loaded a local copy of the next descr before
+ * checking the traversal-validity of the descr.
+ * (It is enough for the id to be consistent.)
+ */
+ smp_rmb();
+
+ if (READ_ONCE(d->id) == iter->id_next) {
+ iter->id = iter->id_next;
+ iter->id_next = next_id;
+ } else {
+ iter->id = EOL;
+ iter->id_next = READ_ONCE(rb->descr_list.oldest);
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(prb_iter_next_valid_entry);
+
+/**
+ * prb_inc_lost() - Increment internal lost counter.
+ * @rb: The ringbuffer, whose counter to modify.
+ *
+ * This is the public function available to writers to update statistics about
+ * failed writes where the writer has given up.
+ *
+ * Context: Any context.
+ */
+void prb_inc_lost(struct printk_ringbuffer *rb)
+{
+ atomic_long_inc(&rb->lost);
+}
+EXPORT_SYMBOL(prb_inc_lost);
--
2.11.0

2019-06-17 21:10:10

by Thomas Gleixner

Subject: Re: [RFC PATCH v2 0/2] printk: new ringbuffer implementation

On Fri, 7 Jun 2019, John Ogness wrote:

Polite ping ....

> This is a follow-up RFC on the work to reimplement much of
> the core of printk. The original thread can be seen here[0].
>
> One of the results of that thread was that the work needs to
> be broken up into several pieces. A roadmap was laid out[1]
> and this RFC is for the base component of the first piece:
> a new ringbuffer implementation for printk.
>
> This series does not touch any existing printk code. It is
> only the ringbuffer implementation. I am particularly
> interested in feedback relating to the design of the
> ringbuffer and the use of memory barriers.
>
> The series also includes a test module that performs some
> heavy writer stress testing. I have successfully run these
> tests on a 16-core ARM64 platform.
>
> John Ogness
>
> [0] https://lkml.kernel.org/r/[email protected]
> [1] https://lkml.kernel.org/r/[email protected]
>
> John Ogness (2):
> printk-rb: add a new printk ringbuffer implementation
> printk-rb: add test module
>
> Documentation/core-api/index.rst | 1 +
> Documentation/core-api/printk-ringbuffer.rst | 104 +++
> include/linux/printk_ringbuffer.h | 238 +++++++
> lib/Makefile | 2 +
> lib/printk_ringbuffer.c | 924 +++++++++++++++++++++++++++
> lib/test_prb.c | 237 +++++++
> 6 files changed, 1506 insertions(+)
> create mode 100644 Documentation/core-api/printk-ringbuffer.rst
> create mode 100644 include/linux/printk_ringbuffer.h
> create mode 100644 lib/printk_ringbuffer.c
> create mode 100644 lib/test_prb.c
>
> --
> 2.11.0
>
>

2019-06-18 04:53:08

by Sergey Senozhatsky

Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

Hello John,

On (06/07/19 18:29), John Ogness wrote:
[..]
> + struct prb_reserved_entry e;
> + char *s;
> +
> + s = prb_reserve(&e, &rb, 32);
> + if (s) {
> + sprintf(s, "Hello, world!");
> + prb_commit(&e);
> + }

A nit: snprintf().

sprintf() is tricky, it may write "slightly more than was
anticipated" bytes - all those string_nocheck(" disabled"),
error_string("pK-error"), etc.
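
Something like this would be safer (just a sketch, reusing the 32-byte
reservation from the example above):

	s = prb_reserve(&e, &rb, 32);
	if (s) {
		snprintf(s, 32, "Hello, world!");
		prb_commit(&e);
	}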

[..]
> +Sample reader code::
> +
> + DECLARE_PRINTKRB_ENTRY(entry, 128);
> + DECLARE_PRINTKRB_ITER(iter, &test_rb, &entry);
> + u64 last_seq = 0;
> + int len;
> + char *s;
> +
> + prb_for_each_entry(&iter, len) {
> + if (entry.seq - last_seq != 1) {
> + printf("LOST %llu ENTRIES\n",
> + entry.seq - (last_seq + 1));
> + }
> + last_seq = entry.seq;
> +
> + s = (char *)&entry.buffer[0];
> + if (len >= 128)
> + s[128 - 1] = 0;
> + printf("data: %s\n", s);
> + }

How are we going to handle pr_cont() loops?

print_modules()
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
pr_cont(" %s%s", mod->name, module_flags(mod, buf));
}
preempt_enable();

-ss

2019-06-18 07:16:34

by Petr Mladek

Subject: Re: [RFC PATCH v2 0/2] printk: new ringbuffer implementation

On Mon 2019-06-17 23:09:32, Thomas Gleixner wrote:
> On Fri, 7 Jun 2019, John Ogness wrote:
>
> Polite ping ....

I have started looking at the patchset yesterday. I informed John
that I was busy last week via a personal mail.

My first impression is very good. But I still have to dive much more
into the code.

Best Regards,
Petr

2019-06-18 11:13:11

by Peter Zijlstra

Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Fri, Jun 07, 2019 at 06:29:48PM +0206, John Ogness wrote:
> +/**
> + * struct prb_descr - A descriptor representing an entry in the ringbuffer.
> + * @seq: The sequence number of the entry.
> + * @id: The descriptor id.
> + * The location of the descriptor within the descriptor array can be
> + * determined from this value.
> + * @data: The logical position of the data for this entry.
> + * The location of the beginning of the data within the data array
> + * can be determined from this value.
> + * @data_next: The logical position of the data next to this entry.
> + * This is used to determine the length of the data as well as
> + * identify where the next data begins.
> + * @next: The id of the next (newer) descriptor in the linked list.
> + * A value of EOL means it is the last descriptor in the list.
> + *

For the entire patch; can you please vertically align those
descriptions? This is unreadable. Also, add some whitespace, to aid with
reading, something like do:

* struct prb_descr - A descriptor representing an entry in the ringbuffer.
*
* @seq: The sequence number of the entry.
*
* @id: The descriptor id.
* The location of the descriptor within the descriptor
* array can be determined from this value.
*
* @data: The logical position of the data for this entry. The
* location of the beginning of the data within the data
* array can be determined from this value.
*
* @data_next: The logical position of the data next to this entry.
* This is used to determine the length of the data as well as
* identify where the next data begins.
*
* @next: The id of the next (newer) descriptor in the linked list.
* A value of EOL means it is the last descriptor in the
* list.


2019-06-18 11:25:08

by Peter Zijlstra

Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Fri, Jun 07, 2019 at 06:29:48PM +0206, John Ogness wrote:
> +/**
> + * DOC: memory barriers

What's up with that 'DOC' crap?

> + *
> + * Writers
> + * -------
> + * The main issue with writers is expiring/invalidating old data blocks in
> + * order to create new data blocks. This is performed in 6 steps that must
> + * be observed in order by all writers to allow cooperation. Here is a list
> + * of the 6 steps and the named acquire/release memory barrier pairs that
> + * are used to synchronize them:
> + *
> + * * old data invalidation (MB1): Pushing rb.data_list.oldest forward.
> + * Necessary for identifying if data has been expired.
> + *
> + * * new data reservation (MB2): Pushing rb.data_list.newest forward.
> + * Necessary for validating data.
> + *
> + * * assign the data block to a descriptor (MB3): Setting data block id to
> + * descriptor id. Necessary for finding the descriptor associated with th
> + * data block.
> + *
> + * * commit data (MB4): Setting data block data_next. (Now data block is
> + * valid). Necessary for validating data.
> + *
> + * * make descriptor newest (MB5): Setting rb.descr_list.newest to descriptor.
> + * (Now following new descriptors will be linked to this one.) Necessary for
> + * ensuring the descriptor's next is set to EOL before adding to the list.
> + *
> + * * link descriptor to previous newest (MB6): Setting the next of the
> + * previous descriptor to this one. Necessary for correctly identifying if
> + * a descriptor is the only descriptor on the list.
> + *
> + * Readers
> + * -------
> + * Readers only make use of smp_rmb() to ensure that certain critical load
> + * operations are performed in an order that allows readers to evaluate if
> + * the data they read is really valid.
> + */

This isn't really helping much I feel. It doesn't begin to describe the
ordering. But maybe the code makes more sense.

2019-06-18 11:48:42

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Fri, Jun 07, 2019 at 06:29:48PM +0206, John Ogness wrote:
> +#define DATAARRAY_SIZE(rb) (1 << rb->data_array_size_bits)
> +#define DATAARRAY_SIZE_BITMASK(rb) (DATAARRAY_SIZE(rb) - 1)

*phew* no comments on those..

I think the kernel typically uses _MASK instead of _BITMASK for this
though.

> +/**
> + * DATA_INDEX() - Determine the data array index from logical position.
> + * @rb: The associated ringbuffer.
> + * @lpos: The logical position (data/data_next).
> + */
> +#define DATA_INDEX(rb, lpos) (lpos & DATAARRAY_SIZE_BITMASK(rb))
> +
> +/**
> + * DATA_WRAPS() - Determine how many times the data array has wrapped.
> + * @rb: The associated ringbuffer.
> + * @lpos: The logical position (data/data_next).
> + *
> + * The number of wraps is useful when determining if one logical position
> + * is overtaking the data array index of another logical position.
> + */
> +#define DATA_WRAPS(rb, lpos) (lpos >> rb->data_array_size_bits)
> +
> +/**
> + * DATA_THIS_WRAP_START_LPOS() - Get the position at the start of the wrap.
> + * @rb: The associated ringbuffer.
> + * @lpos: The logical position (data/data_next).
> + *
> + * Given a logical position, return the logical position if backed up to the
> + * beginning (data array index 0) of the current wrap. This is used when a
> + * data block wraps and therefore needs to begin at the beginning of the data
> + * array (for the next wrap).
> + */
> +#define DATA_THIS_WRAP_START_LPOS(rb, lpos) \
> + (DATA_WRAPS(rb, lpos) << rb->data_array_size_bits)

That's more easily written as: ((lpos) & ~MASK(rb))

> +
> +#define DATA_ALIGN sizeof(long)
> +#define DATA_ALIGN_SIZE(sz) \
> + ((sz + (DATA_ALIGN - 1)) & ~(DATA_ALIGN - 1))

We have ALIGN() for that
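
i.e. something like:

#define DATA_ALIGN_SIZE(sz) ALIGN(sz, DATA_ALIGN)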

> +
> +#define DESCR_COUNT_BITMASK(rb) (rb->descr_max_count - 1)

I think the kernel typically uses 'DESC' as shorthand for Descriptor.
Idem on the MASK vs BITMASK thing.

> +
> +/**
> + * DESCR_INDEX() - Determine the descriptor array index from the id.
> + * @rb: The associated ringbuffer.
> + * @id: The descriptor id.
> + */
> +#define DESCR_INDEX(rb, id) (id & DESCR_COUNT_BITMASK(rb))
> +
> +#define TO_DATABLOCK(rb, lpos) \
> + ((struct prb_datablock *)&rb->data_array[DATA_INDEX(rb, lpos)])

If I were paranoid, I'd point out that this evaluates @rb twice, and
doesn't have the macro arguments in parens.

> +#define TO_DESCR(rb, id) \
> + (&rb->descr_array[DESCR_INDEX(rb, id)])
> +
> +/**
> + * data_valid() - Check if a data block is valid.
> + * @rb: The ringbuffer containing the data.
> + * @oldest_data: The oldest data logical position.
> + * @newest_data: The newest data logical position.
> + * @data: The logical position for the data block to check.
> + * @data_next: The logical position for the data block next to this one.
> + * This value is used to identify the end of the data block.
> + *
> + * A data block is considered valid if it satisfies the two conditions:
> + *
> + * * oldest_data <= data < data_next <= newest_data
> + * * oldest_data is at most exactly 1 wrap behind newest_data
> + *
> + * Return: true if the specified data block is valid.
> + */
> +static inline bool data_valid(struct printk_ringbuffer *rb,
> + unsigned long oldest_data,
> + unsigned long newest_data,
> + unsigned long data, unsigned long data_next)
> +
> +{
> + return ((data - oldest_data) < DATAARRAY_SIZE(rb) &&
> + data_next != data &&
> + (data_next - data) < DATAARRAY_SIZE(rb) &&
> + (newest_data - data_next) < DATAARRAY_SIZE(rb) &&
> + (newest_data - oldest_data) <= DATAARRAY_SIZE(rb));

unsigned long size = DATA_SIZE(rb);

/* oldest_data <= data */
if (data - oldest_data >= size)
return false;

/* data < data_next */
if (data_next == data)
return false;

/* data_next <= newest_data */
if (newest_data - data_next >= size)
return false;

/* 1 wrap */
if (newest_data - oldest_data >= size)
return false;

return true;

> +}
> +
> +/**
> + * add_descr_list() - Add a descriptor to the descriptor list.
> + * @e: An entry that has already reserved data.
> + *
> + * The provided entry contains a pointer to a descriptor that has already
> + * been reserved for this entry. However, the reserved descriptor is not
> + * yet on the list. Add this descriptor as the newest item.
> + *
> + * A descriptor is added in two steps. The first step is to make this
> + * descriptor the newest. The second step is to update the "next" field of
> + * the former newest item to point to this item.
> + */
> +static void add_descr_list(struct prb_reserved_entry *e)
> +{
> + struct printk_ringbuffer *rb = e->rb;
> + struct prb_list *l = &rb->descr_list;
> + struct prb_descr *d = e->descr;
> + struct prb_descr *newest_d;
> + unsigned long newest_id;
> +
> + /* set as newest */
> + do {
> + /* MB5: synchronize add descr */
> + newest_id = smp_load_acquire(&l->newest);
> + newest_d = TO_DESCR(rb, newest_id);
> +
> + if (newest_id == EOL)
> + WRITE_ONCE(d->seq, 1);
> + else
> + WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
> + /*
> + * MB5: synchronize add descr
> + *
> + * In particular: next written before cmpxchg
> + */
> + } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);

What does this pair with? I find ->newest usage in:

- later this function with an MB6 comment
- remove_oldest_descr() with no comment
- expire_oldest_data() with an MB2 comment
- get_new_lpos() with no comment
- data_reserve() with an MB2 comment
- prb_iter_next_valid_entry() with no comment
(and the smp_rmb()s have no clear comments either).

In short; I've no frigging clue and I might as well just delete all
these comments and reverse engineer :-(

> +
> + if (unlikely(newest_id == EOL)) {
> + /* no previous newest means we *are* the list, set oldest */
> +
> + /*
> + * MB UNPAIRED

That's a bug, MB must always be paired.

> + *
> + * In particular: Force cmpxchg _after_ cmpxchg on newest.
> + */
> + WARN_ON_ONCE(cmpxchg_release(&l->oldest, EOL, e->id) != EOL);
> + } else {
> + /* link to previous chain */
> +
> + /*
> + * MB6: synchronize link descr
> + *
> + * In particular: Force cmpxchg _after_ cmpxchg on newest.

But why... and who cares.

> + */
> + WARN_ON_ONCE(cmpxchg_release(&newest_d->next,
> + EOL, e->id) != EOL);
> + }
> +}

2019-06-18 22:14:53

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-18, Sergey Senozhatsky <[email protected]> wrote:
>> + struct prb_reserved_entry e;
>> + char *s;
>> +
>> + s = prb_reserve(&e, &rb, 32);
>> + if (s) {
>> + sprintf(s, "Hello, world!");
>> + prb_commit(&e);
>> + }
>
> A nit: snprintf().
>
> sprintf() is tricky, it may write "slightly more than was
> anticipated" bytes - all those string_nocheck(" disabled"),
> error_string("pK-error"), etc.

Agreed. Documentation should show good examples.
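
Something like this for the sample writer code (32 being the size
that was reserved):

s = prb_reserve(&e, &rb, 32);
if (s) {
snprintf(s, 32, "Hello, world!");
prb_commit(&e);
}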

>> +Sample reader code::
>> +
>> + DECLARE_PRINTKRB_ENTRY(entry, 128);
>> + DECLARE_PRINTKRB_ITER(iter, &test_rb, &entry);
>> + u64 last_seq = 0;
>> + int len;
>> + char *s;
>> +
>> + prb_for_each_entry(&iter, len) {
>> + if (entry.seq - last_seq != 1) {
>> + printf("LOST %llu ENTRIES\n",
>> + entry.seq - (last_seq + 1));
>> + }
>> + last_seq = entry.seq;
>> +
>> + s = (char *)&entry.buffer[0];
>> + if (len >= 128)
>> + s[128 - 1] = 0;
>> + printf("data: %s\n", s);
>> + }
>
> How are we going to handle pr_cont() loops?
>
> print_modules()
> preempt_disable();
> list_for_each_entry_rcu(mod, &modules, list) {
> pr_cont(" %s%s", mod->name, module_flags(mod, buf));
> }
> preempt_enable();

pr_cont() (in its current form) is not related to the printk buffer
because cont messages use their own separate struct cont buffer. And for
the initial integration of the new ringbuffer I would leave that as it
is. Which means initially, pr_cont() would still sit behind a raw
spinlock and pr_cont() from NMI context would be stored as individual
messages.

However, to remove the spinlock of the cont buffer and allow pr_cont()
to work from NMI context, I would like to introduce a separate lockless
ringbuffer instance for cont that contains all the cont pieces
(including the caller_id). As soon as the caller_id changes from the
oldest record in the cont ringbuffer, that caller would assemble the
full cont message, popping all the pieces from the ringbuffer (with a
single cmpxchg) and insert the message into the printk ringbuffer.
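
Roughly sketched (the struct and names here are made up to illustrate
the idea, nothing is implemented yet):

/* one piece of a cont message, stored in a separate lockless ringbuffer */
struct cont_piece {
u32 caller_id; /* so pieces can be grouped per caller */
u16 len; /* length of text */
char text[0];
};

/*
* On writing a new piece: if the caller_id of the oldest piece
* differs from the current caller_id, pop all pieces of that
* oldest caller (a single cmpxchg on the oldest position),
* assemble them and insert the result into the printk ringbuffer
* as one message.
*/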

John Ogness

2019-06-18 22:19:13

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-18, Peter Zijlstra <[email protected]> wrote:
>> +/**
>> + * struct prb_descr - A descriptor representing an entry in the ringbuffer.
>> + * @seq: The sequence number of the entry.
>> + * @id: The descriptor id.
>> + * The location of the descriptor within the descriptor array can be
>> + * determined from this value.
>> + * @data: The logical position of the data for this entry.
>> + * The location of the beginning of the data within the data array
>> + * can be determined from this value.
>> + * @data_next: The logical position of the data next to this entry.
>> + * This is used to determine the length of the data as well as
>> + * identify where the next data begins.
>> + * @next: The id of the next (newer) descriptor in the linked list.
>> + * A value of EOL means it is the last descriptor in the list.
>> + *
>
> For the entire patch; can you please vertically align those
> descriptions? This is unreadable. Also, add some whitespace, to aid with
> reading, something like so:
>
> * struct prb_descr - A descriptor representing an entry in the ringbuffer.
> *
> * @seq: The sequence number of the entry.
> *
> * @id: The descriptor id.
> * The location of the descriptor within the descriptor
> * array can be determined from this value.
> *
> * @data: The logical position of the data for this entry. The
> * location of the beginning of the data within the data
> * array can be determined from this value.
> *
> * @data_next: The logical position of the data next to this entry.
> * This is used to determine the length of the data as well as
> * identify where the next data begins.
> *
> * @next: The id of the next (newer) descriptor in the linked list.
> * A value of EOL means it is the last descriptor in the
> * list.

OK. Thanks for taking the time to format an example.

John Ogness

2019-06-18 22:31:30

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-18, Peter Zijlstra <[email protected]> wrote:
>> +/**
>> + * DOC: memory barriers
>
> What's up with that 'DOC' crap?

The separate documentation in
Documentation/core-api/printk-ringbuffer.rst references this so it
automatically shows up in the kernel docs. An external reference
requires the DOC keyword.

Maybe the memory barrier descriptions do not belong in the kernel docs?

>> + *
>> + * Writers
>> + * -------
>> + * The main issue with writers is expiring/invalidating old data blocks in
>> + * order to create new data blocks. This is performed in 6 steps that must
>> + * be observed in order by all writers to allow cooperation. Here is a list
>> + * of the 6 steps and the named acquire/release memory barrier pairs that
>> + * are used to synchronize them:
>> + *
>> + * * old data invalidation (MB1): Pushing rb.data_list.oldest forward.
>> + * Necessary for identifying if data has been expired.
>> + *
>> + * * new data reservation (MB2): Pushing rb.data_list.newest forward.
>> + * Necessary for validating data.
>> + *
>> + * * assign the data block to a descriptor (MB3): Setting data block id to
>> + * descriptor id. Necessary for finding the descriptor associated with the
>> + * data block.
>> + *
>> + * * commit data (MB4): Setting data block data_next. (Now data block is
>> + * valid). Necessary for validating data.
>> + *
>> + * * make descriptor newest (MB5): Setting rb.descr_list.newest to descriptor.
>> + * (Now following new descriptors will be linked to this one.) Necessary for
>> + * ensuring the descriptor's next is set to EOL before adding to the list.
>> + *
>> + * * link descriptor to previous newest (MB6): Setting the next of the
>> + * previous descriptor to this one. Necessary for correctly identifying if
>> + * a descriptor is the only descriptor on the list.
>> + *
>> + * Readers
>> + * -------
>> + * Readers only make use of smp_rmb() to ensure that certain critical load
>> + * operations are performed in an order that allows readers to evaluate if
>> + * the data they read is really valid.
>> + */
>
> This isn't really helping much I feel. It doesn't begin to describe the
> ordering. But maybe the code makes more sense.

Sorry. I really have no feel about what (or how) exactly I should
document the memory barriers. I think the above comments make sense when
someone understands the details of the implementation. But perhaps it
should describe things such that someone without knowledge of the
implementation would understand what the memory barriers are for? That
would significantly increase the amount of text as I would have to
basically explain the implementation.

I would appreciate it if you could point out a source file that
documents its memory barriers the way you would like to see these memory
barriers documented.

John Ogness

2019-06-19 10:47:34

by Andrea Parri

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

> I would appreciate it if you could point out a source file that
> documents its memory barriers the way you would like to see these memory
> barriers documented.

IMO, you could find some inspiration by looking at the memory barriers
comments from:

kernel/sched/core.c:try_to_wake_up()
include/linux/wait.h:waitqueue_active()
kernel/futex.c [header _and inline annotations]

I'll detail a single example here, and then conclude with some general
guidelines:

---
[from kernel/sched/rt.c]

static inline void rt_set_overload(struct rq *rq)
{
if (!rq->online)
return;

cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
/*
* Make sure the mask is visible before we set
* the overload count. That is checked to determine
* if we should look at the mask. It would be a shame
* if we looked at the mask, but the mask was not
* updated yet.
*
* Matched by the barrier in pull_rt_task().
*/
smp_wmb();
atomic_inc(&rq->rd->rto_count);
}

static void pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
int rt_overload_count = rt_overloaded(this_rq);

if (likely(!rt_overload_count))
return;

/*
* Match the barrier from rt_set_overloaded; this guarantees that if we
* see overloaded we must also see the rto_mask bit.
*/
smp_rmb();

/* If we are the only overloaded CPU do nothing */
if (rt_overload_count == 1 &&
cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
return;

[...]

}
---

Notice that the comments provide the following information: for _each_
memory barrier primitive,

1) the _memory accesses_ being ordered

the store to ->rto_mask and the store to ->rto_count for the smp_wmb()
the load from ->rto_count and the load from ->rto_mask for the smp_rmb()

2) the _matching barrier_ (and its location)

3) an informal description of the _underlying guarantee(s)_ (c.f.,
"if we see overloaded we must also see the rto_mask bit").

One can provide this information by embedding some snippet/pseudo-code
in its comments as illustrated in the examples pointed out above.
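
For example, a barrier comment carrying all three pieces of
information could look like this (A, B and some_reader() being
placeholders):

/*
* Guarantee: a reader seeing the new B must also see the new A.
*
* Orders the store to A before the store to B. Matches the
* smp_rmb() in some_reader().
*/
smp_wmb();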

I'd suggest to _not be stingy with memory barriers explanations: this
eases/makes it possible the review itself as well as future changes or
fixes to the implementation.

FWIW (and as anticipated time ago in a private email), when I see code
like this I tend to look elsewhere... ;-/

Thanks,
Andrea

2019-06-19 11:08:59

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Wed, Jun 19, 2019 at 12:30:26AM +0200, John Ogness wrote:
> On 2019-06-18, Peter Zijlstra <[email protected]> wrote:
> >> +/**
> >> + * DOC: memory barriers
> >
> > What's up with that 'DOC' crap?
>
> The separate documentation in
> Documentation/core-api/printk-ringbuffer.rst references this so it
> automatically shows up in the kernel docs. An external reference
> requires the DOC keyword.
>
> Maybe the memory barrier descriptions do not belong in the kernel docs?

So i'm biased; I don't much care for Documentation/ -- code should be
readable and have sufficient comments; I hate rst and I think that
anything that detracts from reading code comments in an editor is pure
evil.

Personally, I've stopped using /** comments, life is better now.

YMMV


> Sorry. I really have no feel about what (or how) exactly I should
> document the memory barriers. I think the above comments make sense when
> someone understands the details of the implementation. But perhaps it
> should describe things such that someone without knowledge of the
> implementation would understand what the memory barriers are for? That
> would significantly increase the amount of text as I would have to
> basically explain the implementation.
>
> I would appreciate it if you could point out a source file that
> documents its memory barriers the way you would like to see these memory
> barriers documented.

Yeah, I was going to read the implementation and make suggestions; just
haven't gotten there yet.

2019-06-20 22:24:28

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

Hi Peter,

This is a long response, but we are getting into some fine details about
the memory barriers (as well as battling my communication skill level).

On 2019-06-18, Peter Zijlstra <[email protected]> wrote:
>> +#define DATAARRAY_SIZE(rb) (1 << rb->data_array_size_bits)
>> +#define DATAARRAY_SIZE_BITMASK(rb) (DATAARRAY_SIZE(rb) - 1)
>
> *phew* no comments on those..
>
> I think the kernel typically uses _MASK instead of _BITMASK for this
> though.

Yes, you are right.

>> +/**
>> + * DATA_INDEX() - Determine the data array index from logical position.
>> + * @rb: The associated ringbuffer.
>> + * @lpos: The logical position (data/data_next).
>> + */
>> +#define DATA_INDEX(rb, lpos) (lpos & DATAARRAY_SIZE_BITMASK(rb))
>> +
>> +/**
>> + * DATA_WRAPS() - Determine how many times the data array has wrapped.
>> + * @rb: The associated ringbuffer.
>> + * @lpos: The logical position (data/data_next).
>> + *
>> + * The number of wraps is useful when determining if one logical position
>> + * is overtaking the data array index of another logical position.
>> + */
>> +#define DATA_WRAPS(rb, lpos) (lpos >> rb->data_array_size_bits)
>> +
>> +/**
>> + * DATA_THIS_WRAP_START_LPOS() - Get the position at the start of the wrap.
>> + * @rb: The associated ringbuffer.
>> + * @lpos: The logical position (data/data_next).
>> + *
>> + * Given a logical position, return the logical position if backed up to the
>> + * beginning (data array index 0) of the current wrap. This is used when a
>> + * data block wraps and therefore needs to begin at the beginning of the data
>> + * array (for the next wrap).
>> + */
>> +#define DATA_THIS_WRAP_START_LPOS(rb, lpos) \
>> + (DATA_WRAPS(rb, lpos) << rb->data_array_size_bits)
>
> That's more easily written as: ((lpos) & ~MASK(rb))

Agreed.

>> +
>> +#define DATA_ALIGN sizeof(long)
>> +#define DATA_ALIGN_SIZE(sz) \
>> + ((sz + (DATA_ALIGN - 1)) & ~(DATA_ALIGN - 1))
>
> We have ALIGN() for that

OK.

>> +
>> +#define DESCR_COUNT_BITMASK(rb) (rb->descr_max_count - 1)
>
> I think the kernel typically uses 'DESC' as shorthand for Descriptor.
> Idem on the MASK vs BITMASK thing.

Yes, you are correct.

>> +
>> +/**
>> + * DESCR_INDEX() - Determine the descriptor array index from the id.
>> + * @rb: The associated ringbuffer.
>> + * @id: The descriptor id.
>> + */
>> +#define DESCR_INDEX(rb, id) (id & DESCR_COUNT_BITMASK(rb))
>> +
>> +#define TO_DATABLOCK(rb, lpos) \
>> + ((struct prb_datablock *)&rb->data_array[DATA_INDEX(rb, lpos)])
>
> If I were paranoid, I'd point out that this evaluates @rb twice, and
> doesn't have the macro arguments in parens.

There are several other macros that also should have some parens for the
arguments. Thanks for pointing that out.

As for the double evaluation, I'm not sure what should be done
instead. It is a convenience macro. I could split it into 2 macros and
have the caller always call the 2 macros. Is that desirable?
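
Or perhaps turn it into an inline function, which avoids the double
evaluation without needing two macros:

static inline struct prb_datablock *to_datablock(struct printk_ringbuffer *rb,
unsigned long lpos)
{
return (struct prb_datablock *)&rb->data_array[DATA_INDEX(rb, lpos)];
}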

>> +#define TO_DESCR(rb, id) \
>> + (&rb->descr_array[DESCR_INDEX(rb, id)])
>> +
>> +/**
>> + * data_valid() - Check if a data block is valid.
>> + * @rb: The ringbuffer containing the data.
>> + * @oldest_data: The oldest data logical position.
>> + * @newest_data: The newest data logical position.
>> + * @data: The logical position for the data block to check.
>> + * @data_next: The logical position for the data block next to this one.
>> + * This value is used to identify the end of the data block.
>> + *
>> + * A data block is considered valid if it satisfies the two conditions:
>> + *
>> + * * oldest_data <= data < data_next <= newest_data
>> + * * oldest_data is at most exactly 1 wrap behind newest_data
>> + *
>> + * Return: true if the specified data block is valid.
>> + */
>> +static inline bool data_valid(struct printk_ringbuffer *rb,
>> + unsigned long oldest_data,
>> + unsigned long newest_data,
>> + unsigned long data, unsigned long data_next)
>> +
>> +{
>> + return ((data - oldest_data) < DATAARRAY_SIZE(rb) &&
>> + data_next != data &&
>> + (data_next - data) < DATAARRAY_SIZE(rb) &&
>> + (newest_data - data_next) < DATAARRAY_SIZE(rb) &&
>> + (newest_data - oldest_data) <= DATAARRAY_SIZE(rb));
>
> unsigned long size = DATA_SIZE(rb);
>
> /* oldest_data <= data */
> if (data - oldest_data >= size)
> return false;
>
> /* data < data_next */
> if (data_next == data)
> return false;
>
> /* data_next <= newest_data */
> if (newest_data - data_next >= size)
> return false;
>
> /* 1 wrap */
> if (newest_data - oldest_data >= size)
> return false;
>
> return true;

Ha! That was my original implementation, but I changed it because I
figured there would be feedback telling me to put everything into a
single expression for compiler optimization. I am happy to use your
suggestion instead.

>> +}
>> +
>> +/**
>> + * add_descr_list() - Add a descriptor to the descriptor list.
>> + * @e: An entry that has already reserved data.
>> + *
>> + * The provided entry contains a pointer to a descriptor that has already
>> + * been reserved for this entry. However, the reserved descriptor is not
>> + * yet on the list. Add this descriptor as the newest item.
>> + *
>> + * A descriptor is added in two steps. The first step is to make this
>> + * descriptor the newest. The second step is to update the "next" field of
>> + * the former newest item to point to this item.
>> + */
>> +static void add_descr_list(struct prb_reserved_entry *e)
>> +{
>> + struct printk_ringbuffer *rb = e->rb;
>> + struct prb_list *l = &rb->descr_list;
>> + struct prb_descr *d = e->descr;
>> + struct prb_descr *newest_d;
>> + unsigned long newest_id;
>> +
>> + /* set as newest */
>> + do {
>> + /* MB5: synchronize add descr */
>> + newest_id = smp_load_acquire(&l->newest);
>> + newest_d = TO_DESCR(rb, newest_id);
>> +
>> + if (newest_id == EOL)
>> + WRITE_ONCE(d->seq, 1);
>> + else
>> + WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
>> + /*
>> + * MB5: synchronize add descr
>> + *
>> + * In particular: next written before cmpxchg
>> + */
>> + } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
>
> What does this pair with? I find ->newest usage in:

It is pairing with the smp_load_acquire() at the beginning of this loop
(also labeled MB5) that is running simultaneously on another CPU. I am
avoiding a possible situation that a new descriptor is added but the
store of "next" from the previous descriptor is not yet visible and thus
the following cmpxchg will fail, which is not allowed. (Note that "next"
is set to EOL shortly before this function is called.)

The litmus test for this is:

P0(int *newest, int *d_next)
{
// set descr->next to EOL (terminates list)
WRITE_ONCE(*d_next, 1);

// set descr as newest
smp_store_release(newest, 1);
}

P1(int *newest, int *d_next)
{
int local_newest;
int local_next;

// get newest descriptor
local_newest = smp_load_acquire(newest);

// a new descriptor is set as the newest
// (not relevant here)

// read descr->next of previous newest
// (must be EOL!)
local_next = READ_ONCE(*d_next);
}

exists (1:local_newest=1 /\ 1:local_next=0)

> - later this function with an MB6 comment
> - remove_oldest_descr() with no comment
> - expire_oldest_data() with an MB2 comment
> - get_new_lpos() with no comment
> - data_reserve() with an MB2 comment
> - prb_iter_next_valid_entry() with no comment
> (and the smp_rmb()s have no clear comments either).
>
> In short; I've no frigging clue and I might as well just delete all
> these comments and reverse engineer :-(

OK. I understand that I have failed horribly at commenting the
barriers. Perhaps I should submit a v3 with only new memory barrier
comments so that you can better understand and (hopefully) Andrea would
also be able to take a look?

>> +
>> + if (unlikely(newest_id == EOL)) {
>> + /* no previous newest means we *are* the list, set oldest */
>> +
>> + /*
>> + * MB UNPAIRED
>
> That's a bug, MB must always be paired.

Well, it "pairs" with the smp_rmb() of the readers, but I didn't think
that counts as a pair. That's why I wrote unpaired. The litmus test is:

P0(int *x, int *y)
{
WRITE_ONCE(*x, 1);
smp_store_release(y, 1);
}

P1(int *x, int *y)
{
int rx;
int ry;

ry = READ_ONCE(*y);
smp_rmb();
rx = READ_ONCE(*x);
}

exists (1:rx=0 /\ 1:ry=1)

The readers rely on the store_releases "pairing" with the smp_rmb() so
that the readers see things in a sane order.

For this particular case, I could change the
READ_ONCE(rb->descr_list.oldest) in iter_peek_next_id() and
prb_iter_next_valid_entry() to smp_load_acquires and then there would be
an official (and correct) pairing. But since the smp_rmb's are needed
anyway (for other fields), it really isn't necessary.

>> + *
>> + * In particular: Force cmpxchg _after_ cmpxchg on newest.
>> + */
>> + WARN_ON_ONCE(cmpxchg_release(&l->oldest, EOL, e->id) != EOL);
>> + } else {
>> + /* link to previous chain */
>> +
>> + /*
>> + * MB6: synchronize link descr
>> + *
>> + * In particular: Force cmpxchg _after_ cmpxchg on newest.
>
> But why... and who cares.

The comments on the matching MB6 and in the MB6 documentation are more
precise about this. But I guess they aren't clear enough either. :-/

It is important to understand that once the ringbuffer is full (which is
quite common for the printk buffer) then old data starts to be expired
and descriptors recycled. This is really where memory barriers become
critical.

In this situation, it becomes normal that a writer adding a new record
must first expire/recycle and old record. The writer moving the oldest
descriptor pointer forward is also the one that is re-initializing the
oldest descriptor and re-adding it to end of the reader-visible list.

As described in the documentation, a descriptor can not be removed from
the list if it is the only one on the list. This is verified by checking
if it's "next" is EOL and it is the "oldest". However, because
descriptors are recycled (and thus the next set to EOL) and oldest can
be moving (by writers on other CPUs), the check for this case must be
taken into careful consideration, which is where MB6 comes in.

If a writer changes "next" then that writer previously changed
"oldest". And when another writer checks "next" it needs to be
guaranteed that it also sees the updated "oldest".

The litmus test for this is:

P0(int *oldest, int *prev_newest_next)
{
// remove the oldest descriptor
WRITE_ONCE(*oldest, 1);

// set the previous newest next to this descriptor
smp_store_release(prev_newest_next, 1);
}

P1(int *oldest, int *prev_newest_next)
{
int local_oldest;
int local_next;

// get next of oldest descriptor
// (assuming prev_newest_next is now the oldest)
local_next = smp_load_acquire(prev_newest_next);

// read the oldest
local_oldest = READ_ONCE(*oldest);
}

exists (1:local_next=1 /\ 1:local_oldest=0)

>> + */
>> + WARN_ON_ONCE(cmpxchg_release(&newest_d->next,
>> + EOL, e->id) != EOL);
>> + }
>> +}

So this email has a lot of explanations. My first question is: do my
explanations make any sense to you? If yes, then I need to figure out
how to massively condense them to something that is appropriate for a
memory barrier comment. If no... I guess I need to find someone to
translate my madness into something that is understandable to the rest
of the world.

John Ogness

P.S. Is it correct that git HEAD herd7 does not support cmpxchg? For my
litmus tests I use stores and loads instead, which works and is correct,
but makes it more difficult for someone to match the test to the actual
code.

2019-06-20 22:52:30

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-19, Andrea Parri <[email protected]> wrote:
>> I would appreciate it if you could point out a source file that
>> documents its memory barriers the way you would like to see these memory
>> barriers documented.
>
> IMO, you could find some inspiration by looking at the memory barriers
> comments from:
>
> kernel/sched/core.c:try_to_wake_up()
> include/linux/wait.h:waitqueue_active()
> kernel/futex.c [header _and inline annotations]
>
> I'll detail a single example here, and then conclude with some general
> guidelines:
>
> ---
> [from kernel/sched/rt.c]
>
> static inline void rt_set_overload(struct rq *rq)
> {
> if (!rq->online)
> return;
>
> cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
> /*
> * Make sure the mask is visible before we set
> * the overload count. That is checked to determine
> * if we should look at the mask. It would be a shame
> * if we looked at the mask, but the mask was not
> * updated yet.
> *
> * Matched by the barrier in pull_rt_task().
> */
> smp_wmb();
> atomic_inc(&rq->rd->rto_count);
> }
>
> static void pull_rt_task(struct rq *this_rq)
> {
> int this_cpu = this_rq->cpu, cpu;
> bool resched = false;
> struct task_struct *p;
> struct rq *src_rq;
> int rt_overload_count = rt_overloaded(this_rq);
>
> if (likely(!rt_overload_count))
> return;
>
> /*
> * Match the barrier from rt_set_overloaded; this guarantees that if we
> * see overloaded we must also see the rto_mask bit.
> */
> smp_rmb();
>
> /* If we are the only overloaded CPU do nothing */
> if (rt_overload_count == 1 &&
> cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
> return;
>
> [...]
>
> }
> ---
>
> Notice that the comments provide the following information: for _each_
> memory barrier primitive,
>
> 1) the _memory accesses_ being ordered
>
> the store to ->rto_mask and the store to ->rto_count for the smp_wmb()
> the load from ->rto_count and the load from ->rto_mask for the smp_rmb()
>
> 2) the _matching barrier_ (and its location)
>
> 3) an informal description of the _underlying guarantee(s)_ (c.f.,
> "if we see overloaded we must also see the rto_mask bit").
>
> One can provide this information by embedding some snippet/pseudo-code
> in its comments as illustrated in the examples pointed out above.
>
> I'd suggest to _not be stingy with memory barriers explanations: this
> eases/makes it possible the review itself as well as future changes or
> fixes to the implementation.

Thank you for the specific examples and explanations. I need to frame
your email and hang it next to my monitor for reference.

> FWIW (and as anticipated time ago in a private email), when I see code
> like this I tend to look elsewhere... ;-/

Do you really mean "code" or are you just referring to "code comments"?
If you really mean code, then I'd appreciate some feedback about what
should change.

Your private email helped me a great deal. The memory barrier work in v2
is vastly superior to v1, even if it is still crap in your eyes. I
appreciate you continuing to support me on this.

John Ogness

2019-06-21 12:19:02

by Andrea Parri

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

> > FWIW (and as anticipated time ago in a private email), when I see code
> > like this I tend to look elsewhere... ;-/
>
> Do you really mean "code" or are you just referring to "code comments"?
> If you really mean code, then I'd appreciate some feedback about what
> should change.

I really just meant "uncommented code". ;-) I do plan to read your:

https://lkml.kernel.org/r/[email protected]

(and the code it's referring to) with due calm in the following days.
Thank you in advance for these remarks.

[Trying to address your question about herd7,]

A list of supported primitives is available from:

tools/memory-model/linux-kernel.def (left column)

This includes cmpxchg() (and its variants: _relaxed(), _acquire() and
_release()); however, herd7 can currently only parse statements like:

loc_var = cmpxchg(addr, old, new);

(and it will complain loudly without that "loc_var = " part...); this
is something that could be improved, or at least it seems this way...

A similar consideration holds for all the value-returning primitives.
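
So within a litmus test, something like:

P0(int *x)
{
int r0;

r0 = cmpxchg_release(x, 0, 1); /* parses */
}

whereas a bare "cmpxchg_release(x, 0, 1);" statement is rejected.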

Thanks,
Andrea

2019-06-21 14:07:28

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

Hi John,

I am still scratching my head around this. Anyway, I wanted to
write something. I am sorry that the answer is really long.
I do not know how to write it more effectively.

First, the documentation helped a lot. Also I found several
ideas that were important to make it work in a lockless way.
Especially:

+ separate buffer with descriptors of data entries
+ descriptor can be reused only when the related data
have already been discarded

There are a few comments about the naming and code at the end of
the mail. But I primarily want to write about the design, namely:

+ Linked list of descriptors
+ Code structure, consistency, barriers
+ Ideas


1. Linked list of descriptors
-----------------------------

The list of descriptors makes the code more complicated
and I do not see much gain. It is possible that I just missed
something.

If I get it correctly then the list could only grow by adding
never used members. The already added members are never removed
nor shuffled.

If the above is true then we could achieve a similar result
by using the array as a circular buffer. It would be
the same as if all members were linked from the beginning.

It would allow to remove:

+ desc->id because it will be the same as desc->seq
+ desc->next because the address of the next member
can be easily computed
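
Roughly (reusing your existing macros, with seq taking over the role
of id):

desc = &rb->descr_array[DESCR_INDEX(rb, seq)];
next_desc = &rb->descr_array[DESCR_INDEX(rb, seq + 1)];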


2. Consistency, barriers, and code structure
--------------------------------------------

I haven't got the whole picture about the code logic so far.
Maybe I haven't tried hard enough. I actually spent quite
some time playing with some alternatives.

In each case, the code is very complicated (of course, the problem
is complicated):

+ 6 steps (barriers) are needed to synchronize writers.
This means a lot of variants of possible races.

+ The six barriers are somehow related to 6 variables.
But there are several other variables that are being
modified. It needs to be checked that they can be
safely modified/read at the given locations.


OK, the structures have a lifecycle:

+ descriptor:
+ free
+ taken

+ data block:
+ reserved
+ committed
+ correctly read
+ freed

And there are few basic questions about each state:

+ where and how the state is set
+ where and how it is invalidated
+ where and how the state is checked in other code
+ how is it achieved that the state is the same
as long as needed

Some of the answers are easier to find than the others.
So far I found one suspicious thing in expire_oldest_data().

To be honest, I am not sure how to describe this effectively.
It might help to better describe the barriers (what they
synchronize (a after b), where the counterpart is,
and why it is needed from some top-level point of view).

Of course, the best solution is an easy to follow code.
This brings me to the next section.

3. Ideas
--------

I started reading your code and I thought that it must have
been possible to write it in a more straightforward way. I tried
it and reached many dead ends so far ;-) But it helped me to
better understand your code.

I have not given up yet and would like to give it some
more time. Unfortunately, I will not have much time
the next week.

Anyway, I am trying to:

+ use the array of descriptors as a ring buffer
(no list, no id, only the sequence number)

+ distinguish the state of the data by some
flags in struct prb_desc to avoid complicated
and tricky checks

It seems that the ring buffer of descriptors really makes
things easier.

Regarding the flags. I have something like:

struct prb_desc
{
unsigned long seq;
bool committed;
bool freed;
}

The basic idea with the flags is that they are valid only
when the seq number in the structure is valid. The newly
reserved struct prb_desc is written the following way:

static void prb_init_desc(struct prb_desc *desc, unsigned long seq)
{
desc->committed = false;
desc->freed = false;

/*
* Flags must be cleared before we tell others that they
* are for this sequence number.
*/
smp_wmb();

desc->seq = seq;
}

Then we could have checks like:

/*
* Used by readers to check if the data are valid.
* It has to be called twice (before and after)
* to make sure that the read data are valid.
*/
static bool prb_data_valid(struct printk_ringbuffer *rb,
unsigned long seq)
{
struct prb_desc *desc = TO_DESC(rb, seq);

if (READ_ONCE(desc->seq) != seq)
return false;

/* Do not read outdated flags, see prb_init_desc() */
smp_rmb();

return READ_ONCE(desc->committed) && !READ_ONCE(desc->freed);
}

I am not sure if these extra flags are really needed and useful.
This is why I play with it myself. I do not want to ask you to
spend a lot of time with my crazy ideas.

Anyway, the above approach looked promising until I tried
to free data from the data array. The problem is how to prove
that the sequence number read from the data array is not
garbage. BTW: I think that your expire_oldest_data() is
buggy from this point of view, see below.

I think that it might be much safer if we mask the two
highest bits of the seq number and use them for the flags.
Then we could track the state of the given sequence number
in a very safe and straightforward way.
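
For example (just a sketch of the encoding, the names are made up):

#define DESC_COMMITTED_MASK (1UL << (BITS_PER_LONG - 1))
#define DESC_FREED_MASK (1UL << (BITS_PER_LONG - 2))
#define DESC_FLAGS_MASK (DESC_COMMITTED_MASK | DESC_FREED_MASK)
#define DESC_SEQ_MASK (~DESC_FLAGS_MASK)

Then the state and the sequence number could be read or updated
together with a single cmpxchg.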



Finally, here are some comments about the original patch:

On Fri 2019-06-07 18:29:48, John Ogness wrote:
> See documentation for details.

Please, mention here some basics. It might be enough to copy the
following sections from the documentation:

Overview
Features
Behavior

Note that the documentation is written via .rst file. You need to
build html or pdf to get all the pieces together.


> diff --git a/include/linux/printk_ringbuffer.h b/include/linux/printk_ringbuffer.h
> new file mode 100644
> index 000000000000..569980a61c0a
> --- /dev/null
> +++ b/include/linux/printk_ringbuffer.h
> @@ -0,0 +1,238 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_PRINTK_RINGBUFFER_H
> +#define _LINUX_PRINTK_RINGBUFFER_H
> +
> +#include <linux/atomic.h>
> +
> +/**
> + * struct prb_list - An abstract linked list of items.
> + * @oldest: The oldest item on the list.
> + * @newest: The newest item on the list.

I admit that I got confused by this. I wonder if there is another
location in the kernel where lists are handled this way.

I have always seen in kernel only lists handled via the struct
list_head trick. Where the same structure is bundled in all
linked members.

I can't find a good name. I would personally remove the structure
and add the members into the related structures directly.

Also I would personally use "first" and "last" because they are
shorter and easier to visually distinguish. I know that "oldest"
and "newest" are more clear but...


> +/**
> + * struct prb_descr - A descriptor representing an entry in the ringbuffer.

I agree with Peter that "desc" is a better shortcut.

> + * @seq: The sequence number of the entry.
> + * @id: The descriptor id.
> + * The location of the descriptor within the descriptor array can be
> + * determined from this value.
> + * @data: The logical position of the data for this entry.
> + * The location of the beginning of the data within the data array
> + * can be determined from this value.

I was quite confused by this name. Please, use "lpos". It will make
clear that it is the logical position. Also it will be clear
that desc->data is the same as the lpos used at other locations
in the code.


> + * @data_next: The logical position of the data next to this entry.
> + * This is used to determine the length of the data as well as
> + * identify where the next data begins.

next_lpos

> + * @next: The id of the next (newer) descriptor in the linked list.
> + * A value of EOL means it is the last descriptor in the list.
> + *
> + * Descriptors are used to identify where the data for each entry is and
> + * also provide an ordering for readers. Entry ordering is based on the
> + * descriptor linked list (not the ordering of data in the data array).
> + */
> +struct prb_descr {
> + /* private */
> + u64 seq;
> + unsigned long id;
> + unsigned long data;
> + unsigned long data_next;
> + unsigned long next;
> +};
> +
> +/**
> + * struct printk_ringbuffer - The ringbuffer structure.
> + * @data_array_size_bits: The size of the data array as a power-of-2.

I would use "data_size_bits"

> + * @data_array: A pointer to the data array.

and "data"

> + * @data_list: A list of entry data.
> + * Since the data list is not traversed, this list is only used to
> + * mark the contiguous section of the data array that is in use.
> + * @descr_max_count: The maximum amount of descriptors allowed.
> + * @descr_array: A pointer to the descriptor array.

descs?

> +/**
> + * prb_for_each_entry() - Iterate through all the entries of a ringbuffer.
> + * @i: A pointer to an iterator.
> + * @l: An integer used to identify when the last entry is traversed.
> + *
> + * This macro expects the iterator to be initialized. It also does not reset
> + * the iterator. So if the iterator has already been used for some traversal,
> + * this macro will continue where the iterator left off.
> + */
> +#define prb_for_each_entry(i, l) \
> + for (; (l = prb_iter_next_valid_entry(i)) != 0;)

This is a very unusual semantic. Please, define two iterators:

prb_for_each_entry() - iterate over all entries
prb_for_each_entry_continue() - iterate from the given entry
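
Something like this (prb_iter_reset() is hypothetical here, whatever
reinitializes the iterator):

#define prb_for_each_entry(i, l) \
for (prb_iter_reset(i); (l = prb_iter_next_valid_entry(i)) != 0;)

#define prb_for_each_entry_continue(i, l) \
for (; (l = prb_iter_next_valid_entry(i)) != 0;)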

> +/**
> + * expire_oldest_data() - Invalidate the oldest data block.
> + * @rb: The ringbuffer containing the data block.
> + * @oldest_lpos: The logical position of the oldest data block.
> + *
> + * This function expects to "push" the pointer to the oldest data block
> + * forward, thus invalidating the oldest data block. However, before pushing,
> + * it is verified if the data block is valid. (For example, if the data block
> + * was reserved but not yet committed, it is not permitted to invalidate the
> + * "in use by a writer" data.)
> + *
> + * If the data is valid, it will be associated with a descriptor, which will
> + * then provide the necessary information to validate the data.
> + *
> + * Return: true if the oldest data was invalidated (regardless if this
> + * task was the one that did it or not), otherwise false.
> + */
> +static bool expire_oldest_data(struct printk_ringbuffer *rb,
> + unsigned long oldest_lpos)
> +{
> + unsigned long newest_lpos;
> + struct prb_datablock *b;
> + unsigned long data_next;
> + struct prb_descr *d;
> + unsigned long data;
> +
> + /* MB2: synchronize data reservation */
> + newest_lpos = smp_load_acquire(&rb->data_list.newest);
> +
> + b = TO_DATABLOCK(rb, oldest_lpos);
> +
> + /* MB3: synchronize descr setup */
> + d = TO_DESCR(rb, smp_load_acquire(&b->id));
> +
> + data = READ_ONCE(d->data);
> +
> + /* sanity check to check to see if b->id was correct */
> + if (oldest_lpos != data)
> + goto out;

Is this cross check really enough?
How is it ensured that the data are committed?
How is it ensured that the descriptor is not an outdated one?

IMHO, there might be garbage in the data array. It might by chance
point to an outdated descriptor that pointed to this
data range in the past. I agree that it is very unlikely. But
we cannot afford such a risk.

> + /* MB4: synchronize commit */
> + data_next = smp_load_acquire(&d->data_next);
> +
> + if (!data_valid(rb, oldest_lpos, newest_lpos, data, data_next))
> + goto out;
> +
> + /* MB1: synchronize data invalidation */
> + cmpxchg_release(&rb->data_list.oldest, data, data_next);
> +
> + /* Some task (maybe this one) successfully expired the oldest data. */
> + return true;
> +out:
> + return (oldest_lpos != READ_ONCE(rb->data_list.oldest));
> +}

Best Regards,
Petr

2019-06-24 08:51:12

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

Hi Petr,

On 2019-06-21, Petr Mladek <[email protected]> wrote:
> I am still scratching head around this. Anyway, I wanted to
> write something. I am sorry that the answer is really long.
> I do not know how to write it more effectively.

No need to apologize (to me) for long answers.

> 1. Linked list of descriptors
> -----------------------------
>
> The list of descriptors makes the code more complicated
> and I do not see much gain. It is possible that I just missed
> something.
>
> If I get it correctly then the list could only grow by adding
> never used members. The already added members are never removed
> nor shuffled.
>
> If the above is true then we could achieve a similar result
> by using the array as a circular buffer. It would be
> the same as if all members were linked from the beginning.

So you are suggesting using a multi-reader multi-writer lockless
ringbuffer to implement a multi-reader multi-writer lockless
ringbuffer. ;-)

The descriptor ringbuffer has fixed-size items, which simplifies the
task. But I expect you will run into a chicken-egg scenario.

> It would allow to remove:
>
> + desc->id because it will be the same as desc->seq
> + desc->next because the address of the next member
> can be easily computed

Yes, you will remove these and then replace them with new variables to
track array-element state.

> 2. Consistency, barriers, and code structure
> --------------------------------------------
>
> I haven't got the whole picture about the code logic so far.
> Maybe I haven't tried hard enough. I actually spent quite
> some time playing with some alternatives.
>
> In each case, the code is very complicated (of course, the problem
> is complicated):
>
> + 6 steps (barriers) are needed to synchronize writers.
> This means a lot of variants of possible races.

Agreed. The writer documentation explains those 6 steps, which are 6
atomic operations. I am not sure if you would be able to reduce the 6
steps by using a descriptor ringbuffer. Instead of cmpxchg'ing list
pointers you will be cmpxchg'ing state variables.

Using litmus tests and lots of testing on SMP arm64, I have been able
to hit and address these races (i.e. removing one of the barriers causes
the test module to fail on SMP arm64). But if we can simplify the
design, that would certainly help to deal with races.

> + The six barriers are somehow related to 6 variables.
> But there are several other variables that are being
> modified. It needs to be checked that they can be
> safely modified/read at the given locations.

Here are the writer-relevant memory barriers and their associated
variables:

MB1: data_list.oldest
MB2: data_list.newest
MB3: data_block.id
MB4: descr.data_next
MB5: descr_list.newest
MB6: descr.next

The only other variables that I see as relevant are:

descr.id: This variable is used often and is quite important (the basis
for linked lists and descriptor validation). I will go through all its
uses again. My memory barrier comments should definitely include this
variable in their explanations.

descr.data: This is set when the descriptor is not part of the list and
is indirectly synchronized by MB3 (written before store_release of
data_block.id and loaded as data dependent on the load_acquire of
data_block.id).

descr.seq: This is set when the descriptor is not part of the list and
is indirectly synchronized by MB5 (written before store_release of
descr_list.newest and loaded as data dependent on the load_acquire of
descr_list.newest).

descr_list.oldest: As I explained[0] to Peter, this variable is
indirectly synchronized by MB6 when there is only 1 descriptor on the
list (which is not the normal case). Otherwise it has no
synchronization. This could lead to a writer unnecessarily trying
multiple times to remove the oldest descriptor because of failed
cmpxchg() calls. But that isn't really an issue. (And now revisiting
remove_oldest_descr() I see I can pull the READ_ONCE(l->oldest) out of
the loop and use the cmpxchg() return value for the next iteration.)
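
i.e. the usual pattern, sketched:

oldest_id = READ_ONCE(l->oldest);
do {
/* ... decide whether removal is allowed, compute new_id ... */
id = oldest_id;
oldest_id = cmpxchg(&l->oldest, id, new_id);
} while (oldest_id != id); /* on failure, reuse the returned value */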

Are there any other variables that you are referring to?

> OK, the structures have a lifecycle:
>
> + descriptor:
> + free
> + taken
>
> + data block:
> + reserved
> + committed
> + correctly read
> + freed

IMO, it is:

+ descriptor:
+ free
+ taken
+ valid (i.e. data/data_next have been set by the writer)

+ data block:
+ reserved
+ committed
+ invalid/garbage

> And there are few basic questions about each state:
>
> + where and how the state is set
> + where and how it is invalidated
> + where and how the state is checked in other code
> + how is it achieved that the state is the same
> as long as needed
>
> Some of the answers are easier to find than the others.
> So far I found one suspicious thing in expire_oldest_data().
>
> To be honest, I am not sure how to describe this effectively.
> It might help to better describe the barriers (what they
> synchronize (a after b) and where is the counterpart,
> and why it is needed from some top level point of view).
>
> Of course, the best solution is an easy to follow code.
> This brings me to the next section.
>
> 3. Ideas
> --------
>
> I started reading your code and I thought that it must have
> been possible to write it in a more straightforward way. I tried
> it and reached many dead ends so far ;-) But it helped me to
> better understand your code.
>
> I have not given up yet and would like to give it some
> more time. Unfortunately, I will not have much time
> the next week.
>
> Anyway, I am trying to:
>
> + use the array of descriptors as a ring buffer
> (no list, no id, only the sequence number)
>
> + distinguish the state of the data by some
> flags in struct prb_desc to avoid complicated
> and tricky checks
>
> It seems that the ring buffer of descriptors really makes
> things easier.
>
> Regarding the flags. I have something like:
>
> struct prb_desc
> {
> unsigned long seq;
> bool committed;
> bool freed;
> }
>
> The basic idea with the flags is that they are valid only
> when the seq number in the structure is valid. The newly
> reserved struct prb_desc is written the following way:
>
> static void prb_init_desc(struct prb_desc *desc, unsigned long seq)
> {
> desc->committed = false;
> desc->freed = false;
>
> /*
> * Flags must be cleared before we tell others that they
> * are for this sequence number.
> */
> smp_wmb();
>
> desc->seq = seq;
> }
>
> Then we could have checks like:
>
> /*
> * Used by readers to check if the data are valid.
> * It has to be called twice (before and after)
> * to make sure that the read data are valid.
> */
> static bool prb_data_valid(struct printk_ringbuffer *rb,
> unsigned long seq)
> {
> struct prb_desc *desc = TO_DESC(rb, seq);
>
> if (READ_ONCE(desc->seq) != seq)
> return false;
>
> /* Do not read outdated flags, see prb_init_desc() */
> smp_rmb();
>
> return READ_ONCE(desc->committed) && !READ_ONCE(desc->freed);
> }
>
> I am not sure if these extra flags are really needed and useful.
> This is why I play with it myself. I do not want to ask you to
> spend a lot of time with my crazy ideas.

But thank you for sharing them. I always welcome new ideas.

> Anyway, the above approach looked promising until I tried
> to free data from the data array. The problem is how to prove
> that the sequence number read from the data array is not
> garbage. BTW: I think that your expire_oldest_data() is
> buggy from this point of view, see below.

You do point out an issue for 32-bit systems. Below I include an
explanation and patch.

> I think that it might be much safer if we mask the two
> highest bits of the seq number and use them for the flags.
> Then we could track the state of the given sequence number
> in a very safe and straightforward way.

When I first started to design/code this, I implemented something quite
similar: using a single variable to represent state and id. This works
nicely for cmpxchg operations (lockless synchronization) and reader
validation. The problem I ran into was a chicken-egg problem (which I
suspect you will also run into).

I solved this problem by changing the design to use a linked list for
the descriptors. At first I had kept state information for each
descriptor. But later I realized that state information was not
necessary because the linked list itself was providing implicit state
information.

I do not claim that using linked lists for the descriptors is absolutely
necessary. But it was the only way that I could figure out how to make
everything work. Now that I have something that works (and the
experience of getting it there), maybe I could make it work without a
linked list. I will let the idea simmer on my brain in the background
and I am following your experimentation/ideas with great curiosity.

> Finally, here are some comments about the original patch:
>
> On Fri 2019-06-07 18:29:48, John Ogness wrote:
>> See documentation for details.
>
> Please, mention here some basics. It might be enough to copy the
> following sections from the documentation:
>
> Overview
> Features
> Behavior

Ugh. Do we really want all that in a commit message?

> Note that the documentation is written via .rst file. You need to
> build html or pdf to get all the pieces together.

Yes, but isn't that how all the kernel docs are supposed to be for the
future?

>> diff --git a/include/linux/printk_ringbuffer.h b/include/linux/printk_ringbuffer.h
>> new file mode 100644
>> index 000000000000..569980a61c0a
>> --- /dev/null
>> +++ b/include/linux/printk_ringbuffer.h
>> @@ -0,0 +1,238 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef _LINUX_PRINTK_RINGBUFFER_H
>> +#define _LINUX_PRINTK_RINGBUFFER_H
>> +
>> +#include <linux/atomic.h>
>> +
>> +/**
>> + * struct prb_list - An abstract linked list of items.
>> + * @oldest: The oldest item on the list.
>> + * @newest: The newest item on the list.
>
> I admit that I got confused by this. I wonder if there is another
> location in the kernel where lists are handled this way.
>
> I have always seen in kernel only lists handled via the struct
> list_head trick. Where the same structure is bundled in all
> linked members.
>
> I can't find a good name. I would personally remove the structure
> and add the members into the related structures directly.

The only reason I made it a struct is so that I could just write
l->oldest instead of rb->descr_list_oldest. But it is an otherwise
useless struct that I can remove.

> Also I would personally use "first" and "last" because they are
> shorter and easier to visually distinguish. I know that "oldest"
> and "newest" are more clear but...

I don't like "oldest" and "newest" either, but it is immediately
clear. How about:

rb->data_lpos_oldest (formerly rb->data_list.oldest)
rb->data_lpos_newest (formerly rb->data_list.newest)
rb->desc_id_oldest (formerly rb->descr_list.oldest)
rb->desc_id_newest (formerly rb->descr_list.newest)

If using the strings "oldest" and "newest" is too ugly for people, I
have no problems using first/last or head/tail, even if IMHO they add
unnecessary confusion.

>> +/**
>> + * struct prb_descr - A descriptor representing an entry in the ringbuffer.
>
> I agree with Peter that "desc" is a better shortcut.

OK.

>> + * @seq: The sequence number of the entry.
>> + * @id: The descriptor id.
>> + * The location of the descriptor within the descriptor array can be
>> + * determined from this value.
>> + * @data: The logical position of the data for this entry.
>> + * The location of the beginning of the data within the data array
>> + * can be determined from this value.
>
> I was quite confused by this name. Please, use "lpos". It will make
> clear that it is the logical position. Also it will be clear
> that desc->data is the same as lpos used on other location
> in the code.

Agreed.

>> + * @data_next: The logical position of the data next to this entry.
>> + * This is used to determine the length of the data as well as
>> + * identify where the next data begins.
>
> next_lpos

How about lpos_next?

>> + * @next: The id of the next (newer) descriptor in the linked list.
>> + * A value of EOL means it is the last descriptor in the list.
>> + *
>> + * Descriptors are used to identify where the data for each entry is and
>> + * also provide an ordering for readers. Entry ordering is based on the
>> + * descriptor linked list (not the ordering of data in the data array).
>> + */
>> +struct prb_descr {
>> + /* private */
>> + u64 seq;
>> + unsigned long id;
>> + unsigned long data;
>> + unsigned long data_next;
>> + unsigned long next;
>> +};
>> +
>> +/**
>> + * struct printk_ringbuffer - The ringbuffer structure.
>> + * @data_array_size_bits: The size of the data array as a power-of-2.
>
> I would use "data_size_bits"

OK.

>> + * @data_array: A pointer to the data array.
>
> and "data"

OK.

>> + * @data_list: A list of entry data.
>> + * Since the data list is not traversed, this list is only used to
>> + * mark the contiguous section of the data array that is in use.
>> + * @descr_max_count: The maximum amount of descriptors allowed.
>> + * @descr_array: A pointer to the descriptor array.
>
> descs?

OK.

>> +/**
>> + * prb_for_each_entry() - Iterate through all the entries of a ringbuffer.
>> + * @i: A pointer to an iterator.
>> + * @l: An integer used to identify when the last entry is traversed.
>> + *
>> + * This macro expects the iterator to be initialized. It also does not reset
>> + * the iterator. So if the iterator has already been used for some traversal,
>> + * this macro will continue where the iterator left off.
>> + */
>> +#define prb_for_each_entry(i, l) \
>> + for (; (l = prb_iter_next_valid_entry(i)) != 0;)
>
> This is very unusual semantic. Please, define two iterators:
>
> prb_for_each_entry() - iterate over all entries
> prb_for_each_entry_continue() - iterate from the given entry

OK.
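
For v3 that would be something like (sketch; this assumes an
iterator-reset helper such as prb_iter_init() exists):

    /* iterate over all entries, starting from the oldest */
    #define prb_for_each_entry(i, l) \
            for (prb_iter_init(i); ((l) = prb_iter_next_valid_entry(i)) != 0;)

    /* continue iterating from wherever the iterator left off */
    #define prb_for_each_entry_continue(i, l) \
            for (; ((l) = prb_iter_next_valid_entry(i)) != 0;)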

>> +/**
>> + * expire_oldest_data() - Invalidate the oldest data block.
>> + * @rb: The ringbuffer containing the data block.
>> + * @oldest_lpos: The logical position of the oldest data block.
>> + *
>> + * This function expects to "push" the pointer to the oldest data block
>> + * forward, thus invalidating the oldest data block. However, before pushing,
>> + * it is verified if the data block is valid. (For example, if the data block
>> + * was reserved but not yet committed, it is not permitted to invalidate the
>> + * "in use by a writer" data.)
>> + *
>> + * If the data is valid, it will be associated with a descriptor, which will
>> + * then provide the necessary information to validate the data.
>> + *
>> + * Return: true if the oldest data was invalidated (regardless if this
>> + * task was the one that did it or not), otherwise false.
>> + */
>> +static bool expire_oldest_data(struct printk_ringbuffer *rb,
>> + unsigned long oldest_lpos)
>> +{
>> + unsigned long newest_lpos;
>> + struct prb_datablock *b;
>> + unsigned long data_next;
>> + struct prb_descr *d;
>> + unsigned long data;
>> +
>> + /* MB2: synchronize data reservation */
>> + newest_lpos = smp_load_acquire(&rb->data_list.newest);
>> +
>> + b = TO_DATABLOCK(rb, oldest_lpos);
>> +
>> + /* MB3: synchronize descr setup */
>> + d = TO_DESCR(rb, smp_load_acquire(&b->id));
>> +
>> + data = READ_ONCE(d->data);
>> +
>> + /* sanity check to see if b->id was correct */
>> + if (oldest_lpos != data)
>> + goto out;
>
> Is this cross check really enough?

This is only a cheap sanity check to filter out most garbage. However,
it still is not a guarantee that b is valid. At this point d could be
some random descriptor that by chance is pointing to the loaded oldest
value.

> How is it ensured that the data are committed?

The following data_valid() call checks this. If it is valid, it is
committed.

> How is it ensured that the descriptor is not an outdated one?
>
> IMHO, there might be garbage in the data array. It might by chance
> point to an outdated descriptor that by chance pointed to this
> data range in the past. I agree that it is very unlikely. But
> we cannot afford such a risk.

An outdated descriptor that has a data value (lpos) matching the oldest
(lpos) would mean that the lpos has completely wrapped (4GB of data on a
32-bit system) without the descriptor having been recycled. It should be
possible to force such a situation on a 32-bit system, so this issue
does need to be addressed. Thanks.

>> + /* MB4: synchronize commit */
>> + data_next = smp_load_acquire(&d->data_next);
>> +
>> + if (!data_valid(rb, oldest_lpos, newest_lpos, data, data_next))
>> + goto out;
>> +
>> + /* MB1: synchronize data invalidation */
>> + cmpxchg_release(&rb->data_list.oldest, data, data_next);
>> +
>> + /* Some task (maybe this one) successfully expired the oldest data. */
>> + return true;
>> +out:
>> + return (oldest_lpos != READ_ONCE(rb->data_list.oldest));
>> +}

Currently the descriptor's data/data_next values are left "as is" until
the descriptor is used again. This creates a risk (realistically only on
32-bit systems): if the unsigned long lpos completely wraps around while
that descriptor is never reused, a writer racing to expire the oldest
data could read values that point to that unused descriptor, whose data
(lpos) matches oldest (lpos) but whose data_next is not valid. This
would corrupt the data array.

To avoid this, explicitly make the descriptor invalid (independent of
any future oldest/newest values) after expiring the data.

diff --git a/lib/printk_ringbuffer.c b/lib/printk_ringbuffer.c
index d0b2b6a549b0..43735d9429b2 100644
--- a/lib/printk_ringbuffer.c
+++ b/lib/printk_ringbuffer.c
@@ -451,7 +451,16 @@ static bool expire_oldest_data(struct printk_ringbuffer *rb,
goto out;

/* MB1: synchronize data invalidation */
- cmpxchg_release(&rb->data_list.oldest, data, data_next);
+ if (cmpxchg_release(&rb->data_list.oldest, data, data_next) == data) {
+ /*
+ * Set data_next to match data so the descriptor is invalid.
+ * This avoids the possibility of mistaking it as valid in
+ * case of a full lpos value wrap around.
+ *
+ * If this fails, the descriptor has already been recycled.
+ */
+ cmpxchg(&d->data_next, data_next, data);
+ }

/* Some task (maybe this one) successfully expired the oldest data. */
return true;

John Ogness

[0] https://lkml.kernel.org/r/[email protected]

2019-06-24 14:20:34

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

Hi,

I would like to point out an important typo related to memory
barriers...

On 2019-06-07, John Ogness <[email protected]> wrote:
> diff --git a/lib/printk_ringbuffer.c b/lib/printk_ringbuffer.c
> new file mode 100644
> index 000000000000..d0b2b6a549b0
> --- /dev/null
> +++ b/lib/printk_ringbuffer.c
[...]
> +static struct prb_descr *remove_oldest_descr(struct printk_ringbuffer *rb)
> +{
> + struct prb_list *l = &rb->descr_list;
> + unsigned long oldest_id;
> + struct prb_descr *d;
> + unsigned long next;
> +
> + for (;;) {
> + oldest_id = READ_ONCE(l->oldest);
> +
> + /* list empty */
> + if (oldest_id == EOL)
> + return NULL;
> +
> + d = TO_DESCR(rb, oldest_id);
> +
> + /* only descriptors with _invalid_ data can be removed */
> + if (data_valid(rb, READ_ONCE(rb->data_list.oldest),
> + READ_ONCE(rb->data_list.newest),
> + READ_ONCE(d->data),
> + READ_ONCE(d->data_next))) {
> + return NULL;
> + }
> +
> + /*
> + * MB6: synchronize link descr
> + *
> + * In particular: l->oldest is loaded as a data dependency so
> + * d->next and the following l->oldest will load afterwards,
> + * respectively.
> + */
> + next = smp_load_acquire(&d->next);
> +
> + if (next == EOL && READ_ONCE(l->oldest) == oldest_id) {
> + /*
> + * The oldest has no next, so this is a list of one
> + * descriptor. Lists must always have at least one
> + * descriptor.
> + */
> + return NULL;
> + }
> +
> + if (cmpxchg(&l->oldest, oldest_id, next) == oldest_id) {
> + /* removed successfully */
> + break;
> + }

This is supposed to be cmpxchg_relaxed(), not cmpxchg(). I did not
intend to include the general mb() memory barriers around the RMW
operation. (For some reason I thought _relaxed was the default.) Sorry.

John Ogness

2019-06-24 14:25:23

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Mon 2019-06-24 10:33:15, John Ogness wrote:
> > 1. Linked list of descriptors
> > -----------------------------
> >
> > The list of descriptors makes the code more complicated
> > and I do not see much gain. It is possible that I just missed
> > something.
> >
> > If I get it correctly then the list could only grow by adding
> > never used members. The already added members are never removed
> > nor shuffled.

Is the above paragraph correct, please?

> > If the above is true then we could achieve similar result
> > when using the array as a circular buffer. It would be
> > the same like when all members are linked from the beginning.
>
> So you are suggesting using a multi-reader multi-writer lockless
> ringbuffer to implement a multi-reader multi-writer lockless
> ringbuffer. ;-)
>
> The descriptor ringbuffer has fixed-size items, which simplifies the
> task. But I expect you will run into a chicken-egg scenario.

AFAIK, the main obstacle with the fully lockless solution was
that the entries did not have a fixed size.

If I understand it correctly, the list works exactly as a
ring buffer once all available descriptors are used.


> > It would allow to remove:
> >
> > + desc->id because it will be the same as desc->seq
> > + desc->next because the address of the next member
> > can be easily counted
>
> Yes, you will remove these and then replace them with new variables to
> track array-element state.

Yes, it should be easier to understand that, for example, a descriptor
is free from a flag named "free" than from some magic check of links.

It is not a must-have. But the code is complicated. Anything
that might make it easier to understand is much appreciated.

> > I think that it might be much more safe when we mask the two
> > highest bits of seq number and use them for the flags.
> > Then we could track the state of the given sequence number
> > in a very safe and straightforward way.
>
> When I first started to design/code this, I implemented something quite
> similar: using a single variable to represent state and id. This works
> nicely for cmpxchg operations (lockless synchronization) and reader
> validation. The problem I ran into was a chicken-egg problem (which I
> suspect you will also run into).

Do you remember more details about the chicken-egg problem, please?
I believe that there might be one. Any hint could save
me quite some time.

I have hit two big dead ends so far:

1. I was not able to free data when there was no free descriptor,
or to free a descriptor when the data had already been freed.
I was not able to make both operations race-free.

I got inspiration from remove_oldest_descr() and solved this
by failing to get descriptor when there was no free one.

But it is possible that I just did not try hard enough.
I see that your code is actually able to free the data
and descriptor from assign_descriptor().


2. I was not able to free the oldest data. I did not know
how to make sure that the seq read from the data buffer
was valid.

My plan was to solve this by changing seq and state flags
in the descriptor atomically. Then I would just check
whether the seq was in valid bounds (I would ignore
overflow) and that the flag "committed" was set. Then
I would just set the flag "freed". The descriptor
itself would be freed from prb_get_desc().

But I might actually use similar approach like you
are using in expire_oldest_data(). We could assume
that as long as the desc->seq is within valid
bounds (rb->first_seq <= seq <= rb->last_seq)
then it is the right descriptor.
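
Something like the following is what I have in mind (only a sketch
with my working field names, nothing tested):

    /* seq values outside the live window belong to outdated garbage */
    static bool seq_in_bounds(struct printk_ringbuffer *rb, u64 seq)
    {
            return (READ_ONCE(rb->first_seq) <= seq &&
                    seq <= READ_ONCE(rb->last_seq));
    }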


> I solved this problem by changing the design to use a linked list for
> the descriptors. At first I had kept state information for each
> descriptor. But later I realized that state information was not
> necessary because the linked list itself was providing implicit state
> information.

And this is my problem. I do not see how the list itself provides
the state information. Especially I do not see how it distinguishes
reserved and committed state, for example, from expire_oldest_data()
point of view.


> > Finally, here are some comments about the original patch:
> >
> > On Fri 2019-06-07 18:29:48, John Ogness wrote:
> >> See documentation for details.
> >
> > Please, mention here some basics. It might be enough to copy the
> > following sections from the documentation:
> >
> > Overview
> > Features
> > Behavior
>
> Ugh. Do we really want all that in a commit message?

2-3 pages of text for such a complicated commit is perfectly fine.
You cannot easily build the html/pdf variant when reading "git log -p".

> > Note that the documentation is written via .rst file. You need to
> > build html or pdf to get all the pieces together.
>
> Yes, but isn't that how all the kernel docs are supposed to be for the
> future?

I cannot speak for others. I have personally built the html version
for the first time just a few weeks ago. And it was only because
I reviewed the conversion of livepatch-related documentation into rst.

I normally search for information using "cscope in emacs", "git
blame", "git log -p", "git grep", and "google in web browser".
I much prefer to find the information in the code sources or
in the related commit message.


> >> +/**
> >> + * struct prb_list - An abstract linked list of items.
> >> + * @oldest: The oldest item on the list.
> >> + * @newest: The newest item on the list.
> >
> > I admit that I got confused by this. I wonder if there is another
> > location in kernel where lists are handled this way.
> >
> > I have always seen in kernel only lists handled via the struct
> > list_head trick. Where the same structure is bundled in all
> > linked members.
> >
> > I can't find a good name. I would personally remove the structure
> > and add the members into the related structures directly.
>
> The only reason I made it a struct is so that I could just write
> l->oldest instead of rb->descr_list_oldest. But it is an otherwise
> useless struct that I can remove.

rb->last_lpos and rb->last_seq are short enough. And it is clear
what exactly is being compared.

> > Also I would personally use "first" and "last" because they are
> > shorter and easier to visually distinguish. I know that "oldest"
> > and "newest" are more clear but...
>
> I don't like "oldest" and "newest" either, but it is immediately
> clear. How about:
>
> rb->data_lpos_oldest (formerly rb->data_list.oldest)
> rb->data_lpos_newest (formerly rb->data_list.newest)
> rb->desc_id_oldest (formerly rb->descr_list.oldest)
> rb->desc_id_newest (formerly rb->descr_list.newest)
>
> If using the strings "oldest" and "newest" is too ugly for people, I
> have no problems using first/last or head/tail, even if IMHO they add
> unnecessary confusion.

I do not have a strong opinion. I am slightly biased because I am used
to "first"/"next" from the current code.

In each case when I compare:

rb->data_lpos_oldest (formerly rb->data_list.oldest)
rb->data_lpos_newest (formerly rb->data_list.newest)
rb->desc_id_oldest (formerly rb->descr_list.oldest)
rb->desc_id_newest (formerly rb->descr_list.newest)

rb->data_lpos_first (formerly rb->data_list.first)
rb->data_lpos_last (formerly rb->data_list.last)
rb->desc_id_first (formerly rb->descr_list.first)
rb->desc_id_last (formerly rb->descr_list.last)

then the 2nd variant helps me to spot the difference
and find the valuable information.

> >> + * @data_next: The logical position of the data next to this entry.
> >> + * This is used to determine the length of the data as well as
> >> + * identify where the next data begins.
> >
> > next_lpos
>
> How about lpos_next?

next_lpos looks grammatically more correct. Well, I do not mind as long
as the style is consistent all over the code.

> >> +/**
> >> + * expire_oldest_data() - Invalidate the oldest data block.
> >> + * @rb: The ringbuffer containing the data block.
> >> + * @oldest_lpos: The logical position of the oldest data block.
> >> + *
> >> + * This function expects to "push" the pointer to the oldest data block
> >> + * forward, thus invalidating the oldest data block. However, before pushing,
> >> + * it is verified if the data block is valid. (For example, if the data block
> >> + * was reserved but not yet committed, it is not permitted to invalidate the
> >> + * "in use by a writer" data.)
> >> + *
> >> + * If the data is valid, it will be associated with a descriptor, which will
> >> + * then provide the necessary information to validate the data.
> >> + *
> >> + * Return: true if the oldest data was invalidated (regardless if this
> >> + * task was the one that did it or not), otherwise false.
> >> + */
> >> +static bool expire_oldest_data(struct printk_ringbuffer *rb,
> >> + unsigned long oldest_lpos)
> >> +{
> > How is it ensured that the descriptor is not an outdated one?
> >
> > IMHO, there might be garbage in the data array. It might by chance
> > point to an outdated descriptor that by chance pointed to this
> > data range in the past. I agree that it is very unlikely. But
> > we cannot afford such a risk.
>
> An outdated descriptor that has a data value (lpos) matching the oldest
> (lpos) would mean that the lpos has completely wrapped (4GB of data on a
> 32-bit system) without the descriptor having been recycled.

Ah, I missed that it takes a long time until the positions are reused
(overflow). It would probably have helped me if all the compared variables
were called lpos instead of data ;-)

> It should be
> possible to force such a situation on a 32-bit system, so this issue
> does need to be addressed. Thanks.

We might even be able to ignore this because both descriptors and data
arrays are reused. It should be impossible to wrap around lpos without
wrapping seq and vice versa.

Best Regards,
Petr

2019-06-25 07:44:43

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On (06/19/19 00:12), John Ogness wrote:
> On 2019-06-18, Sergey Senozhatsky <[email protected]> wrote:
> >> + struct prb_reserved_entry e;
> >> + char *s;
> >> +
> >> + s = prb_reserve(&e, &rb, 32);
> >> + if (s) {
> >> + sprintf(s, "Hello, world!");
> >> + prb_commit(&e);
> >> + }
> >
> > A nit: snprintf().
> >
> > sprintf() is tricky, it may write "slightly more than was
> > anticipated" bytes - all those string_nocheck(" disabled"),
> > error_string("pK-error"), etc.
>
> Agreed. Documentation should show good examples.

In vprintk_emit(), are we going to always reserve 1024-byte
records, since we don't know the size in advance, e.g.

printk("%pS %s\n", regs->ip, current->name)
prb_reserve(&e, &rb, ????);

or are we going to run vscnprintf() on a NULL buffer first,
then reserve the exactly required number of bytes and afterwards
vscnprintf(s) -> prb_commit(&e)?

-ss

2019-06-25 07:50:32

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On (06/25/19 15:45), Sergey Senozhatsky wrote:
> On (06/19/19 00:12), John Ogness wrote:
> > On 2019-06-18, Sergey Senozhatsky <[email protected]> wrote:
> > >> + struct prb_reserved_entry e;
> > >> + char *s;
> > >> +
> > >> + s = prb_reserve(&e, &rb, 32);
> > >> + if (s) {
> > >> + sprintf(s, "Hello, world!");
> > >> + prb_commit(&e);
> > >> + }
> > >
> > > A nit: snprintf().
> > >
> > > sprintf() is tricky, it may write "slightly more than was
> > > anticipated" bytes - all those string_nocheck(" disabled"),
> > > error_string("pK-error"), etc.
> >
> > Agreed. Documentation should show good examples.
>
> In vprintk_emit(), are we going to always reserve 1024-byte
> records, since we don't know the size in advance, e.g.
>
> printk("%pS %s\n", regs->ip, current->name)
> prb_reserve(&e, &rb, ????);
>
> or are we going to run vscnprintf() on a NULL buffer first,
> then reserve the exactly required number of bytes and afterwards
> vscnprintf(s) -> prb_commit(&e)?

I'm asking this because, well, if the most common usage
pattern (printk->prb_reserve) will always reserve fixed
size records (aka data blocks), then you _probably_ (??)
can drop the 'variable size records' requirement from prb
design and start looking at records (aka data blocks) as
fixed sized chunks of bytes, which are always located at
fixed offsets.

-ss

2019-06-25 08:45:49

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-25, Sergey Senozhatsky <[email protected]> wrote:
>>>> + struct prb_reserved_entry e;
>>>> + char *s;
>>>> +
>>>> + s = prb_reserve(&e, &rb, 32);
>>>> + if (s) {
>>>> + sprintf(s, "Hello, world!");
>>>> + prb_commit(&e);
>>>> + }
>>>
>>> A nit: snprintf().
>>>
>>> sprintf() is tricky, it may write "slightly more than was
>>> anticipated" bytes - all those string_nocheck(" disabled"),
>>> error_string("pK-error"), etc.
>>
>> Agreed. Documentation should show good examples.
>
> In vprintk_emit(), are we going to always reserve 1024-byte
> records, since we don't know the size in advance, e.g.
>
> printk("%pS %s\n", regs->ip, current->name)
> prb_reserve(&e, &rb, ????);
>
> or are we going to run vscnprintf() on a NULL buffer first,
> then reserve the exactly required number of bytes and afterwards
> vscnprintf(s) -> prb_commit(&e)?

(As suggested by Petr) I want to use vscnprintf() on a NULL
buffer. However, a NULL buffer is not sufficient because things like the
loglevel are sometimes added via %s (for example, in /dev/kmsg). So
rather than a NULL buffer, I would use a small buffer on the stack
(large enough to store loglevel/cont information). This way we can use
vscnprintf() to get the exact size _and_ printk_get_level() will see
enough of the formatted string to parse what it needs.
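
Roughly (only a sketch; the buffer size and variable names are made
up, and it relies on kernel vsnprintf() returning the full length
even when the output is truncated):

    char pbuf[16];  /* large enough for a loglevel/cont prefix */
    va_list args2;
    char *s;
    int len;

    va_copy(args2, args);
    len = vsnprintf(pbuf, sizeof(pbuf), fmt, args);
    /* printk_get_level() can parse pbuf here */

    s = prb_reserve(&e, &rb, len + 1);
    if (s) {
            vscnprintf(s, len + 1, fmt, args2);
            prb_commit(&e);
    }
    va_end(args2);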

> I'm asking this because, well, if the most common usage
> pattern (printk->prb_reserve) will always reserve fixed
> size records (aka data blocks), then you _probably_ (??)
> can drop the 'variable size records' requirement from prb
> design and start looking at records (aka data blocks) as
> fixed sized chunks of bytes, which are always located at
> fixed offsets.

The average printk message size is well under 128 bytes. It would be
quite wasteful to always reserve 1K blocks.
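Back-of-the-envelope: with a 1 MiB data array, fixed 1 KiB blocks
give you at most 1024 records, while ~128-byte variable-sized records
give you roughly 8000.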

John Ogness

2019-06-25 08:56:22

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On (06/07/19 18:29), John Ogness wrote:
[..]
> +static void add_descr_list(struct prb_reserved_entry *e)
> +{
> + struct printk_ringbuffer *rb = e->rb;
> + struct prb_list *l = &rb->descr_list;
> + struct prb_descr *d = e->descr;
> + struct prb_descr *newest_d;
> + unsigned long newest_id;
> +
> + /* set as newest */
> + do {
> + /* MB5: synchronize add descr */
> + newest_id = smp_load_acquire(&l->newest);
> + newest_d = TO_DESCR(rb, newest_id);
> +
> + if (newest_id == EOL)
> + WRITE_ONCE(d->seq, 1);
> + else
> + WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
> + /*
> + * MB5: synchronize add descr
> + *
> + * In particular: next written before cmpxchg
> + */
> + } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
> +
> + if (unlikely(newest_id == EOL)) {
> + /* no previous newest means we *are* the list, set oldest */
> +
> + /*
> + * MB UNPAIRED
> + *
> + * In particular: Force cmpxchg _after_ cmpxchg on newest.
> + */
> + WARN_ON_ONCE(cmpxchg_release(&l->oldest, EOL, e->id) != EOL);
> + } else {
> + /* link to previous chain */
> +
> + /*
> + * MB6: synchronize link descr
> + *
> + * In particular: Force cmpxchg _after_ cmpxchg on newest.
> + */
> + WARN_ON_ONCE(cmpxchg_release(&newest_d->next,
> + EOL, e->id) != EOL);
> + }
> +}

[..]

> +char *prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
> + unsigned int size)
> +{
> + struct prb_datablock *b;
> + struct prb_descr *d;
> + char *buf;
> +
> + if (size == 0)
> + return NULL;
> +
> + size += sizeof(struct prb_datablock);
> + size = DATA_ALIGN_SIZE(size);
> + if (size > DATAARRAY_SIZE(rb))
> + return NULL;
> +
> + e->rb = rb;
> +
> + local_irq_save(e->irqflags);
> +
> + if (!assign_descr(e))
> + goto err_out;
> +
> + d = e->descr;
> + WRITE_ONCE(d->id, e->id);
> +
> + if (!data_reserve(e, size)) {
> + /* put invalid descriptor on list, can still be traversed */
> + WRITE_ONCE(d->next, EOL);
> + add_descr_list(e);
> + goto err_out;
> + }

I'm wondering if prb can always report its problems, including the
cases when things "go rather bad".

Suppose we have

printk()
prb_reserve()
!data_reserve()
add_descr_list()
WARN_ON_ONCE()
printk()
prb_reserve()
!assign_descr(e) << lost WARN_ON's "printk" or "printks"?

In general, assume that there might be more error printk-s, either
called directly from prb->printk or indirectly, from
prb->ABC->printk.

Also note,
Lost printk-s are not going to be accounted as 'lost' automatically.
It seems that for printk() there is no way to find out that it has
recursed from printk->prb_commit but hasn't succeeded in storing
recursive messages. I'd say that prb_reserve() err_out should probably
&rb->lost++.

-ss

2019-06-25 09:07:07

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Tue 2019-06-25 10:44:19, John Ogness wrote:
> On 2019-06-25, Sergey Senozhatsky <[email protected]> wrote:
> > In vprintk_emit(), are we going to always reserve 1024-byte
> > records, since we don't know the size in advance, e.g.
> >
> > printk("%pS %s\n", regs->ip, current->name)
> > prb_reserve(&e, &rb, ????);
> >
> > or are we going to run vscnprintf() on a NULL buffer first,
> > then reserve the exactly required number of bytes and afterwards
> > vscnprintf(s) -> prb_commit(&e)?
>
> (As suggested by Petr) I want to use vscnprintf() on a NULL
> buffer. However, a NULL buffer is not sufficient because things like the
> loglevel are sometimes added via %s (for example, in /dev/kmsg). So
> rather than a NULL buffer, I would use a small buffer on the stack
> (large enough to store loglevel/cont information). This way we can use
> vscnprintf() to get the exact size _and_ printk_get_level() will see
> enough of the formatted string to parse what it needs.

vscnprintf() with NULL pointer is perfectly fine. Only the formatted
string has variable size.

Log level, timestamp, and other information can be stored as
metadata with a fixed size, see struct printk_log. They are
formatted as text later, see msg_print_text() and
msg_print_ext_header().

> > I'm asking this because, well, if the most common usage
> > pattern (printk->prb_reserve) will always reserve fixed
> > size records (aka data blocks), then you _probably_ (??)
> > can drop the 'variable size records' requirement from prb
> > design and start looking at records (aka data blocks) as
> > fixed sized chunks of bytes, which are always located at
> > fixed offsets.
>
> The average printk message size is well under 128 bytes. It would be
> quite wasteful to always reserve 1K blocks.

Yes, I think that we need to store the strings in variable
sized chunks.

Best Regards,
Petr

2019-06-25 09:11:09

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On (06/25/19 10:44), John Ogness wrote:
> > In vprintk_emit(), are we going to always reserve 1024-byte
> > records, since we don't know the size in advance, e.g.
> >
> > printk("%pS %s\n", regs->ip, current->name)
> > prb_reserve(&e, &rb, ????);
> >
> > or are we going to run vscnprintf() on a NULL buffer first,
> > then reserve the exactly required number of bytes and afterwards
> > vscnprintf(s) -> prb_commit(&e)?
>
> (As suggested by Petr) I want to use vscnprintf() on a NULL
> buffer. However, a NULL buffer is not sufficient because things like the
> loglevel are sometimes added via %s (for example, in /dev/kmsg). So
> rather than a NULL buffer, I would use a small buffer on the stack
> (large enough to store loglevel/cont information). This way we can use
> vscnprintf() to get the exact size _and_ printk_get_level() will see
> enough of the formatted string to parse what it needs.

OK. I guess this should work except for the cases when we want to
printk that we are running out of stack :)

More seriously, tho, sometimes messages come with dictionaries of
key/value pairs. I don't think we impose any strict limits on the
number of key/value pairs or on the overall size of the dictionary
each record can have (up to a single PAGE, I'd guess. I really need
to check printk code). Finding a sufficiently large buffer size
might be a bit of a task.

> > I'm asking this because, well, if the most common usage
> > pattern (printk->prb_reserve) will always reserve fixed
> > size records (aka data blocks), then you _probably_ (??)
> > can drop the 'variable size records' requirement from prb
> > design and start looking at records (aka data blocks) as
> > fixed sized chunks of bytes, which are always located at
> > fixed offsets.
>
> The average printk message size is well under 128 bytes.

Do you also count in dictionary of properties (key/value pairs) which
records can carry?

For printks from the core kernel, 128 bytes would be a good estimate;
for dev_printk() and so on, I'm not exactly sure.

cat /dev/kmsg

This one, for instance, is a single logbuf record

6,560,2470340,-;hid-generic 0003:093A:2510.0001: input,hidraw0: USB HID v1.11 Mouse [PixArt USB Optical Mouse] on usb-0000:00:14.0-3/input0
SUBSYSTEM=hid
DEVICE=+hid:0003:093A:2510.0001

I suspect that it's larger than 128 bytes.

> It would be quite wasteful to always reserve 1K blocks.

Agreed.

-ss

2019-06-25 11:46:00

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-25, Sergey Senozhatsky <[email protected]> wrote:
> [..]
>> +static void add_descr_list(struct prb_reserved_entry *e)
>> +{
>> + struct printk_ringbuffer *rb = e->rb;
>> + struct prb_list *l = &rb->descr_list;
>> + struct prb_descr *d = e->descr;
>> + struct prb_descr *newest_d;
>> + unsigned long newest_id;
>> +
>> + /* set as newest */
>> + do {
>> + /* MB5: synchronize add descr */
>> + newest_id = smp_load_acquire(&l->newest);
>> + newest_d = TO_DESCR(rb, newest_id);
>> +
>> + if (newest_id == EOL)
>> + WRITE_ONCE(d->seq, 1);
>> + else
>> + WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
>> + /*
>> + * MB5: synchronize add descr
>> + *
>> + * In particular: next written before cmpxchg
>> + */
>> + } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
>> +
>> + if (unlikely(newest_id == EOL)) {
>> + /* no previous newest means we *are* the list, set oldest */
>> +
>> + /*
>> + * MB UNPAIRED
>> + *
>> + * In particular: Force cmpxchg _after_ cmpxchg on newest.
>> + */
>> + WARN_ON_ONCE(cmpxchg_release(&l->oldest, EOL, e->id) != EOL);

This WARN_ON_ONCE...

>> + } else {
>> + /* link to previous chain */
>> +
>> + /*
>> + * MB6: synchronize link descr
>> + *
>> + * In particular: Force cmpxchg _after_ cmpxchg on newest.
>> + */
>> + WARN_ON_ONCE(cmpxchg_release(&newest_d->next,
>> + EOL, e->id) != EOL);

... and this WARN_ON_ONCE should both really be BUG_ON. These situations
will not happen. Actually, they should both be xchg_release(). But until
everyone is happy with the memory barriers, I wanted to leave this bug
checking in place.

>> + }
>> +}
>
> [..]
>
>> +char *prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
>> + unsigned int size)
>> +{
>> + struct prb_datablock *b;
>> + struct prb_descr *d;
>> + char *buf;
>> +
>> + if (size == 0)
>> + return NULL;
>> +
>> + size += sizeof(struct prb_datablock);
>> + size = DATA_ALIGN_SIZE(size);
>> + if (size > DATAARRAY_SIZE(rb))
>> + return NULL;
>> +
>> + e->rb = rb;
>> +
>> + local_irq_save(e->irqflags);
>> +
>> + if (!assign_descr(e))
>> + goto err_out;
>> +
>> + d = e->descr;
>> + WRITE_ONCE(d->id, e->id);
>> +
>> + if (!data_reserve(e, size)) {
>> + /* put invalid descriptor on list, can still be traversed */
>> + WRITE_ONCE(d->next, EOL);
>> + add_descr_list(e);
>> + goto err_out;
>> + }
>
> I'm wondering if prb can always report its problems, including the
> cases when things "go rather bad".
>
> Suppose we have
>
> printk()
> prb_reserve()
> !data_reserve()
> add_descr_list()
> WARN_ON_ONCE()
> printk()
> prb_reserve()
> !assign_descr(e) << lost WARN_ON's "printk" or "printks"?
>
> In general, assuming that there might be more error printk-s either
> called directly directly from prb->printk on indirectly, from
> prb->ABC->printk.
>
> Also note,
> Lost printk-s are not going to be accounted as 'lost' automatically.
> It seems that for printk() there is no way to find out that it has
> recursed from printk->prb_commit but hasn't succeeded in storing
> recursive messages. I'd say that prb_reserve() err_out should probably
> &rb->lost++.

This is a good point. I have no problems with that. In that case, it
should probably be called "fail" instead of "lost".
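
So the error path would become something like (sketch; rb->fail would
be a new atomic counter):

    err_out:
            /* account the failed reservation */
            atomic_long_inc(&rb->fail);
            local_irq_restore(e->irqflags);
            return NULL;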

John Ogness

2019-06-25 12:08:24

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On (06/25/19 11:06), Petr Mladek wrote:
> On Tue 2019-06-25 10:44:19, John Ogness wrote:
> > On 2019-06-25, Sergey Senozhatsky <[email protected]> wrote:
> > > In vprintk_emit(), are we going to always reserve 1024-byte
> > > records, since we don't know the size in advance, e.g.
> > >
> > > printk("%pS %s\n", regs->ip, current->name)
> > > prb_reserve(&e, &rb, ????);
> > >
> > > or are we going to run vscnprintf() on a NULL buffer first,
> > > then reserve the exactly required number of bytes and afterwards
> > > vscnprintf(s) -> prb_commit(&e)?
> >
> > (As suggested by Petr) I want to use vscnprintf() on a NULL
> > buffer. However, a NULL buffer is not sufficient because things like the
> > loglevel are sometimes added via %s (for example, in /dev/kmsg). So
> > rather than a NULL buffer, I would use a small buffer on the stack
> > (large enough to store loglevel/cont information). This way we can use
> > vscnprintf() to get the exact size _and_ printk_get_level() will see
> > enough of the formatted string to parse what it needs.
>
> vscnprintf() with NULL pointer is perfectly fine. Only the formatted
> string has variable size.

Yeah, that should work. Probably. Can't think of any issues, except
for increased CPU usage. Some sprintf() format specifiers are heavier
than the rest (pS/pF on ia64/ppc/hppa).

OK, very theoretically.

There is a difference.

Doing "sz = vscprintf(NULL, msg); vscnprintf(buf, sz, msg)" for
msg_print_text() and msg_print_ext_header() was safe, because the
data - msg - would not change under us, we would work with logbuf
records, IOW with data which is owned by printk() and printk only.

But doing
sz = vscprintf(NULL, "xxx", random_pointer);
if ((buf = prb_reserve(... sz))) {
vscnprintf(buf, sz, "xxx", random_pointer);
prb_commit(...);
}

might have a different outcome sometimes. We probably (!!!) can have
some race conditions. The problem is that, unlike msg_print_text()
and msg_print_ext_header(), printk() works with pointers which it
neither owns nor controls. IOW within a single printk() we will access
some random kernel pointers, then do prb stuff, then access those
same pointers, expecting that none of them will ever change their
state. A very simple example

printk("Comm %s\n", current->comm)

Suppose printk on CPU0 and ia64_mca_modify_comm on CPU1

CPU0 CPU1
printk(...)
sz = vscprintf(NULL, "Comm %s\n", current->comm);
ia64_mca_modify_comm()
snprintf(comm, sizeof(comm), "%s %d", current->comm, previous_current->pid);
memcpy(current->comm, comm, sizeof(current->comm));
if ((buf = prb_reserve(... sz))) {
vscnprintf(buf, "Comm %s\n", current->comm);
^^^^^^^^^^^^^^ ->comm has changed.
Nothing critical, we
should not corrupt
anything, but we will
truncate ->comm if its
new size is larger than
what it used to be when
we did vscprintf(NULL).
prb_commit(...);
}

Probably there can be other examples.

-ss

2019-06-25 13:19:30

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-25, Sergey Senozhatsky <[email protected]> wrote:
>>>> In vprintk_emit(), are we going to always reserve 1024-byte
>>>> records, since we don't know the size in advance, e.g.
>>>>
>>>> printk("%pS %s\n", regs->ip, current->name)
>>>> prb_reserve(&e, &rb, ????);
>>>>
>>>> or are we going to run vscnprintf() on a NULL buffer first,
>>>> then reserve the exactly required number of bytes and afterwards
>>>> vscnprintf(s) -> prb_commit(&e)?
>>>
>>> (As suggested by Petr) I want to use vscnprintf() on a NULL
>>> buffer. However, a NULL buffer is not sufficient because things like the
>>> loglevel are sometimes added via %s (for example, in /dev/kmsg). So
>>> rather than a NULL buffer, I would use a small buffer on the stack
>>> (large enough to store loglevel/cont information). This way we can use
>>> vscnprintf() to get the exact size _and_ printk_get_level() will see
>>> enough of the formatted string to parse what it needs.
>>
>> vscnprintf() with NULL pointer is perfectly fine. Only the formatted
>> string has variable size.
>
> Yeah, that should work. Probably. Can't think of any issues, except
> for increased CPU usage. Some sprintf() format specifiers are heavier
> than the rest (pS/pF on ia64/ppc/hppa).
>
> OK, very theoretically.
>
> There is a difference.
>
> Doing "sz = vscprintf(NULL, msg); vscnprintf(buf, sz, msg)" for
> msg_print_text() and msg_print_ext_header() was safe, because the
> data - msg - would not change under us, we would work with logbuf
> records, IOW with data which is owned by printk() and printk only.
>
> But doing
> sz = vscprintf(NULL, "xxx", random_pointer);
> if ((buf = prb_reserve(... sz))) {
> vscnprintf(buf, sz, "xxx", random_pointer);
> prb_commit(...);
> }
>
> might have a different outcome sometimes. We probably (!!!) can have
> some race conditions. The problem is that, unlike msg_print_text()
> and msg_print_ext_header(), printk() works with pointers which it
> neither owns nor controls. IOW within a single printk() we will access
> some random kernel pointers, then do prb stuff, then access those
> same pointers, expecting that none of them will ever change their
> state. A very simple example
>
> printk("Comm %s\n", current->comm)
>
> Suppose printk on CPU0 and ia64_mca_modify_comm on CPU1
>
> CPU0 CPU1
> printk(...)
> sz = vscprintf(NULL, "Comm %s\n", current->comm);
> ia64_mca_modify_comm()
> snprintf(comm, sizeof(comm), "%s %d", current->comm, previous_current->pid);
> memcpy(current->comm, comm, sizeof(current->comm));
> if ((buf = prb_reserve(... sz))) {
> vscnprintf(buf, "Comm %s\n", current->comm);
> ^^^^^^^^^^^^^^ ->comm has changed.
> Nothing critical, we
> should not corrupt
> anything, but we will
> truncate ->comm if its
> new size is larger than
> what it used to be when
> we did vscprintf(NULL).
> prb_commit(...);
> }
>
> Probably there can be other examples.

This is a very good point, and quite important. It is not acceptable if
some crash output is cut off because of this effect.

In my v1 rfc series, I avoided this issue by having a separate dedicated
ringbuffer (rb_sprintf) that was used to allocate a temporary max-size
(2KB) buffer for sprinting to. Then _that_ was used for the real
ringbuffer input (strlen, prb_reserve, memcpy, prb_commit). That would
still be the approach of my choice.
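
Roughly, the v1 approach was (sketch; PRINTK_SPRINT_MAX and the
variable names are approximate):

    char *tmp;
    char *s;
    int len;

    tmp = prb_reserve(&sprint_e, &sprint_rb, PRINTK_SPRINT_MAX);
    if (tmp) {
            len = vscnprintf(tmp, PRINTK_SPRINT_MAX, fmt, args);

            s = prb_reserve(&e, &rb, len + 1);
            if (s) {
                    memcpy(s, tmp, len + 1);
                    prb_commit(&e);
            }

            /* only releases the scratch space */
            prb_commit(&sprint_e);
    }

The format arguments are only ever read once (by the vscnprintf), so
the race above cannot truncate anything.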

John Ogness

2019-06-25 13:44:08

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-24, Petr Mladek <[email protected]> wrote:
>>> 1. Linked list of descriptors
>>> -----------------------------
>>>
>>> The list of descriptors makes the code more complicated
>>> and I do not see much gain. It is possible that I just missed
>>> something.
>>>
>>> If I get it correctly then the list could only grow by adding
> >> never used members. The already added members are never removed
> >> nor shuffled.
>
> Is the above paragraph correct, please?

Sorry for not addressing this remark. I am trying to be careful and not
steer you to a certain implementation. I failed to achieve what you are
attempting. You are already doing things slightly differently, so my
experiences regarding this may falsely discourage you from figuring out
how to do it.

To address your question: For the linked list implementation, if you are
looking at it from the linked list perspective, the number of
descriptors on the list is constantly fluctuating (increasing and
decreasing) and the ordering of the descriptors is constantly
changing. They are ordered according to the writer commit order (not the
writer reserve order) and the only descriptors on the list are the ones
that are not within a reserve/commit window.

>>> If the above is true then we could achieve similar result
>>> when using the array as a circular buffer. It would be
>>> the same like when all members are linked from the beginning.
>>
>> So you are suggesting using a multi-reader multi-writer lockless
>> ringbuffer to implement a multi-reader multi-writer lockless
>> ringbuffer. ;-)
>>
>> The descriptor ringbuffer has fixed-size items, which simplifies the
>> task. But I expect you will run into a chicken-egg scenario.
>
> AFAIK, the main obstacle with the fully lockless solution was
> that the entries did not have a fixed size.

No. The variable size of the records was the reason I used
descriptors. That has nothing to do with how I chose to connect those
descriptors.

> If I understand it correctly, the list works exactly as a
> ring buffer once all available descriptors are used.

There are a lot of details hiding in the words "exactly as a ring
buffer". We are talking about a (truly) lockless ringbuffer that
supports concurrent writers. (IMO my v1 rfc ringbuffer was not truly
lockless because of the prb_lock.) AFAIK there is no such ringbuffer
implementation in existence, otherwise I would have used that instead of
writing my own.

>>> It would allow to remove:
>>>
>>> + desc->id because it will be the same as desc->seq
>>> + desc->next because the address of the next member
>>> can be easily counted
>>
>> Yes, you will remove these and then replace them with new variables
>> to track array-element state.
>
> Yes, it should easier to understand that, for example, a descriptor
> is free by a flag named "free" than by some magic check of links.
>
> It is not must to have. But the code is complicated. Anything
> that might make it easier to understand is much appreciated.

I think it is not as simple as you think it is and I expect you will end
up with a solution that is more complex (although I could be
wrong). IMHO the linked list solution is quite elegant. Perhaps the real
problem is my coding style, poor naming, and horrible comments.

>>> I think that it might be much more safe when we mask the two
>>> highest bits of seq number and use them for the flags.
>>> Then we could track the state of the given sequence number
>>> in a very safe and straightforward way.
>>
>> When I first started to design/code this, I implemented something
>> quite similar: using a single variable to represent state and
>> id. This works nicely for cmpxchg operations (lockless
>> synchronization) and reader validation. The problem I ran into was a
>> chicken-egg problem (which I suspect you will also run into).
>
> Do you remember more details about the chicken-egg problem, please?

You really already answered your own question when you stated:

"we could achieve similar result when using the array
as a circular buffer"

That circular buffer must have the same features as the circular buffer
we are trying to implement. My choice to use a linked list was not due
to variable-sized records, but rather it was the only way I could
figure out how to implement a lockless multi-writer multi-reader
ringbuffer.

Again, that doesn't mean it isn't possible. But _I_ could not get it to
work. I had all kinds of state tracking and crazy cmpxchg loops and
wacky races. Switching to linked lists made everything so much simpler
and straight forward. (Many thanks my colleague Benedikt Spranger for
coming up with the idea!)

> I believe that there might be one. Any hint just could save
> me quite some time.
>
> I have hit two big dead ends so far:
>
> 1. I was not able to free data when there was no free descriptor
> and free descriptor when the data have already been freed.
> I was not able to make both operations race-free.

This is why I reserve the descriptor before reserving the data, so that
this situation can never occur.

Note that invalid descriptors in the linked list do not act as
terminators for readers. The readers simply traverse over the invalid
descriptors until they hit an EOL.
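
i.e. a reader does something like this (simplified):

    unsigned long id = READ_ONCE(l->oldest);

    while (id != EOL) {
            struct prb_descr *d = TO_DESCR(rb, id);

            if (data_valid(rb, READ_ONCE(rb->data_list.oldest),
                           READ_ONCE(rb->data_list.newest),
                           READ_ONCE(d->data),
                           READ_ONCE(d->data_next))) {
                    /* consume this entry */
            }

            /* invalid descriptors are stepped over, not treated as EOL */
            id = READ_ONCE(d->next);
    }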

> I got inspiration from remove_oldest_descr() and solved this
> by failing to get descriptor when there was no free one.
>
> But it is possible that I just did not try hard enough.
> I see that your code is actually able to free the data
> and descriptor from assign_descriptor().
>
>
> 2. I was not able to free the oldest data. I did not know
> how to make sure that the seq read from the data buffer
> was valid.
>
> My plan was to solve this by changing seq and state flags
> in the descriptor atomically. Then I would just check
> whether the seq was in valid bounds (I would ignore
> overflow) and that the flag "committed" was set. Then
> I would just set the flag "freed". The descriptor
> itself would be freed from prb_get_desc().
>
> But I might actually use similar approach like you
> are using in expire_oldest_data(). We could assume
> that as long as the desc->seq is within valid
> bounds (rb->first_seq <= seq <= rb->last_seq)
> then it is the right descriptor.

desc->seq is not some random data. If it is >= rb->first_seq (and points
back to the same data block!), it is what you are looking for.
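
i.e. the check is something like this sketch, mixing my macros with
your first_seq naming, where lpos is the logical position of the data
block:

    d = TO_DESCR(rb, READ_ONCE(b->id));

    if (READ_ONCE(d->data) == lpos &&
        READ_ONCE(d->seq) >= READ_ONCE(rb->first_seq)) {
            /* @d really describes the data block at @lpos */
    }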

>> I solved this problem by changing the design to use a linked list for
>> the descriptors. At first I had kept state information for each
>> descriptor. But later I realized that state information was not
>> necessary because the linked list itself was providing implicit state
>> information.
>
> And this is my problem. I do not see how the list itself provides
> the state information. Especially I do not see how it distinguishes
> reserved and committed state, for example, from expire_oldest_data()
> point of view.

_All_ descriptors on the linked list are not within the reserve/commit
window and no writers have pointers to them. (The only exception is the
newest descriptor, whose next will be modified by a writer that is
adding a new descriptor. That is the reason that if there is only 1
descriptor on the list, it must not be removed. A writer might be
modifying its next.)

When a writer wants to reserve data (i.e. enter the reserve/commit
window), the first thing it must do is pull a descriptor off the
list. With the descriptor off the list, the writer is free to do
whatever it needs to do to prepare the data and the descriptor. And when
it is done committing (i.e. exiting the reserve/commit window), the last
thing it does is add the ready descriptor to the list.
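
In pseudo-code, the writer lifecycle is simply (a simplified sketch of
what prb_reserve()/prb_commit() already do):

    /* enter the window: the descriptor comes off the list, now private */
    d = remove_oldest_descr(rb);

    /* ... prepare the data block and descriptor at leisure ... */

    /* exit the window: the descriptor becomes visible and ordered */
    add_descr_list(e);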

>>> Finally, here are some comments about the original patch:
>>>
>>> On Fri 2019-06-07 18:29:48, John Ogness wrote:
>>>> See documentation for details.
>>>
>>> Please, mention here some basics. It might be enough to copy the
>>> following sections from the documentation:
>>>
>>> Overview
>>> Features
>>> Behavior
>>
>> Ugh. Do we really want all that in a commit message?
>
> 2-3 pages of text for such a complicated commit is perfectly fine.
> You cannot easily build the html/pdf variant when reading "git log -p".

OK.

>>> Note that the documentation is written via .rst file. You need to
>>> build html or pdf to get all the pieces together.
>>
>> Yes, but isn't that how all the kernel docs are supposed to be for the
>> future?
>
> I could not talk for others. I have personally built the html version
> for the first time just few weeks ago. And it was only because
> I reviewed conversion of livepatch related documentation into rst.

I also built it for the first time so that I could document this series.

> I normally search for information using "cscope in emacs", "git
> blame", "git log -p", "git grep", and "google in web browser".
> I much prefer to find the information in the code sources or
> in the related commit message.

There is an obvious push to get the kernel docs unified under RST, even
if it is not how I usually do things either. However, now that I've done
the work, looking back it seems to be a good idea in order to automate
documentation.

>> If using the strings "oldest" and "newest" is too ugly for people, I
>> have no problems using first/last or head/tail, even if IMHO they add
>> unnecessary confusion.
>
> I do not have a strong opinion. I am slightly biased because I am used
> to "first"/"next" from the current code.

But are you used to starting with last and traversing next to first? The
descriptors and data blocks are linked from oldest to newest. IMHO that
is why it tends to be confusing.

In circular buffer speak, writers write to the head and readers read
from the tail. As a writer this feels natural, but for a reader that
reads the tail and follows next pointers to the head, it feels
backwards.

I explicitly wanted to get away from any preconceptions. By specifying
we have data linked from oldest to newest, I find it feels more natural,
regardless if I am a writer writing new records to the newest or a
reader reading all records from oldest to newest.

John Ogness

2019-06-26 02:09:04

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On (06/25/19 14:03), John Ogness wrote:
[..]
> > CPU0 CPU1
> > printk(...)
> > sz = vscprintf(NULL, "Comm %s\n", current->comm);
> > ia64_mca_modify_comm()
> > snprintf(comm, sizeof(comm), "%s %d", current->comm, previous_current->pid);
> > memcpy(current->comm, comm, sizeof(current->comm));
> > if ((buf = prb_reserve(... sz))) {
> > vscnprintf(buf, "Comm %s\n", current->comm);
> > ^^^^^^^^^^^^^^ ->comm has changed.
> > Nothing critical, we
> > should not corrupt
> > anything, but we will
> > truncate ->comm if its
> > new size is larger than
> > what it used to be when
> > we did vscprintf(NULL).
> > prb_commit(...);
> > }

[..]
> In my v1 rfc series, I avoided this issue by having a separate dedicated
> ringbuffer (rb_sprintf) that was used to allocate a temporary max-size
> (2KB) buffer for sprinting to. Then _that_ was used for the real
> ringbuffer input (strlen, prb_reserve, memcpy, prb_commit). That would
> still be the approach of my choice.

In other words per-CPU buffering, AKA printk_safe ;)

-ss

2019-06-26 07:17:59

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-26, Sergey Senozhatsky <[email protected]> wrote:
> [..]
>> > CPU0 CPU1
>> > printk(...)
>> > sz = vscprintf(NULL, "Comm %s\n", current->comm);
>> > ia64_mca_modify_comm()
>> > snprintf(comm, sizeof(comm), "%s %d", current->comm, previous_current->pid);
>> > memcpy(current->comm, comm, sizeof(current->comm));
>> > if ((buf = prb_reserve(... sz))) {
>> > vscnprintf(buf, "Comm %s\n", current->comm);
>> > ^^^^^^^^^^^^^^ ->comm has changed.
>> > Nothing critical, we
>> > should not corrupt
>> > anything, but we will
>> > truncate ->comm if its
>> > new size is larger than
>> > what it used to be when
>> > we did vscprintf(NULL).
>> > prb_commit(...);
>> > }
>
> [..]
>> In my v1 rfc series, I avoided this issue by having a separate dedicated
>> ringbuffer (rb_sprintf) that was used to allocate a temporary max-size
>> (2KB) buffer for sprinting to. Then _that_ was used for the real
>> ringbuffer input (strlen, prb_reserve, memcpy, prb_commit). That would
>> still be the approach of my choice.
>
> In other words per-CPU buffering, AKA printk_safe ;)

Actually, no. I made use of a printk_ringbuffer (which is global). It
was used for temporary memory allocation for sprintf, but the result was
immediately written into the printk buffer from the same context. In
contrast, printk_safe triggers a different context to handle the
insertion.

It is still my intention to eliminate the buffering component of
printk_safe.

After we get a lockless ringbuffer that we are happy with, my next
series to integrate the buffer into printk will again use the sprint_rb
solution to avoid the issue discussed in this thread. Perhaps it would
be best to continue this discussion after I've posted that series.

John Ogness

2019-06-26 07:48:17

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On (06/26/19 09:16), John Ogness wrote:
> On 2019-06-26, Sergey Senozhatsky <[email protected]> wrote:
> > [..]
> >> In my v1 rfc series, I avoided this issue by having a separate dedicated
> >> ringbuffer (rb_sprintf) that was used to allocate a temporary max-size
> >> (2KB) buffer for sprinting to. Then _that_ was used for the real
> >> ringbuffer input (strlen, prb_reserve, memcpy, prb_commit). That would
> >> still be the approach of my choice.
> >
> > In other words per-CPU buffering, AKA printk_safe ;)
>
> Actually, no. I made use of a printk_ringbuffer (which is global). It
> was used for temporary memory allocation for sprintf, but the result was
> immediately written into the printk buffer from the same context. In
> contrast, printk_safe triggers a different context to handle the
> insertion.

I agree that's not relevant to your patch. But let me explain what I
meant. printk_safe has many faces. The NMI part of printk_safe has
the PRINTK_NMI_DIRECT_CONTEXT_MASK buffering bypass - when we know
that we are in NMI and printk logbuf is unlocked then we can do the
normal logbuf_store() from NMI, avoiding irq flush because the data
is already in the main log buffer. We also can do the same buffering
bypass for non-NMI part of printk_safe, but just sometimes.
PRINTK_SAFE_CONTEXT_MASK most of the time indicates that logbuf is
locked, but not always - e.g. we call console_drivers under
PRINTK_SAFE_CONTEXT_MASK.

But like I said, not relevant to your patch. The relevant part is the
possibility of race conditions.

> It is still my intention to eliminate the buffering component of
> printk_safe.

That's understandable.

> After we get a lockless ringbuffer that we are happy with, my next
> series to integrate the buffer into printk will again use the sprint_rb
> solution to avoid the issue discussed in this thread.

Yes, I agree that either sprint_rb or just 2 LOG_LINE_MAX per-CPU
buffers look safer. This basically means that printk cannot use
printk_ringbuffer as is and needs some sort of extra layer next to
(or on top of) printk_ringbuffer, but we have the same thing in
printk right now: static char textbuf[LOG_LINE_MAX] -> logbuf.
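
For reference, the existing layer is roughly this (simplified from
kernel/printk/printk.c as of this thread; not verbatim):

	static char textbuf[LOG_LINE_MAX];

	/* in vprintk_emit(), under logbuf_lock */
	text_len = vscnprintf(textbuf, sizeof(textbuf), fmt, args);
	log_store(facility, level, lflags, 0, NULL, 0, textbuf, text_len);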

-ss

2019-06-26 07:48:24

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Wed 2019-06-26 09:16:11, John Ogness wrote:
> On 2019-06-26, Sergey Senozhatsky <[email protected]> wrote:
> > [..]
> >> > CPU0 CPU1
> >> > printk(...)
> >> > sz = vscprintf(NULL, "Comm %s\n", current->comm);
> >> > ia64_mca_modify_comm()
> >> > snprintf(comm, sizeof(comm), "%s %d", current->comm, previous_current->pid);
> >> > memcpy(current->comm, comm, sizeof(current->comm));
> >> > if ((buf = prb_reserve(... sz))) {
> >> > vscnprintf(buf, "Comm %s\n", current->comm);
> >> > ^^^^^^^^^^^^^^ ->comm has changed.
> >> > Nothing critical, we
> >> > should not corrupt
> >> > anything, but we will
> >> > truncate ->comm if its
> >> > new size is larger than
> >> > what it used to be when
> >> > we did vscprintf(NULL).
> >> > prb_commit(...);
> >> > }

Great catch.

> After we get a lockless ringbuffer that we are happy with, my next
> series to integrate the buffer into printk will again use the sprint_rb
> solution to avoid the issue discussed in this thread. Perhaps it would
> be best to continue this discussion after I've posted that series.

We should keep it in mind. But I fully agree with postponing
the discussion.

I personally think that this is a corner case. I would start with
a simple vscprintf(NULL, ...) and vscnprintf(reserved_buf, ...)
approach. We could always make it more complex when it causes
real-life problems.
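
As a sketch, the simple approach amounts to the following (using
vsnprintf(NULL, 0, ...) as the sizing pass; the prb_* signatures are
assumptions):

	va_list args2;
	char *buf;
	int sz;

	va_copy(args2, args);
	sz = vsnprintf(NULL, 0, fmt, args);	/* sizing pass */

	buf = prb_reserve(&e, &rb, sz + 1);
	if (buf) {
		/*
		 * If the data grew between the two passes (the ->comm
		 * race above), vscnprintf() truncates rather than
		 * overflowing the reservation.
		 */
		vscnprintf(buf, sz + 1, fmt, args2);
		prb_commit(&e);
	}
	va_end(args2);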

If the data might change under the hood then we have bigger
problems. For example, there might be a race when the trailing
"\0" has not been written yet.

Best Regards,
Petr

2019-06-26 08:00:10

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On (06/26/19 09:47), Petr Mladek wrote:
[..]
> If the data might change under the hood then we have bigger
> problems. For example, there might be a race when the trailing
> "\0" has not been written yet.

Current printk would not handle such cases. I'm only talking about
the transition from one consistent state to another, when
	sprintf(NULL, ptr) == A && sprintf(buf, ptr) != A
Whether it's a corner case or not is entirely unclear to me, but I
probably would not call it "an impossible scenario".

-ss

2019-06-26 08:31:05

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Tue 2019-06-25 15:29:35, John Ogness wrote:
> To address your question: For the linked list implementation, if you are
> looking at it from the linked list perspective, the number of
> descriptors on the list is constantly fluctuating (increasing and
> decreasing) and the ordering of the descriptors is constantly
> changing. They are ordered according to the writer commit order (not the
> writer reserve order) and the only descriptors on the list are the ones
> that are not within a reserve/commit window.

This and a few other comments below are a really valuable explanation.
I misunderstood how the list worked. I have to revisit it and
rethink my view on the patchset.

> >>> If the above is true then we could achieve similar result
> >>> when using the array as a circular buffer. It would be
> >>> the same like when all members are linked from the beginning.
> >>
> >> So you are suggesting using a multi-reader multi-writer lockless
> >> ringbuffer to implement a multi-reader multi-writer lockless
> >> ringbuffer. ;-)
> >>
> >> The descriptor ringbuffer has fixed-size items, which simplifies the
> >> task. But I expect you will run into a chicken-egg scenario.
> >
> > AFAIK, the main obstacle with the fully lockless solution was
> > that the entries did not have a fixed size.
>
> No. The variable size of the records was the reason I used
> descriptors. That has nothing to do with how I chose to connect those
> descriptors.

I think that we are talking about the same thing. If I remember
correctly, the main problem is that cmpxchg() is not reliable when
the same address might be used by both metadata and data.

For example, the code never knows whether it compared a previous seq
number or whether another CPU/NMI wrote its data (string) there in
the meantime.

> I think it is not as simple as you think it is and I expect you will end
> up with a solution that is more complex (although I could be
> wrong). IMHO the linked list solution is quite elegant.

It is quite likely.

> There is an obvious push to get the kernel docs unified under RST, even
> if it is not how I usually do things either. However, now that I've done
> the work, looking back it seems to be a good idea in order to automate
> documentation.

I personally like the /** */ description of public API functions.
Also the html/pdf versions look nice even though I do not use them.

The thing is that both /** */ and .rst formats can be quite readable
even in the source code. I guess that most existing developers read
only the source code.

Well, this discussion probably belongs in another thread. My wish
was just to make the commit message more verbose.

> I explicitly wanted to get away from any preconceptions. By specifying
> we have data linked from oldest to newest, I find it feels more natural,
> regardless if I am a writer writing new records to the newest or a
> reader reading all records from oldest to newest.

As I said, I do not have a strong opinion. I could live with oldest/newest.

Best Regards,
Petr

2019-06-26 09:11:18

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-26, Petr Mladek <[email protected]> wrote:
>> To address your question: For the linked list implementation, if you
>> are looking at it from the linked list perspective, the number of
>> descriptors on the list is constantly fluctuating (increasing and
>> decreasing) and the ordering of the descriptors is constantly
>> changing. They are ordered according to the writer commit order (not
>> the writer reserve order) and the only descriptors on the list are
>> the ones that are not within a reserve/commit window.
>
> This and few other comments below are really valuable explanation.
> I misunderstood how the list worked.

I will add a documentation section about why a linked list was used.

>>>>> If the above is true then we could achieve similar result
>>>>> when using the array as a circular buffer. It would be
>>>>> the same like when all members are linked from the beginning.
>>>>
>>>> So you are suggesting using a multi-reader multi-writer lockless
>>>> ringbuffer to implement a multi-reader multi-writer lockless
>>>> ringbuffer. ;-)
>>>>
>>>> The descriptor ringbuffer has fixed-size items, which simplifies
>>>> the task. But I expect you will run into a chicken-egg scenario.
>>>
>>> AFAIK, the main obstacle with the fully lockless solution was
>>> that the entries did not have a fixed size.
>>
>> No. The variable size of the records was the reason I used
>> descriptors. That has nothing to do with how I chose to connect those
>> descriptors.
>
> I think that we are talking about the same. If I remember correctly,
> the main problem is that cmpxchg() is not reliable when the same
> address might be used by the metadata and data.

The cmpxchg() issue you mention is why I needed descriptors. But even if
I were to implement a fixed-record-size ringbuffer where the cmpxchg()
issue does not exist, I _still_ would have used a linked list to connect
the records.

It is misleading to think the linked list exists because of
variable-size records.

John Ogness

2019-06-26 21:17:57

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Mon, Jun 24, 2019 at 10:33:15AM +0200, John Ogness wrote:
> Here are the writer-relevant memory barriers and their associated
> variables:
>
> MB1: data_list.oldest
> MB2: data_list.newest
> MB3: data_block.id
> MB4: descr.data_next
> MB5: descr_list.newest
> MB6: descr.next

I think this is the fundamental divergence in parlance.

You seem to associate a barrier with a (single) variable, where normally
a barrier is between two (or more) variables.

As you wrote in that other email (I'm still going through all that),
your MB5 isn't desc_list.newest, but rather between desc_list.newest and
descr.next.

Remember, the topic is called 'memory ordering', and you cannot order
singles.

2019-06-26 21:46:17

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-26, Peter Zijlstra <[email protected]> wrote:
>> Here are the writer-relevant memory barriers and their associated
>> variables:
>>
>> MB1: data_list.oldest
>> MB2: data_list.newest
>> MB3: data_block.id
>> MB4: descr.data_next
>> MB5: descr_list.newest
>> MB6: descr.next
>
> I think this is the fundamental divergence in parlance.
>
> You seem to associate a barrier with a (single) variable, where
> normally a barrier is between two (or more) variables.

The litmus tests I posted to answer your previous questions should
(hopefully) show that I already understand this. The above list shows
the _key_ loads/stores that are used to guarantee ordering (for these
and other memory operations). And yes, I now understand that my comments
need to list all the operations that are being ordered based on these
key loads/stores.

> As you wrote in that other email (I'm still going through all that),
> your MB5 isn't desc_list.newest, but rather between desc_list.newest
> and descr.next.

Here is where I have massive problems communicating. I don't understand
why you say the barrier is _between_ newest and next. I would say the
barrier is _on_ newest to _synchronize_ with next (or something). I am
struggling with terminology. (To be honest, I'd much rather just post
litmus tests.)

For example, if we have:

WRITE_ONCE(a, 1);
WRITE_ONCE(b, 1);
WRITE_ONCE(c, 1);
smp_store_release(&d, 1);

and:

local_d = smp_load_acquire(&d);
local_a = READ_ONCE(a);
local_b = READ_ONCE(b);
local_c = READ_ONCE(c);

How do you describe that? Do you say the memory barrier is between a and
d? Or between a, b, c, d? (a, b, c aren't ordered, but they are one-way
synchronized with d).

I would say there is a barrier on d to synchronize a, b, c.
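
For what it's worth, that example as a litmus test, in the style of
the other tests in this thread (the listed outcome is forbidden):

	P0(int *a, int *b, int *c, int *d)
	{
		WRITE_ONCE(*a, 1);
		WRITE_ONCE(*b, 1);
		WRITE_ONCE(*c, 1);
		smp_store_release(d, 1);
	}

	P1(int *a, int *d)
	{
		int local_d;
		int local_a;

		local_d = smp_load_acquire(d);
		local_a = READ_ONCE(*a);
	}

	exists (1:local_d=1 /\ 1:local_a=0)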

John Ogness

2019-06-26 22:41:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Fri, Jun 21, 2019 at 12:23:19AM +0200, John Ogness wrote:
> Hi Peter,
>
> This is a long response, but we are getting into some fine details about
> the memory barriers (as well as battling my communication skill level).

So I'm going to reply piecewise to this... so not such long emails, but
more of them.

> On 2019-06-18, Peter Zijlstra <[email protected]> wrote:
> >> +static void add_descr_list(struct prb_reserved_entry *e)
> >> +{
> >> + struct printk_ringbuffer *rb = e->rb;
> >> + struct prb_list *l = &rb->descr_list;
> >> + struct prb_descr *d = e->descr;
> >> + struct prb_descr *newest_d;
> >> + unsigned long newest_id;
> >> +
> >> + /* set as newest */
> >> + do {
> >> + /* MB5: synchronize add descr */
> >> + newest_id = smp_load_acquire(&l->newest);
> >> + newest_d = TO_DESCR(rb, newest_id);
> >> +
> >> + if (newest_id == EOL)
> >> + WRITE_ONCE(d->seq, 1);
> >> + else
> >> + WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
> >> + /*
> >> + * MB5: synchronize add descr
> >> + *
> >> + * In particular: next written before cmpxchg
> >> + */
> >> + } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
> >
> > What does this pair with? I find ->newest usage in:
>
> It is pairing with the smp_load_acquire() at the beginning of this loop
> (also labeled MB5) that is running simultaneously on another CPU. I am
> avoiding a possible situation that a new descriptor is added but the
> store of "next" from the previous descriptor is not yet visible and thus
> the cmpxchg following will fail, which is not allowed. (Note that "next"
> is set to EOL shortly before this function is called.)
>
> The litmus test for this is:
>
> P0(int *newest, int *d_next)
> {
> 	// set descr->next to EOL (terminates list)
> 	WRITE_ONCE(*d_next, 1);
>
> 	// set descr as newest
> 	smp_store_release(newest, 1);
> }
>
> P1(int *newest, int *d_next)
> {
> 	int local_newest;
> 	int local_next;
>
> 	// get newest descriptor
> 	local_newest = smp_load_acquire(newest);
>
> 	// a new descriptor is set as the newest
> 	// (not relevant here)
>
> 	// read descr->next of previous newest
> 	// (must be EOL!)
> 	local_next = READ_ONCE(*d_next);
> }
>
> exists (1:local_newest=1 /\ 1:local_next=0)

I'm having trouble connecting your P1's READ_ONCE() to the actual code.

You say that is in the same function, but I cannot find a LOAD there
that would care about the ACQUIRE.

Afaict prb_list is a list head not a list node (calling it just _list is
confusing at best).

You have a single linked list going from the tail to the head, while
adding to the head and removing from the tail. And that sounds like a
FIFO queue:

struct lqueue_head {
	struct lqueue_node *head, *tail;
};

struct lqueue_node {
	struct lqueue_node *next;
};

void lqueue_push(struct lqueue_head *h, struct lqueue_node *n)
{
	struct lqueue_node *prev;

	n->next = NULL;
	/*
	 * xchg() implies RELEASE; and thereby ensures @n is
	 * complete before getting published.
	 */
	prev = xchg(&h->head, n);
	/*
	 * xchg() implies ACQUIRE; and thereby ensures @tail is
	 * written after @head, see lqueue_pop()'s smp_rmb().
	 */
	if (prev)
		WRITE_ONCE(prev->next, n);
	else
		WRITE_ONCE(h->tail, n);
}

struct lqueue_node *lqueue_pop(struct lqueue_head *h)
{
	struct lqueue_node *head, *tail, *next;

	do {
		tail = READ_ONCE(h->tail);
		/* If the list is empty, nothing to remove. */
		if (!tail)
			return NULL;

		/*
		 * If we see @tail, we must then also see @head.
		 * Pairs with the xchg() in lqueue_push(),
		 * ensure no false positive on the singleton
		 * test below.
		 */
		smp_rmb();
		head = READ_ONCE(h->head);

		/* If there is but one item; fail to remove. */
		if (head == tail)
			return NULL;

		next = smp_cond_load_relaxed(&tail->next, VAL);

	} while (cmpxchg(&h->tail, tail, next) != tail);

	return tail;
}

Now, you appear to be using desc_ids instead of pointers, but since
you're not using the actual wrap value, I don't see the benefit of using
those IDs over straight pointers.

That is, unless I've overlooked some subtle ABA issue, but then, your
code doesn't seem to mention that, and I think we're good because if we
re-use an entry, it can never get back in the same location, since we
never allow an empty list (might also be fixable, haven't thought too
hard on this).

That said, the above has cmpxchg() vs WRITE_ONCE() and is therefore not
safe on a number of our architectures. We can either not care about
performance and use xchg() for the ->tail store, or use atomic_long_t
and suffer ugly casting.

But the above is, IMO, a more useful and readable abstraction. Let me
continue in another email (probably tomorrow).

2019-06-26 22:50:27

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Fri, Jun 21, 2019 at 12:23:19AM +0200, John Ogness wrote:
> On 2019-06-18, Peter Zijlstra <[email protected]> wrote:
> >> +
> >> + if (unlikely(newest_id == EOL)) {
> >> + /* no previous newest means we *are* the list, set oldest */
> >> +
> >> + /*
> >> + * MB UNPAIRED
> >
> > That's a bug, MB must always be paired.
>
> Well, it "pairs" with the smp_rmb() of the readers, but I didn't think
> that counts as a pair. That's why I wrote unpaired. The litmus test is:
>
> P0(int *x, int *y)
> {
> WRITE_ONCE(*x, 1);
> smp_store_release(y, 1);
> }
>
> P1(int *x, int *y)
> {
> int rx;
> int ry;
>
> ry = READ_ONCE(*y);
> smp_rmb();
> rx = READ_ONCE(*x);
> }
>
> exists (1:rx=0 /\ 1:ry=1)
>
> The readers rely on the store_releases "pairing" with the smp_rmb() so
> that the readers see things in a sane order.

That is certainly a valid pairing, see also the 'SMP BARRIER PAIRING'
section in memory-barriers.txt (I thought we had a table in there, but
apparently that never quite made it in).

2019-06-27 08:30:01

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Wed 2019-06-26 23:43:56, John Ogness wrote:
> Here is where I have massive problems communicating. I don't understand
> why you say the barrier is _between_ newest and next. I would say the
> barrier is _on_ newest to _synchronize_ with next (or something). I am
> struggling with terminology. (To be honest, I'd much rather just post
> litmus tests.)
>
> For example, if we have:
>
> WRITE_ONCE(a, 1);
> WRITE_ONCE(b, 1);
> WRITE_ONCE(c, 1);
> smp_store_release(&d, 1);
>
> and:
>
> local_d = smp_load_acquire(&d);
> local_a = READ_ONCE(a);
> local_b = READ_ONCE(b);
> local_c = READ_ONCE(c);
>
> How do you describe that? Do you say the memory barrier is between a and
> d? Or between a, b, c, d? (a, b, c aren't ordered, but they are one-way
> synchronized with d).
>
> I would say there is a barrier on d to synchronize a, b, c.

Barriers are always paired. We need to know which variables are
synchronized against each other, what the reason is, and where
the counterpart is.

I think that it might be done many ways. I am familiar with the
barriers in the kernel/livepatch/ code. They use a rather long
description, but I find it pretty useful, especially when the
problem is complicated and more barriers are involved in a single
transition.

Best Regards,
Petr

2019-06-27 18:33:52

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Thu, Jun 27, 2019 at 12:40:34AM +0200, Peter Zijlstra wrote:
> You have a single linked list going from the tail to the head, while
> adding to the head and removing from the tail. And that sounds like a
> FIFO queue:
>
> struct lqueue_head {
> struct lqueue_node *head, *tail;
> };
>
> struct lqueue_node {
> struct lqueue_node *next;
> };
>
> void lqueue_push(struct lqueue_head *h, struct lqueue_node *n)
> {
> struct lqueue_node *prev;
>
> n->next = NULL;
> /*
> * xchg() implies RELEASE; and thereby ensures @n is
> * complete before getting published.
> */
> prev = xchg(&h->head, n);
> /*
> * xchg() implies ACQUIRE; and thereby ensures @tail is
> * written after @head, see lqueue_pop()'s smp_rmb().
> */
> if (prev)
> WRITE_ONCE(prev->next, n);
> else
> WRITE_ONCE(h->tail, n);
> }
>
> struct lqueue_node *lqueue_pop(struct lqueue_head *h)
> {
> struct lqueue_node *head, *tail, *next;
>
> do {
> tail = READ_ONCE(h->tail);
> /* If the list is empty, nothing to remove. */
> if (!tail)
> return NULL;
>
> /*
> * If we see @tail, we must then also see @head.
> * Pairs with the xchg() in lqueue_push(),
> * ensure no false positive on the singleton
> * test below.

or is it a false negative? I'm too tired to think straight. What can
happen without the rmb is that the head load can get hoisted over the
tail load and then observe a NULL head and a !NULL tail, and thus head !=
tail, and we think there are multiple entries on the list and stuff goes
wobbly.

> */
> smp_rmb();
> head = READ_ONCE(h->head);
>
> /* If there is but one item; fail to remove. */
> if (head == tail)
> return NULL;
>
> next = smp_cond_load_relaxed(&tail->next, VAL);
>
> } while (cmpxchg(&h->tail, tail, next) != tail);
>
> return tail;
> }

2019-06-28 09:52:55

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-27, Peter Zijlstra <[email protected]> wrote:
>> This is a long response, but we are getting into some fine details
>> about the memory barriers (as well as battling my communication skill
>> level).
>
> So I'm going to reply piecewise to this... so not such long emails,
> but more of them.

I am not sure if I should wait to respond until you've finished going
through the full patch. I will respond to this email and if you would
like me to wait on further responses, just let me know. Thanks.

>>>> +static void add_descr_list(struct prb_reserved_entry *e)
>>>> +{
>>>> + struct printk_ringbuffer *rb = e->rb;
>>>> + struct prb_list *l = &rb->descr_list;
>>>> + struct prb_descr *d = e->descr;
>>>> + struct prb_descr *newest_d;
>>>> + unsigned long newest_id;
>>>> +
>>>> + /* set as newest */
>>>> + do {
>>>> + /* MB5: synchronize add descr */
>>>> + newest_id = smp_load_acquire(&l->newest);
>>>> + newest_d = TO_DESCR(rb, newest_id);
>>>> +
>>>> + if (newest_id == EOL)
>>>> + WRITE_ONCE(d->seq, 1);
>>>> + else
>>>> + WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
>>>> + /*
>>>> + * MB5: synchronize add descr
>>>> + *
>>>> + * In particular: next written before cmpxchg
>>>> + */
>>>> + } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
>>>
>>> What does this pair with? I find ->newest usage in:
>>
>> It is pairing with the smp_load_acquire() at the beginning of this
>> loop (also labeled MB5) that is running simultaneously on another
>> CPU. I am avoiding a possible situation that a new descriptor is
>> added but the store of "next" from the previous descriptor is not yet
>> visible and thus the cmpxchg following will fail, which is not
>> allowed. (Note that "next" is set to EOL shortly before this function
>> is called.)
>>
>> The litmus test for this is:
>>
>> P0(int *newest, int *d_next)
>> {
>> // set descr->next to EOL (terminates list)
>> WRITE_ONCE(*d_next, 1);
>>
>> // set descr as newest
>> smp_store_release(newest, 1);
>> }
>>
>> P1(int *newest, int *d_next)
>> {
>> int local_newest;
>> int local_next;
>>
>> // get newest descriptor
>> local_newest = smp_load_acquire(newest);
>>
>> // a new descriptor is set as the newest
>> // (not relevant here)
>>
>> // read descr->next of previous newest
>> // (must be EOL!)
>> local_next = READ_ONCE(*d_next);
>> }
>>
>> exists (1:local_newest=1 /\ 1:local_next=0)
>
> I'm having trouble connecting your P1's READ_ONCE() to the actual
> code. You say that is in the same function, but I cannot find a LOAD
> there that would care about the ACQUIRE.

P1's READ_ONCE() is the READ part of the cmpxchg a few lines below:

WARN_ON_ONCE(cmpxchg_release(&newest_d->next,
EOL, e->id) != EOL);

Note that the cmpxchg is a _release because of MB6 (a different memory
barrier pair). But only the READ part of that cmpxchg synchronizes with
MB5.

Also note that cmpxchg is used only because of bug checking. If instead
it becomes a blind store (such as you suggest below), then it changes to
smp_store_release().

While investigating this (and the lack of a LOAD), I realized that the
smp_load_acquire() is not needed because @seq is dependent on the load
of @newest. I have implemented and tested these changes. I also added
setting the list terminator to this function, since all callers would
have to do it anyway. Also, I spent a lot of time trying to put in
comments that I think are _understandable_ and _acceptable_.

@Peter: I expect they are way too long for you.

@Andrea: Is this starting to become something that you would like to
see?

/**
 * add_descr_list() - Add a descriptor to the descriptor list.
 *
 * @e: An entry that has already reserved data.
 *
 * The provided entry contains a pointer to a descriptor that has already
 * been reserved for this entry. However, the reserved descriptor is not
 * yet on the list. Add this descriptor as the newest item.
 *
 * A descriptor is added in two steps. The first step is to make this
 * descriptor the newest. The second step is to update @next of the former
 * newest descriptor to point to this one (or set @oldest to this one if
 * this will be the first descriptor on the list).
 */
static void add_descr_list(struct prb_reserved_entry *e)
{
	struct printk_ringbuffer *rb = e->rb;
	struct prb_list *l = &rb->descr_list;
	struct prb_descr *d = e->descr;
	struct prb_descr *newest_d;
	unsigned long newest_id;

	WRITE_ONCE(d->next, EOL);

	do {
		newest_id = READ_ONCE(l->newest);
		newest_d = TO_DESC(rb, newest_id);

		if (newest_id == EOL) {
			WRITE_ONCE(d->seq, 1);
		} else {
			/*
			 * MB5-read: synchronize setting newest descr
			 *
			 * context-pair: 2 writers adding a descriptor via
			 * add_descr_list().
			 *
			 * @newest will load before @seq due to a data
			 * dependency, therefore, the stores of @seq
			 * and @next from the pairing MB5-write context
			 * will be visible.
			 *
			 * Although @next is not loaded by this context,
			 * this context must overwrite the stored @next
			 * value of the pairing MB5-write context.
			 */
			WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
		}

		/*
		 * MB5-write: synchronize setting newest descr
		 *
		 * context-pair: 2 writers adding a descriptor via
		 * add_descr_list().
		 *
		 * Ensure that @next and @seq are stored before @d is
		 * visible via @newest. The pairing MB5-read context
		 * must load this @seq value and must overwrite this
		 * @next value.
		 */
	} while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);

	if (unlikely(newest_id == EOL)) {
		/*
		 * MB0-write: synchronize adding first descr
		 *
		 * context-pair: 1 writer adding the first descriptor via
		 * add_descr_list(), 1 reader getting the beginning of
		 * the list via iter_peek_next_id().
		 *
		 * This context recently assigned new values for @id,
		 * @next, @seq. Ensure these are stored before the first
		 * store to @oldest so that the new values are visible
		 * to the reader in the pairing MB0-read context.
		 *
		 * Note: Before this store, the value of @oldest is EOL.
		 */
		smp_store_release(&l->oldest, e->id);
	} else {
		/*
		 * MB6-write: synchronize linking new descr
		 *
		 * context-pair-1: 1 writer adding a descriptor via
		 * add_descr_list(), 1 writer removing a descriptor via
		 * remove_oldest_descr().
		 *
		 * If this is a recycled descriptor, this context
		 * recently stored a new @oldest value. Ensure that
		 * @oldest is stored before storing @next so that
		 * if the pairing MB6-read context sees a non-EOL
		 * @next value, it is ensured that it will also see
		 * an updated @oldest value.
		 *
		 * context-pair-2: 1 writer adding a descriptor via
		 * add_descr_list(), 1 reader iterating the list via
		 * prb_iter_next_valid_entry().
		 *
		 * This context recently assigned new values for @id,
		 * @next, @seq, @data, @data_next. Ensure these are
		 * stored before storing @next of the previously
		 * newest descriptor so that the new values are
		 * visible to the iterating reader in the pairing
		 * MB6-read context.
		 *
		 * Note: Before this store, the value of @next of the
		 * previously newest descriptor is EOL.
		 */
		smp_store_release(&newest_d->next, e->id);
	}
}

The smp_rmb() calls in the reader functions are then commented and
marked with the appropriate MB0-read and MB6-read labels.

> Afaict prb_list is a list head not a list node (calling it just _list
> is confusing at best).

OK.

> You have a single linked list going from the tail to the head, while
> adding to the head and removing from the tail. And that sounds like a
> FIFO queue:

Yes, but with one important feature: the nodes in the FIFO queue are
labeled with ordered sequence numbers. This is important for printk. I
talk more about this below.

> struct lqueue_head {
> struct lqueue_node *head, *tail;
> };
>
> struct lqueue_node {
> struct lqueue_node *next;
> };
>
> void lqueue_push(struct lqueue_head *h, struct lqueue_node *n)
> {
> struct lqueue_node *prev;
>
> n->next = NULL;

Is this safe? Do all compilers understand that @next must be stored
before the xchg() of @head? I would have chosen WRITE_ONCE().

> /*
> * xchg() implies RELEASE; and thereby ensures @n is
> * complete before getting published.
> */
> prev = xchg(&h->head, n);

Unfortunately it is not that simple because of sequence numbers. A node
must be assigned a sequence number that is +1 of the previous node. This
must be done before exchanging the head because immediately after the
xchg() on the head, another CPU could then add on to us and expect our
sequence number to already be set.

This is why I need cmpxchg() here.
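
A hypothetical interleaving shows the window (names from the lqueue
sketch above; @seq here is an assumed addition to lqueue_node):

	CPU0					CPU1
	----					----
	prev = xchg(&h->head, n);
						prev = xchg(&h->head, m);
						m->seq = prev->seq + 1;
						// prev == n, but n->seq
						// has not been set yet
	n->seq = ...;	// too late

With cmpxchg(), @seq is computed and stored before the descriptor is
published as the head, so this window cannot open.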

> /*
> * xchg() implies ACQUIRE; and thereby ensures @tail is
> * written after @head, see lqueue_pop()'s smp_rmb().
> */
> if (prev)
> WRITE_ONCE(prev->next, n);

This needs to be a store_release() so that a reader cannot read @n while
the store to @next is not yet visible. The memory barriers of the above
xchg() do not apply here because readers never read @head.

> else
> WRITE_ONCE(h->tail, n);

Ditto, but for the tail node in particular.

> }
>
> struct lqueue_node *lqueue_pop(struct lqueue_head *h)
> {
> struct lqueue_node *head, *tail, *next;
>
> do {
> tail = READ_ONCE(h->tail);
> /* If the list is empty, nothing to remove. */
> if (!tail)
> return NULL;
>
> /*
> * If we see @tail, we must then also see @head.
> * Pairs with the xchg() in lqueue_push(),
> * ensure no false positive on the singleton
> * test below.
> */
> smp_rmb();
> head = READ_ONCE(h->head);
>
> /* If there is but one item; fail to remove. */
> if (head == tail)
> return NULL;
>
> next = smp_cond_load_relaxed(&tail->next, VAL);

What if a writer is adding a 2nd node to the queue and is interrupted by
an NMI directly after the xchg() in lqueue_push()? Then we have:

* head != tail
* tail->next == NULL

If that interrupting NMI calls lqueue_pop(), the NMI will spin
forever. The following cmpxchg() is not allowed to happen as long as
tail->next is NULL.

This is why I synchronize on @next instead, using (tail && !tail->next)
for the singleton test.
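
A sketch of the pop with that singleton test, using the lqueue types
above (a sketch of the intent, not the actual patch code):

	struct lqueue_node *lqueue_pop(struct lqueue_head *h)
	{
		struct lqueue_node *tail, *next;

		do {
			tail = READ_ONCE(h->tail);
			/* If the list is empty, nothing to remove. */
			if (!tail)
				return NULL;

			/*
			 * Singleton, or a second node whose link is
			 * not yet visible: fail instead of spinning,
			 * so an interrupting NMI cannot livelock here.
			 */
			next = READ_ONCE(tail->next);
			if (!next)
				return NULL;
		} while (cmpxchg(&h->tail, tail, next) != tail);

		return tail;
	}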

> > } while (cmpxchg(&h->tail, tail, next) != tail);
>
> return tail;
> }
>
> Now, you appear to be using desc_ids instead of pointers, but since
> you're not using the actual wrap value; I don't see the benefit of
> using those IDs over straight pointers.

The documentation mentions that descriptor ids are used to identify
pointers to invalid descriptors. This is used by the readers, see
iter_peek_next_id() and prb_iter_next_valid_entry().

IDs are used for:

- @next of descriptors on the list
- @id, @id_next in the reader iterator
- @id in the data blocks

If changed to pointers, iterators would need to additionally store @seq
values to be able to identify if the entry they are pointing to is the
entry they expect.

The only advantage I see with pointers is that the ringbuffer could be
more useful generally, independent of whether the data is separate or
within the nodes or if the nodes are statically or dynamically
allocated. That is something worth having, even if it is not printk
related.

Are you implicitly requesting me to split the prb_ringbuffer and instead
base it on a new "lockless multi-writer multi-reader sequenced FIFO
queue" data structure?

> That is, unless I've overlooked some subtle ABA issue, but then, your
> code doesn't seem to mention that, and I think we're good because if
> we re-use an entry, it can never get back in the same location, since
> we never allow an empty list

I do not understand what you mean here. If a reader has a pointer to an
entry, the entry behind that pointer can certainly change. But that
isn't a problem. The reader will recognize that.

> (might also be fixable, haven't thought too hard on this).

:-)

> That said, the above has cmpxchg() vs WRITE_ONCE() and is therefore
> not safe on a number of our architectures. We can either not care
> about performance and use xchg() for the ->tail store, or use
> atomic_long_t and suffer ugly casting.

cmpxchg_release() vs WRITE_ONCE() is not safe?! Can you point me to
documentation about this?

> But the above is, IMO, a more useful and readable abstraction. Let me
> continue in another email (probably tomorrow).

Thank you for taking the time for this.

John Ogness

2019-06-28 15:46:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Fri, Jun 28, 2019 at 11:50:33AM +0200, John Ogness wrote:
> On 2019-06-27, Peter Zijlstra <[email protected]> wrote:

> >>>> +static void add_descr_list(struct prb_reserved_entry *e)
> >>>> +{
> >>>> + struct printk_ringbuffer *rb = e->rb;
> >>>> + struct prb_list *l = &rb->descr_list;
> >>>> + struct prb_descr *d = e->descr;
> >>>> + struct prb_descr *newest_d;
> >>>> + unsigned long newest_id;
> >>>> +
> >>>> + /* set as newest */
> >>>> + do {
> >>>> + /* MB5: synchronize add descr */
> >>>> + newest_id = smp_load_acquire(&l->newest);
> >>>> + newest_d = TO_DESCR(rb, newest_id);
> >>>> +
> >>>> + if (newest_id == EOL)
> >>>> + WRITE_ONCE(d->seq, 1);
> >>>> + else
> >>>> + WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
> >>>> + /*
> >>>> + * MB5: synchronize add descr
> >>>> + *
> >>>> + * In particular: next written before cmpxchg
> >>>> + */
> >>>> + } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
> >>>
> >>> What does this pair with? I find ->newest usage in:
> >>
> >> It is pairing with the smp_load_acquire() at the beginning of this
> >> loop (also labeled MB5) that is running simultaneously on another
> >> CPU. I am avoiding a possible situation that a new descriptor is
> >> added but the store of "next" from the previous descriptor is not yet
> >> visible and thus the cmpxchg following will fail, which is not
> >> allowed. (Note that "next" is set to EOL shortly before this function
> >> is called.)
> >>
> >> The litmus test for this is:
> >>
> >> P0(int *newest, int *d_next)
> >> {
> >> // set descr->next to EOL (terminates list)
> >> WRITE_ONCE(*d_next, 1);
> >>
> >> // set descr as newest
> >> smp_store_release(newest, 1);
> >> }
> >>
> >> P1(int *newest, int *d_next)
> >> {
> >> int local_newest;
> >> int local_next;
> >>
> >> // get newest descriptor
> >> local_newest = smp_load_acquire(newest);
> >>
> >> // a new descriptor is set as the newest
> >> // (not relevant here)
> >>
> >> // read descr->next of previous newest
> >> // (must be EOL!)
> >> local_next = READ_ONCE(*d_next);
> >> }
> >>
> >> exists (1:local_newest=1 /\ 1:local_next=0)
> >
> > I'm having trouble connecting your P1's READ_ONCE() to the actual
> > code. You say that is in the same function, but I cannot find a LOAD
> > there that would care about the ACQUIRE.
>
> P1's READ_ONCE() is the READ part of the cmpxchg a few lines below:
>
> WARN_ON_ONCE(cmpxchg_release(&newest_d->next,
> EOL, e->id) != EOL);
>
> Note that the cmpxchg is a _release because of MB6 (a different memory
> barrier pair). But only the READ part of that cmpxchg synchronizes with
> MB5.
>
> Also note that cmpxchg is used only because of bug checking. If instead
> it becomes a blind store (such as you suggest below), then it changes to
> smp_store_release().
>
> While investigating this (and the lack of a LOAD), I realized that the
> smp_load_acquire() is not needed because @seq is dependent on the load
> of @newest.

That!

> I have implemented and tested these changes. I also added
> setting the list terminator to this function, since all callers would
> have to do it anyway. Also, I spent a lot of time trying to put in
> comments that I think are _understandable_ and _acceptable_.
>
> @Peter: I expect they are way too long for you.
>
> @Andrea: Is this starting to become something that you would like to
> see?
>
> /**
> * add_descr_list() - Add a descriptor to the descriptor list.
> *
> * @e: An entry that has already reserved data.
> *
> * The provided entry contains a pointer to a descriptor that has already
> * been reserved for this entry. However, the reserved descriptor is not
> * yet on the list. Add this descriptor as the newest item.
> *
> * A descriptor is added in two steps. The first step is to make this
> * descriptor the newest. The second step is to update @next of the former
> * newest descriptor to point to this one (or set @oldest to this one if
> * this will be the first descriptor on the list).
> */

I still think it might be useful to explicitly call out the data
structure more. Even if you cannot use a fully abstracted queue.

Also, newest/oldest just looks weird to me; I'm expecting head/tail.

> static void add_descr_list(struct prb_reserved_entry *e)
> {
> struct printk_ringbuffer *rb = e->rb;
> struct prb_list *l = &rb->descr_list;
> struct prb_descr *d = e->descr;
> struct prb_descr *newest_d;
> unsigned long newest_id;
>
> WRITE_ONCE(d->next, EOL);
>
> do {
> newest_id = READ_ONCE(l->newest);
> newest_d = TO_DESC(rb, newest_id);
>
> if (newest_id == EOL) {
> WRITE_ONCE(d->seq, 1);
> } else {
> /*
> * MB5-read: synchronize setting newest descr
> *
> * context-pair: 2 writers adding a descriptor via
> * add_descr_list().
> *
> * @newest will load before @seq due to a data
> * dependency, therefore, the stores of @seq
> * and @next from the pairing MB5-write context
> * will be visible.
> *
> * Although @next is not loaded by this context,
> * this context must overwrite the stored @next
> * value of the pairing MB5-write context.
> */
> WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
> }
>
> /*
> * MB5-write: synchronize setting newest descr
> *
> * context-pair: 2 writers adding a descriptor via
> * add_descr_list().
> *
> * Ensure that @next and @seq are stored before @d is
> * visible via @newest. The pairing MB5-read context
> * must load this @seq value and must overwrite this
> * @next value.
> */
> } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
>
> if (unlikely(newest_id == EOL)) {
> /*
> * MB0-write: synchronize adding first descr
> *
> * context-pair: 1 writer adding the first descriptor via
> * add_descr_list(), 1 reader getting the beginning of
> * the list via iter_peek_next_id().
> *
> * This context recently assigned new values for @id,
> * @next, @seq. Ensure these are stored before the first
> * store to @oldest so that the new values are visible
> * to the reader in the pairing MB0-read context.
> *
> * Note: Before this store, the value of @oldest is EOL.
> */
> smp_store_release(&l->oldest, e->id);
> } else {
> /*
> * MB6-write: synchronize linking new descr
> *
> * context-pair-1: 1 writer adding a descriptor via
> * add_descr_list(), 1 writer removing a descriptor via
> * remove_oldest_descr().
> *
> * If this is a recycled descriptor, this context
> * recently stored a new @oldest value. Ensure that
> * @oldest is stored before storing @next so that
> * if the pairing MB6-read context sees a non-EOL
> * @next value, it is ensured that it will also see
> * an updated @oldest value.
> *
> * context-pair-2: 1 writer adding a descriptor via
> * add_descr_list(), 1 reader iterating the list via
> * prb_iter_next_valid_entry().
> *
> * This context recently assigned new values for @id,
> * @next, @seq, @data, @data_next. Ensure these are
> * stored before storing @next of the previously
> * newest descriptor so that the new values are
> * visible to the iterating reader in the pairing
> * MB6-read context.
> *
> * Note: Before this store, the value of @next of the
> * previously newest descriptor is EOL.
> */
> smp_store_release(&newest_d->next, e->id);
> }
> }
>
> The smp_rmb() calls in the reader functions are then commented and
> marked with the appropriate MB0-read and MB6-read labels.
>
> > Afaict prb_list is a list head not a list node (calling it just _list
> > is confusing at best).
>
> OK.
>
> > You have a single linked list going from the tail to the head, while
> > adding to the head and removing from the tail. And that sounds like a
> > FIFO queue:
>
> Yes, but with one important feature: the nodes in the FIFO queue are
> labeled with ordered sequence numbers. This is important for printk. I
> talk more about this below.

But nowhere did/do you say what the actual data structure is, with what
modification for which reason.

> > struct lqueue_head {
> > struct lqueue_node *head, *tail;
> > };
> >
> > struct lqueue_node {
> > struct lqueue_node *next;
> > };
> >
> > void lqueue_push(struct lqueue_head *h, struct lqueue_node *n)
> > {
> > struct lqueue_node *prev;
> >
> > n->next = NULL;
>
> Is this safe? Do all compilers understand that @next must be stored
> before the xchg() of @head? I would have chosen WRITE_ONCE().

Yep, xchg() implies an smp_mb() before and after; smp_mb() in turn
implies a compiler barrier. Even if there is compiler-induced brain
damage (store-tearing), all that must be done before the actual RmW.

Same with xchg_release(), the RELEASE is sufficient to have all previous
stores complete before the RmW.

> > /*
> > * xchg() implies RELEASE; and thereby ensures @n is
> > * complete before getting published.
> > */
> > prev = xchg(&h->head, n);
>
> Unfortunately it is not that simple because of sequence numbers. A node
> must be assigned a sequence number that is +1 of the previous node. This
> must be done before exchanging the head because immediately after the
> xchg() on the head, another CPU could then add on to us and expect our
> sequence number to already be set.
>
> This is why I need cmpxchg() here.

So far that doesn't make sense yet; surely the +1 is implicit in the
list order. But yes, if you need the seq like that, then cmpxchg it is.

> > /*
> > * xchg() implies ACQUIRE; and thereby ensures @tail is
> > * written after @head, see lqueue_pop()'s smp_rmb().
> > */
> > if (prev)
> > WRITE_ONCE(prev->next, n);
>
> This needs to be a store_release() so that a reader cannot read @n while
> the store to @next is not yet visible. The memory barriers of the above
> xchg() do not apply here because readers never read @head.

Confused; all stores to @n are before the xchg(), so the barrier from
xchg() also orders those stores and this store.

> > else
> > WRITE_ONCE(h->tail, n);
>
> Ditto, but for the tail node in particular.
>
> > }
> >
> > struct lqueue_node *lqueue_pop(struct lqueue_head *h)
> > {
> > struct lqueue_node *head, *tail, *next;
> >
> > do {
> > tail = READ_ONCE(h->tail);
> > /* If the list is empty, nothing to remove. */
> > if (!tail)
> > return NULL;
> >
> > /*
> > * If we see @tail, we must then also see @head.
> > * Pairs with the xchg() in lqueue_push(),
> > * ensure no false positive on the singleton
> > * test below.
> > */
> > smp_rmb();
> > head = READ_ONCE(h->head);
> >
> > /* If there is but one item; fail to remove. */
> > if (head == tail)
> > return NULL;
> >
> > next = smp_cond_load_relaxed(&tail->next, VAL);
>
> What if a writer is adding a 2nd node to the queue and is interrupted by
> an NMI directly after the xchg() in lqueue_push()? Then we have:
>
> * head != tail
> * tail->next == NULL
>
> If that interrupting NMI calls lqueue_pop(), the NMI will spin
> forever. The following cmpxchg() is not allowed to happen as long as
> tail->next is NULL.

Indeed. I forgot that you actually use pop on the producer side.

(Note that the qspinlock has a queue not unlike this, but that again
doesn't have to bother with NMIs)

> This is why I synchronize on @next instead, using (tail && !tail->next)
> for the singleton test.

OK.

> > } while (cmpxchg(&h->tail, tail, next) != tail);
> >
> > return tail;
> > }
> >
> > Now, you appear to be using desc_ids instead of pointers, but since
> > you're not using the actual wrap value; I don't see the benefit of
> > using those IDs over straight pointers.
>
> The documentation mentions that descriptor ids are used to identify
> pointers to invalid descriptors. This is used by the readers, see
> iter_peek_next_id() and prb_iter_next_valid_entry().
>
> IDs are used for:
>
> - @next of descriptors on the list
> - @id, @id_next in the reader iterator
> - @id in the data blocks
>
> If changed to pointers, iterators would need to additionally store @seq
> values to be able to identify if the entry they are pointing to is the
> entry they expect.
>
> The only advantage I see with pointers is that the ringbuffer could be
> more useful generally, independent of whether the data is separate or
> within the nodes or if the nodes are statically or dynamically
> allocated. That is something worth having, even if it is not printk
> related.
>
> Are you implicitly requesting me to split the prb_ringbuffer and instead
> base it on a new "lockless multi-writer multi-reader sequenced FIFO
> queue" data structure?

Not specifically; I was just trying to untangle the code and found a
queue. I still (sorry!) haven't gotten through the lot of it to see how
all the parts fit together.

> > That is, unless I've overlooked some subtle ABA issue, but then, your
> > code doesn't seem to mention that, and I think we're good because if
> > we re-use an entry, it can never get back in the same location, since
> > we never allow an empty list
>
> I do not understand what you mean here. If a reader has a pointer to an
> entry, the entry behind that pointer can certainly change. But that
> isn't a problem. The reader will recognize that.

ABA is where a cmpxchg has a false positive due to values matching but
not the structure.

For example, in the above pop, if h->tail would (again) be @tail, but
@next would not be the correct value. Something like that could happen
if before the cmpxchg a concurrent pop takes the element off and then
sticks it back on, but with a different ->next.

Then our cmpxchg will succeed and corrupt the queue.
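
Spelled out against lqueue_pop() (a hypothetical interleaving):

	CPU0					CPU1
	----					----
	tail = READ_ONCE(h->tail);	// T
	next = ... T->next ...;		// N
						pops T (h->tail = N)
						pops N
						pushes T back; now
						T->next = M (!= N)
	cmpxchg(&h->tail, T, N)
	// matches T and succeeds, but
	// N is no longer T's successor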

> > (might also be fixable, haven't thought too hard on this).
>
> :-)
>
> > That said, the above has cmpxchg() vs WRITE_ONCE() and is therefore
> > not safe on a number of our architectures. We can either not care
> > about performance and use xchg() for the ->tail store, or use
> > atomic_long_t and suffer ugly casting.
>
> cmpxchg_release() vs WRITE_ONCE() is not safe?! Can you point me to
> documentation about this?

Documentation/atomic_t.txt has this, see the SEMANTICS section on
atomic-set.

2019-06-28 16:08:27

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Fri, Jun 28, 2019 at 05:44:35PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 28, 2019 at 11:50:33AM +0200, John Ogness wrote:

> > cmpxchg_release() vs WRITE_ONCE() is not safe?! Can you point me to
> > documentation about this?
>
> Documentation/atomic_t.txt has this, see the SEMANTICS section on
> atomic-set.

Also see: arch/parisc/lib/bitops.c for one such case.
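
On such architectures the atomic RmW ops are emulated with a hashed
array of spinlocks, roughly (simplified from that file):

	u64 __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new)
	{
		unsigned long flags;
		u64 prev;

		_atomic_spin_lock_irqsave(ptr, flags);	/* hashed lock */
		if ((prev = *ptr) == old)
			*ptr = new;
		_atomic_spin_unlock_irqrestore(ptr, flags);
		return prev;
	}

A plain WRITE_ONCE() to the same variable does not take the hashed
lock, so its store can slip between the load and the store above and
be lost.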

2019-06-29 21:05:57

by Andrea Parri

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

> /**
> * add_descr_list() - Add a descriptor to the descriptor list.
> *
> * @e: An entry that has already reserved data.
> *
> * The provided entry contains a pointer to a descriptor that has already
> * been reserved for this entry. However, the reserved descriptor is not
> * yet on the list. Add this descriptor as the newest item.
> *
> * A descriptor is added in two steps. The first step is to make this
> * descriptor the newest. The second step is to update @next of the former
> * newest descriptor to point to this one (or set @oldest to this one if
> * this will be the first descriptor on the list).
> */
> static void add_descr_list(struct prb_reserved_entry *e)
> {
> struct printk_ringbuffer *rb = e->rb;
> struct prb_list *l = &rb->descr_list;
> struct prb_descr *d = e->descr;
> struct prb_descr *newest_d;
> unsigned long newest_id;
>
> WRITE_ONCE(d->next, EOL);

/* C */


>
> do {
> newest_id = READ_ONCE(l->newest);

/* A */


> newest_d = TO_DESC(rb, newest_id);
>
> if (newest_id == EOL) {
> WRITE_ONCE(d->seq, 1);
> } else {
> /*
> * MB5-read: synchronize setting newest descr
> *
> * context-pair: 2 writers adding a descriptor via
> * add_descr_list().
> *
> * @newest will load before @seq due to a data
> * dependency, therefore, the stores of @seq
> * and @next from the pairing MB5-write context
> * will be visible.
> *
> * Although @next is not loaded by this context,
> * this context must overwrite the stored @next
> * value of the pairing MB5-write context.
> */
> WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);

/* B: this READ_ONCE() */

Hence you're claiming a data dependency from A to B. (FWIW, the LKMM
would call "A ->dep B" an "address dependency.)

This comment also claims that the "pairing MB5-write" orders "stores
of @seq and @next" (which are to different memory locations w.r.t. A
and B): I do not get why this access to @next (C above?, that's also
"unordered" w.r.t. A) can be relevant; can you elaborate?


> }
>
> /*
> * MB5-write: synchronize setting newest descr
> *
> * context-pair: 2 writers adding a descriptor via
> * add_descr_list().
> *
> * Ensure that @next and @seq are stored before @d is
> * visible via @newest. The pairing MB5-read context
> * must load this @seq value and must overwrite this
> * @next value.
> */
> } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
>
> if (unlikely(newest_id == EOL)) {
> /*
> * MB0-write: synchronize adding first descr
> *
> * context-pair: 1 writer adding the first descriptor via
> * add_descr_list(), 1 reader getting the beginning of
> * the list via iter_peek_next_id().
> *
> * This context recently assigned new values for @id,
> * @next, @seq. Ensure these are stored before the first
> * store to @oldest so that the new values are visible
> * to the reader in the pairing MB0-read context.
> *
> * Note: Before this store, the value of @oldest is EOL.
> */

My gmail-search foo is unable to locate MB0-read: what am I missing?
Also, can you maybe annotate the memory accesses to @id, @next, @seq
and @oldest (as I did above)? I find myself guessing their location.


> smp_store_release(&l->oldest, e->id);
> } else {
> /*
> * MB6-write: synchronize linking new descr
> *
> * context-pair-1: 1 writer adding a descriptor via
> * add_descr_list(), 1 writer removing a descriptor via
> * remove_oldest_descr().
> *
> * If this is a recycled descriptor, this context
> * recently stored a new @oldest value. Ensure that
> * @oldest is stored before storing @next so that
> * if the pairing MB6-read context sees a non-EOL
> * @next value, it is ensured that it will also see
> * an updated @oldest value.
> *
> * context-pair-2: 1 writer adding a descriptor via
> * add_descr_list(), 1 reader iterating the list via
> * prb_iter_next_valid_entry().
> *
> * This context recently assigned new values for @id,
> * @next, @seq, @data, @data_next. Ensure these are
> * stored before storing @next of the previously
> * newest descriptor so that the new values are
> * visible to the iterating reader in the pairing
> * MB6-read context.
> *
> * Note: Before this store, the value of @next of the
> * previously newest descriptor is EOL.
> */

Same as above but for MB6-read and the accesses to @id, @next, @seq,
@data, @data_next.

In conclusion, I have been unable to produce litmus tests by reading
your comments (meaning I'm lost).

Thanks,
Andrea


> smp_store_release(&newest_d->next, e->id);
> }
> }
>
> The smp_rmb() calls in the reader functions are then commented and
> marked with the appropriate MB0-read and MB6-read labels.
>
> > Afaict prb_list is a list head not a list node (calling it just _list
> > is confusing at best).
>
> OK.
>
> > You have a single linked list going from the tail to the head, while
> > adding to the head and removing from the tail. And that sounds like a
> > FIFO queue:
>
> Yes, but with one important feature: the nodes in the FIFO queue are
> labeled with ordered sequence numbers. This is important for printk. I
> talk more about this below.
>
> > struct lqueue_head {
> > struct lqueue_node *head, *tail;
> > };
> >
> > struct lqueue_node {
> > struct lqueue_node *next;
> > };
> >
> > void lqueue_push(struct lqueue_head *h, struct lqueue_node *n)
> > {
> > struct lqueue_node *prev;
> >
> > n->next = NULL;
>
> Is this safe? Do all compilers understand that @next must be stored
> before the xchg() of @head? I would have chosen WRITE_ONCE().
>
> > /*
> > * xchg() implies RELEASE; and thereby ensures @n is
> > * complete before getting published.
> > */
> > prev = xchg(&h->head, n);
>
> Unfortunately it is not that simple because of sequence numbers. A node
> must be assigned a sequence number that is +1 of the previous node. This
> must be done before exchanging the head because immediately after the
> > xchg() on the head, another CPU could then add on to us and expect our
> sequence number to already be set.
>
> This is why I need cmpxchg() here.
>
> > /*
> > * xchg() implies ACQUIRE; and thereby ensures @tail is
> > * written after @head, see lqueue_pop()'s smp_rmb().
> > */
> > if (prev)
> > WRITE_ONCE(prev->next, n);
>
> This needs to be a store_release() so that a reader cannot read @n while
> the store to @next is not yet visible. The memory barriers of the above
> xchg() do not apply here because readers never read @head.
>
> > else
> > WRITE_ONCE(h->tail, n);
>
> Ditto, but for the tail node in particular.
>
> > }
> >
> > struct lqueue_node *lqueue_pop(struct lqueue_head *h)
> > {
> > struct lqueue_node *head, *tail, *next;
> >
> > do {
> > tail = READ_ONCE(h->tail);
> > /* If the list is empty, nothing to remove. */
> > if (!tail)
> > return NULL;
> >
> > /*
> > * If we see @tail, we must then also see @head.
> > * Pairs with the xchg() in lqueue_push(),
> > * ensure no false positive on the singleton
> > * test below.
> > */
> > smp_rmb();
> > head = READ_ONCE(h->head);
> >
> > /* If there is but one item; fail to remove. */
> > if (head == tail)
> > return NULL;
> >
> > next = smp_cond_load_relaxed(&tail->next, VAL);
>
> What if a writer is adding a 2nd node to the queue and is interrupted by
> an NMI directly after the xchg() in lqueue_push()? Then we have:
>
> * head != tail
> * tail->next == NULL
>
> If that interrupting NMI calls lqueue_pop(), the NMI will spin
> forever. The following cmpxchg() is not allowed to happen as long as
> tail->next is NULL.
>
> This is why I synchronize on @next instead, using (tail && !tail->next)
> for the singleton test.
>
> > } while (cmpxchg(&h->tail, tail, next) != tail);
> >
> > return tail;
> > }
> >
> > Now, you appear to be using desc_ids instead of pointers, but since
> > you're not using the actual wrap value; I don't see the benefit of
> > using those IDs over straight pointers.
>
> The documentation mentions that descriptor ids are used to identify
> pointers to invalid descriptors. This is used by the readers, see
> iter_peek_next_id() and prb_iter_next_valid_entry().
>
> IDs are used for:
>
> - @next of descriptors on the list
> - @id, @id_next in the reader iterator
> - @id in the data blocks
>
> If changed to pointers, iterators would need to additionally store @seq
> values to be able to identify if the entry they are pointing to is the
> entry they expect.
>
> The only advantage I see with pointers is that the ringbuffer could be
> more useful generally, independent of whether the data is separate or
> within the nodes or if the nodes are statically or dynamically
> allocated. That is something worth having, even if it is not printk
> related.
>
> Are you implicitly requesting me to split the prb_ringbuffer and instead
> base it on a new "lockless multi-writer multi-reader sequenced FIFO
> queue" data structure?
>
> > That is, unless I've overlooked some subtle ABA issue, but then, your
> > code doesn't seem to mention that, and I think we're good because if
> > we re-use an entry, it can never get back in the same location, since
> > we never allow an empty list
>
> I do not understand what you mean here. If a reader has a pointer to an
> entry, the entry behind that pointer can certainly change. But that
> isn't a problem. The reader will recognize that.
>
> > (might also be fixable, haven't thought too hard on this).
>
> :-)
>
> > That said, the above has cmpxchg() vs WRITE_ONCE() and is therefore
> > not safe on a number of our architectures. We can either not care
> > about performance and use xchg() for the ->tail store, or use
> > atomic_long_t and suffer ugly casting.
>
> cmpxchg_release() vs WRITE_ONCE() is not safe?! Can you point me to
> documentation about this?
>
> > But the above is, IMO, a more useful and readable abstraction. Let me
> > continue in another email (probably tomorrow).
>
> Thank you for taking the time for this.
>
> John Ogness

2019-06-30 02:05:01

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-29, Andrea Parri <[email protected]> wrote:
>> /**
>> * add_descr_list() - Add a descriptor to the descriptor list.
>> *
>> * @e: An entry that has already reserved data.
>> *
>> * The provided entry contains a pointer to a descriptor that has already
>> * been reserved for this entry. However, the reserved descriptor is not
>> * yet on the list. Add this descriptor as the newest item.
>> *
>> * A descriptor is added in two steps. The first step is to make this
>> * descriptor the newest. The second step is to update @next of the former
>> * newest descriptor to point to this one (or set @oldest to this one if
>> * this will be the first descriptor on the list).
>> */
>> static void add_descr_list(struct prb_reserved_entry *e)
>> {
>> struct printk_ringbuffer *rb = e->rb;
>> struct prb_list *l = &rb->descr_list;
>> struct prb_descr *d = e->descr;
>> struct prb_descr *newest_d;
>> unsigned long newest_id;
>>
>> WRITE_ONCE(d->next, EOL);
>
> /* C */
>
>
>>
>> do {
>> newest_id = READ_ONCE(l->newest);
>
> /* A */
>
>
>> newest_d = TO_DESC(rb, newest_id);
>>
>> if (newest_id == EOL) {
>> WRITE_ONCE(d->seq, 1);
>> } else {
>> /*
>> * MB5-read: synchronize setting newest descr
>> *
>> * context-pair: 2 writers adding a descriptor via
>> * add_descr_list().
>> *
>> * @newest will load before @seq due to a data
>> * dependency, therefore, the stores of @seq
>> * and @next from the pairing MB5-write context
>> * will be visible.
>> *
>> * Although @next is not loaded by this context,
>> * this context must overwrite the stored @next
>> * value of the pairing MB5-write context.
>> */
>> WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
>
> /* B: this READ_ONCE() */
>
> Hence you're claiming a data dependency from A to B. (FWIW, the LKMM
> would call "A ->dep B" an "address dependency".)
>
> This comment also claims that the "pairing MB5-write" orders "stores
> of @seq and @next" (which are to different memory locations w.r.t. A
> and B): I do not get why this access to @next (C above?, that's also
> "unordered" w.r.t. A) can be relevant; can you elaborate?

I will add some more labels to complete the picture. All these events
are within this function:

D: the WRITE_ONCE() to @seq

E: the STORE of a successful cmpxchg() for @newest (the MB5-write
cmpxchg())

F: the STORE of a new @next (the last smp_store_release() of this
function, note that the _release() is not relevant for this pair)

The significant events for 2 contexts that are accessing the same
addresses of a descriptor are:

P0(struct desc *d0)
{
// adding a new descriptor d0

WRITE_ONCE(d0->next, EOL); // C
WRITE_ONCE(d0->seq, X); // D
cmpxchg_release(newest, Y, indexof(d0)); // E
}

P1(struct desc *d1)
{
// adding a new descriptor d1 that comes after d0

struct desc *d0;
int r0, r1;

r0 = READ_ONCE(newest); // A
d0 = &array[r0];
r1 = READ_ONCE(d0->seq); // B
WRITE_ONCE(d0->next, Z); // F
}

d0 is the same address for P0 and P1. (The values of EOL, X, Y, Z are
unrelated and irrelevant.)

I am claiming that:

- B comes after D
- F comes after C

>> }
>>
>> /*
>> * MB5-write: synchronize setting newest descr
>> *
>> * context-pair: 2 writers adding a descriptor via
>> * add_descr_list().
>> *
>> * Ensure that @next and @seq are stored before @d is
>> * visible via @newest. The pairing MB5-read context
>> * must load this @seq value and must overwrite this
>> * @next value.
>> */
>> } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
>>
>> if (unlikely(newest_id == EOL)) {
>> /*
>> * MB0-write: synchronize adding first descr
>> *
>> * context-pair: 1 writer adding the first descriptor via
>> * add_descr_list(), 1 reader getting the beginning of
>> * the list via iter_peek_next_id().
>> *
>> * This context recently assigned new values for @id,
>> * @next, @seq. Ensure these are stored before the first
>> * store to @oldest so that the new values are visible
>> * to the reader in the pairing MB0-read context.
>> *
>> * Note: Before this store, the value of @oldest is EOL.
>> */
>
> My gmail-search foo is unable to locate MB0-read: what am I missing?
> Also, can you maybe annotate the memory accesses to @id, @next, @seq
> and @oldest (as I did above)? I find myself guessing their location.

Sorry. The MB0-read is a _new_ comment that would be added to the
smp_rmb() of the reader functions. I didn't repost everything because I
just wanted to get a feel if the comments for _this_ function are
improving. Really all I care about right now is properly documenting
MB5. It is a good example because MB5 is completely within this
function. If I can satisfactorily document MB5, then I can post a new
version with updated comments for everything.

>> smp_store_release(&l->oldest, e->id);
>> } else {
>> /*
>> * MB6-write: synchronize linking new descr
>> *
>> * context-pair-1: 1 writer adding a descriptor via
>> * add_descr_list(), 1 writer removing a descriptor via
>> * remove_oldest_descr().
>> *
>> * If this is a recycled descriptor, this context
>> * recently stored a new @oldest value. Ensure that
>> * @oldest is stored before storing @next so that
>> * if the pairing MB6-read context sees a non-EOL
>> * @next value, it is ensured that it will also see
>> * an updated @oldest value.
>> *
>> * context-pair-2: 1 writer adding a descriptor via
>> * add_descr_list(), 1 reader iterating the list via
>> * prb_iter_next_valid_entry().
>> *
>> * This context recently assigned new values for @id,
>> * @next, @seq, @data, @data_next. Ensure these are
>> * stored before storing @next of the previously
>> * newest descriptor so that the new values are
>> * visible to the iterating reader in the pairing
>> * MB6-read context.
>> *
>> * Note: Before this store, the value of @next of the
>> * previously newest descriptor is EOL.
>> */
>
> Same as above but for MB6-read and the accesses to @id, @next, @seq,
> @data, @data_next.
>
> In conclusion, I have been unable to produce litmus tests by reading
> your comments (meaning I'm lost).

I feel like I'm stating all the information, but nobody understands it.
If you can help me to correctly document MB5, I can submit a new version
with all the memory barriers correctly documented.

John Ogness

2019-06-30 14:09:27

by Andrea Parri

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Sun, Jun 30, 2019 at 04:03:34AM +0200, John Ogness wrote:
> On 2019-06-29, Andrea Parri <[email protected]> wrote:
> >> /**
> >> * add_descr_list() - Add a descriptor to the descriptor list.
> >> *
> >> * @e: An entry that has already reserved data.
> >> *
> >> * The provided entry contains a pointer to a descriptor that has already
> >> * been reserved for this entry. However, the reserved descriptor is not
> >> * yet on the list. Add this descriptor as the newest item.
> >> *
> >> * A descriptor is added in two steps. The first step is to make this
> >> * descriptor the newest. The second step is to update @next of the former
> >> * newest descriptor to point to this one (or set @oldest to this one if
> >> * this will be the first descriptor on the list).
> >> */
> >> static void add_descr_list(struct prb_reserved_entry *e)
> >> {
> >> struct printk_ringbuffer *rb = e->rb;
> >> struct prb_list *l = &rb->descr_list;
> >> struct prb_descr *d = e->descr;
> >> struct prb_descr *newest_d;
> >> unsigned long newest_id;
> >>
> >> WRITE_ONCE(d->next, EOL);
> >
> > /* C */
> >
> >
> >>
> >> do {
> >> newest_id = READ_ONCE(l->newest);
> >
> > /* A */
> >
> >
> >> newest_d = TO_DESC(rb, newest_id);
> >>
> >> if (newest_id == EOL) {
> >> WRITE_ONCE(d->seq, 1);
> >> } else {
> >> /*
> >> * MB5-read: synchronize setting newest descr
> >> *
> >> * context-pair: 2 writers adding a descriptor via
> >> * add_descr_list().
> >> *
> >> * @newest will load before @seq due to a data
> >> * dependency, therefore, the stores of @seq
> >> * and @next from the pairing MB5-write context
> >> * will be visible.
> >> *
> >> * Although @next is not loaded by this context,
> >> * this context must overwrite the stored @next
> >> * value of the pairing MB5-write context.
> >> */
> >> WRITE_ONCE(d->seq, READ_ONCE(newest_d->seq) + 1);
> >
> > /* B: this READ_ONCE() */
> >
> > Hence you're claiming a data dependency from A to B. (FWIW, the LKMM
> > would call "A ->dep B" an "address dependency".)
> >
> > This comment also claims that the "pairing MB5-write" orders "stores
> > of @seq and @next" (which are to different memory locations w.r.t. A
> > and B): I do not get why this access to @next (C above?, that's also
> > "unordered" w.r.t. A) can be relevant; can you elaborate?
>
> I will add some more labels to complete the picture. All these events
> are within this function:
>
> D: the WRITE_ONCE() to @seq
>
> E: the STORE of a successful cmpxchg() for @newest (the MB5-write
> cmpxchg())
>
> F: the STORE of a new @next (the last smp_store_release() of this
> function, note that the _release() is not relevant for this pair)
>
> The significant events for 2 contexts that are accessing the same
> addresses of a descriptor are:
>
> P0(struct desc *d0)
> {
> // adding a new descriptor d0
>
> WRITE_ONCE(d0->next, EOL); // C
> WRITE_ONCE(d0->seq, X); // D
> cmpxchg_release(newest, Y, indexof(d0)); // E
> }
>
> P1(struct desc *d1)
> {
> // adding a new descriptor d1 that comes after d0
>
> struct desc *d0;
> int r0, r1;
>
> r0 = READ_ONCE(newest); // A
> d0 = &array[r0];
> r1 = READ_ONCE(d0->seq); // B
> WRITE_ONCE(d0->next, Z); // F
> }
>
> d0 is the same address for P0 and P1. (The values of EOL, X, Y, Z are
> unrelated and irrelevant.)
>
> I am claiming that:
>
> - B comes after D
> - F comes after C

I think these are both assuming that A is reading the value stored by E
(shortly, "A reads from E")? If so, then the two claims become/are:

- If A reads from E, then B comes after D

- If A reads from E, then F comes after C

I think you could avoid the (ambiguous) "comes after" and say something
like:

(1) If A reads from E, then B reads from D (or from another store
to ->seq, not reported in the snippet, which overwrites D)

(2) If A reads from E, then F overwrites C

This, IIUC, for the informal descriptions of the (intended) guarantees.
Back to the pairings in question: AFAICT,

(a) For (1), we rely on the pairing:

RELEASE from D to E (matching) ADDRESS DEP. from A to B

(b) For (2), we rely on the pairing:

RELEASE from C to E (matching) ADDRESS DEP. from A to F

Does this make sense?
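
For instance, guarantee (1) could be expressed as a litmus test
along these lines (a sketch only: the cmpxchg_release() is modeled
as a plain smp_store_release() and the names are illustrative):

	C prb-add-descr-mb5

	{
		newest=other;
		other=0;
		seq=0;
	}

	P0(int *seq, int **newest)
	{
		WRITE_ONCE(*seq, 1);            /* D */
		smp_store_release(newest, seq); /* E */
	}

	P1(int **newest)
	{
		int *r0;
		int r1;

		r0 = READ_ONCE(*newest);        /* A */
		r1 = READ_ONCE(*r0);            /* B */
	}

	exists (1:r0=seq /\ 1:r1=0)

The LKMM should report the "exists" clause as not satisfiable:
whenever A reads from E, the address dependency from A to B
forces B to read from D (r1=1).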


>
> >> }
> >>
> >> /*
> >> * MB5-write: synchronize setting newest descr
> >> *
> >> * context-pair: 2 writers adding a descriptor via
> >> * add_descr_list().
> >> *
> >> * Ensure that @next and @seq are stored before @d is
> >> * visible via @newest. The pairing MB5-read context
> >> * must load this @seq value and must overwrite this
> >> * @next value.
> >> */
> >> } while (cmpxchg_release(&l->newest, newest_id, e->id) != newest_id);
> >>
> >> if (unlikely(newest_id == EOL)) {
> >> /*
> >> * MB0-write: synchronize adding first descr
> >> *
> >> * context-pair: 1 writer adding the first descriptor via
> >> * add_descr_list(), 1 reader getting the beginning of
> >> * the list via iter_peek_next_id().
> >> *
> >> * This context recently assigned new values for @id,
> >> * @next, @seq. Ensure these are stored before the first
> >> * store to @oldest so that the new values are visible
> >> * to the reader in the pairing MB0-read context.
> >> *
> >> * Note: Before this store, the value of @oldest is EOL.
> >> */
> >
> > My gmail-search foo is unable to locate MB0-read: what am I missing?
> > Also, can you maybe annotate the memory accesses to @id, @next, @seq
> > and @oldest (as I did above)? I find myself guessing their location.
>
> Sorry. The MB0-read is a _new_ comment that would be added to the
> smp_rmb() of the reader functions. I didn't repost everything because I
> just wanted to get a feel if the comments for _this_ function are
> improving. Really all I care about right now is properly documenting
> MB5. It is a good example because MB5 is completely within this
> function. If I can satisfactorily document MB5, then I can post a new
> version with updated comments for everything.

Oh, I see, thanks for this clarification.


>
> >> smp_store_release(&l->oldest, e->id);
> >> } else {
> >> /*
> >> * MB6-write: synchronize linking new descr
> >> *
> >> * context-pair-1: 1 writer adding a descriptor via
> >> * add_descr_list(), 1 writer removing a descriptor via
> >> * remove_oldest_descr().
> >> *
> >> * If this is a recycled descriptor, this context
> >> * recently stored a new @oldest value. Ensure that
> >> * @oldest is stored before storing @next so that
> >> * if the pairing MB6-read context sees a non-EOL
> >> * @next value, it is ensured that it will also see
> >> * an updated @oldest value.
> >> *
> >> * context-pair-2: 1 writer adding a descriptor via
> >> * add_descr_list(), 1 reader iterating the list via
> >> * prb_iter_next_valid_entry().
> >> *
> >> * This context recently assigned new values for @id,
> >> * @next, @seq, @data, @data_next. Ensure these are
> >> * stored before storing @next of the previously
> >> * newest descriptor so that the new values are
> >> * visible to the iterating reader in the pairing
> >> * MB6-read context.
> >> *
> >> * Note: Before this store, the value of @next of the
> >> * previously newest descriptor is EOL.
> >> */
> >
> > Same as above but for MB6-read and the accesses to @id, @next, @seq,
> > @data, @data_next.
> >
> > In conclusion, I have been unable to produce litmus tests by reading
> > your comments (meaning I'm lost).
>
> I feel like I'm stating all the information, but nobody understands it.
> If you can help me to correctly document MB5, I can submit a new version
> with all the memory barriers correctly documented.

IMO (and assuming that what I wrote above makes some sense), (a-b) and
(1-2) above, together with the associated annotations of the code/ops,
provide all the desired and necessary information to document MB5.

For readability purposes, it could be nice to also keep the snippet you
provided above (but let me stress, again, that such a snippet should be
integrated with additional information as suggested above).

As to "where to insert the memory barrier documentation", I really have
no suggestion ATM. I guess someone would split it (say, before A and E)
while others could prefer to keep it within a same inline comment.

Thanks for this information (and for your patience!),

Andrea

2019-07-01 11:49:37

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-28, Peter Zijlstra <[email protected]> wrote:
>> I have implemented and tested these changes. I also added setting the
>> list terminator to this function, since all callers would have to do
>> it anyway. Also, I spent a lot of time trying to put in comments that
>> I think are _understandable_ and _acceptable_.
>>
>> @Peter: I expect they are way too long for you.
>>
>> @Andrea: Is this starting to become something that you would like to
>> see?
>>
>> /**
>> * add_descr_list() - Add a descriptor to the descriptor list.
>> *
>> * @e: An entry that has already reserved data.
>> *
>> * The provided entry contains a pointer to a descriptor that has already
>> * been reserved for this entry. However, the reserved descriptor is not
>> * yet on the list. Add this descriptor as the newest item.
>> *
>> * A descriptor is added in two steps. The first step is to make this
>> * descriptor the newest. The second step is to update @next of the former
>> * newest descriptor to point to this one (or set @oldest to this one if
>> * this will be the first descriptor on the list).
>> */
>
> I still think it might be useful to explicitly call out the data
> structure more. Even if you cannot use a fully abtracted queue.

Agreed. It needs to be clear that the queue management is separate from
the data management.

> Also, newest/oldest just looks weird to me; I'm expecting head/tail.

I will rename it to head/tail.

>>> You have a single linked list going from the tail to the head, while
>>> adding to the head and removing from the tail. And that sounds like
>>> a FIFO queue:
>>
>> Yes, but with one important feature: the nodes in the FIFO queue are
>> labeled with ordered sequence numbers. This is important for
>> printk. I talk more about this below.
>
> But nowhere did/do you say what the actual data structure is, with what
> modification for which reason.

When you dive into the reader code you will see that the sequence
numbers are necessary for readers to recognize that they have missed
records. This means that the sequence numbers must be ordered, and
AFAICT that is only possible if the queue management code is assigning
them.
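
Roughly (a sketch, not the actual reader code):

	/* reader side: detect records lost to overwriting */
	if (entry->seq != last_seq + 1)
		pr_warn("lost %lu records\n",
			entry->seq - (last_seq + 1));
	last_seq = entry->seq;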

>>> /*
>>> * xchg() implies ACQUIRE; and thereby ensures @tail is
>>> * written after @head, see lqueue_pop()'s smp_rmb().
>>> */
>>> if (prev)
>>> WRITE_ONCE(prev->next, n);
>>
>> This needs to be a store_release() so that a reader cannot read @n
>> but the store to @next is not yet visible. The memory barriers of the
>> above xchg() do not apply here because readers never read @head.
>
> Confused, all stores to @n are before the xchg() so the barrier from
> xchg() also order those stores and this store.

Sorry. Yes, you are correct. I was confusing your suggested
implementation with mine. I'm starting to overthink things and confuse
myself about memory barriers. I need to be more careful.

> (Note that the qspinlock has a queue not unlike this, but that again
> doesn't have to bother with NMIs)

Thank you for pointing this out! I will look to qspinlock for some
naming guidelines.

>>> Now, you appear to be using desc_ids instead of pointers, but since
>>> you're not using the actual wrap value; I don't see the benefit of
>>> using those IDs over straight pointers. That is, unless I've
>>> overlooked some subtle ABA issue, but then, your code doesn't seem
>>> to mention that, and I think we're good because if we re-use an
>>> entry, it can never get back in the same location, since we never
>>> allow an empty list
>>
>> I do not understand what you mean here. If a reader has a pointer to
>> an entry, the entry behind that pointer can certainly change. But
>> that isn't a problem. The reader will recognize that.
>
> ABA is where a cmpxchg has a false positive due to values matching but
> not the structure.

Thanks for the clarification. And it reminds me why I chose to use
desc_ids. If we are using pointers instead of desc_ids, assuming we have
a queue with currently only 1 node, the following should cause the ABA
problem. CPU0 is performing an lqueue_push() to add a 2nd node to the
queue.

CPU0 CPU1
---- ----
head = READ_ONCE(h->head);
seq = READ_ONCE(head->seq);
WRITE_ONCE(n->seq, seq + 1);
WRITE_ONCE(n->next, NULL);
lqueue_push();
lqueue_pop();
lqueue_push();
cmpxchg_release(&h->head, head, n);
WRITE_ONCE(head->next, n);

The queue itself will still be intact, but the sequence numbers are now
wrong. For this to happen using desc_ids, the above set of calls from
CPU1 would need to occur "LONG_MAX/desc_max_count" times in that
window. Basically I am using tagged state references with probably >40
bits for the tag (on 64-bit systems).

The effect for readers is that they will see a sequence number that is
less than or equal to the previously seen sequence number. Worthy of a
warning message.

My code needs to mention all this.
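
To make that concrete, a sketch of the tagged-reference idea
(hypothetical array size, not the actual macros):

	#define DESCS_COUNT	(1 << 15)	/* power-of-2 array size */
	#define DESC_INDEX(id)	((id) & (DESCS_COUNT - 1))

	/*
	 * Two ids map to the same descriptor only when they differ
	 * by a multiple of DESCS_COUNT, so on 64-bit the upper ~49
	 * bits of the id act as the ABA tag.
	 */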

>>> That said, the above has cmpxchg() vs WRITE_ONCE() and is therefore
>>> not safe on a number of our architectures. We can either not care
>>> about performance and use xchg() for the ->tail store, or use
>>> atomic_long_t and suffer ugly casting.
>>
>> cmpxchg_release() vs WRITE_ONCE() is not safe?! Can you point me to
>> documentation about this?
>
> Documentation/atomic_t.txt has this, see the SEMANTICS section on
> atomic-set.

Thanks. I overlooked that subtle detail. Can I assume NMIs do not exist
on architectures that need to implement locking for cmpxchg()? Or did I
just hit a major obstacle?

I would prefer to replace the affected WRITE_ONCE() with xchg_relaxed()
(like qspinlock is doing). Or are there some other subtle advantages of
atomic_long_t?

John Ogness

2019-07-01 15:26:44

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Mon, Jul 01, 2019 at 12:39:35PM +0200, John Ogness wrote:

> Thanks. I overlooked that subtle detail. Can I assume NMIs do not exist
> on architectures that need to implement locking for cmpxchg()? Or did I
> just hit a major obstacle?

I think that is a fair assumption, I'm not aware of anybody having NMIs
_and_ 'broken' atomics.

Then again, we also have ARCH_HAVE_NMI_SAFE_CMPXCHG.

2019-07-01 15:27:01

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On Mon, Jul 01, 2019 at 12:39:35PM +0200, John Ogness wrote:
> On 2019-06-28, Peter Zijlstra <[email protected]> wrote:
> > (Note that the qspinlock has a queue not unlike this, but that again
> > doesn't have to bother with NMIs)
>
> Thank you for pointing this out! I will look to qspinlock for some
> naming guidelines.

Fair warning: qspinlock might hurt your brain; but if you do look and
are unsure; let me know and I'll try and improve its comments.

2019-07-02 14:14:39

by John Ogness

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/2] printk-rb: add a new printk ringbuffer implementation

On 2019-06-30, Andrea Parri <[email protected]> wrote:
>> The significant events for 2 contexts that are accessing the same
>> addresses of a descriptor are:
>>
>> P0(struct desc *d0)
>> {
>> // adding a new descriptor d0
>>
>> WRITE_ONCE(d0->next, EOL); // C
>> WRITE_ONCE(d0->seq, X); // D
>> cmpxchg_release(newest, Y, indexof(d0)); // E
>> }
>>
>> P1(struct desc *d1)
>> {
>> // adding a new descriptor d1 that comes after d0
>>
>> struct desc *d0;
>> int r0, r1;
>>
>> r0 = READ_ONCE(newest); // A
>> d0 = &array[r0];
>> r1 = READ_ONCE(d0->seq); // B
>> WRITE_ONCE(d0->next, Z); // F
>> }
>>
>> d0 is the same address for P0 and P1. (The values of EOL, X, Y, Z are
>> unrelated and irrelevant.)
>
> (1) If A reads from E, then B reads from D (or from another store
> to ->seq, not reported in the snippet, which overwrites D)
>
> (2) If A reads from E, then F overwrites C
>
> This, IIUC, for the informal descriptions of the (intended) guarantees.
> Back to the pairings in question: AFAICT,
>
> (a) For (1), we rely on the pairing:
>
> RELEASE from D to E (matching) ADDRESS DEP. from A to B
>
> (b) For (2), we rely on the pairing:
>
> RELEASE from C to E (matching) ADDRESS DEP. from A to F
>
> Does this make sense?

Yes. This is what I needed to see.

> IMO (and assuming that what I wrote above makes some sense), (a-b) and
> (1-2) above, together with the associated annotations of the code/ops,
> provide all the desired and necessary information to document MB5.
>
> For readability purposes, it could be nice to also keep the snippet you
> provided above (but let me stress, again, that such a snippet should be
> integrated with additional information as suggested above).
>
> As to "where to insert the memory barrier documentation", I really have
> no suggestion ATM. I guess someone would split it (say, before A and E)
> while others could prefer to keep it within a same inline comment.

Thank you. This is the level of formalization I've been looking for. I
will rework the comments (and naming) and post a v3. It is probably best
for you to wait until then to look at this again. (And after going
through such formal processes, even _I_ am having difficulties
understanding what some of my memory barriers are supposed to be
synchronizing.)

John Ogness

2019-07-04 10:35:53

by Petr Mladek

[permalink] [raw]
Subject: [PATCH POC] printk_ringbuffer: Alternative implementation of lockless printk ringbuffer

This is a POC that implements the lockless printk ringbuffer in a
slightly different way. I believe that it is worth considering because
it looks much easier to deal with. The reasons are:

+ The state of each entry is always clear.

+ The write access rights and validity of the data
are clear from the state of the entry.

+ It seems that three barriers are enough to synchronize
readers vs. writers. The rest is done implicitly
using atomic operations.

Of course, I might have missed some critical race that can't get
solved by the new design easily. But I do not see it at the moment.

The code compiles but it is not really tested. I wanted to send it
ASAP in a good enough state before we spend more time on cleaning
up either of the proposals.

How it works:

It uses two buffers (data, descriptors) like in John's variant.
The main difference is how the state is handled.

The trick is in the descriptor state "dst" variable included
in struct prb_desc. It consists of 3 values that are manipulated
atomically:

+ sequence number
+ committed flag
+ reuse flag

All the operations rely on the fact that we know what sequence
number we are looking for. From this point of view the descriptor
can be in the following states, see also enum prb_desc_state
and prb_desc_state() function:

+ miss: sequence number is not the expected one
+ reserved: sequence matches, both flags cleared
+ committed: sequence matches, committed flag set
+ reusable: sequence matches, both flags set
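
For example, with the masks defined in the patch below, sequence
number 5 on a 64-bit system would move through (a sketch):

	dst == 5			-> reserved
	dst == 5 | (1UL << 63)		-> committed
	dst == 5 | (3UL << 62)		-> reusable

and any other sequence number stored in dst is a miss.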

Next, the descriptor and data array are crosslinked:

+ descriptor contains: lpos, lpos_next
+ data array contains: seq

The main trickery is in prb_read_desc(). It reads the descriptor
for the given sequence number into a buffer. It returns
an enum prb_desc_state that says which of the values are valid.
The meaning is the following, by descriptor state:

+ miss: nothing is valid
+ reserved: nothing is valid
+ committed: everything is valid (descriptor, data)
+ reusable: descriptor has valid lpos, lpos_next but
              data are no longer valid

The validity is guaranteed by double checking the descriptor
state before and after reading the other values. It might
be less efficient than John's approach. But I find it much
easier to make sure that the consistency model is correct.

Consistency:

1. A writer becomes the owner of the descriptor by pushing rb->seq_newest
forward. From this point, it has exclusive write access to
the descriptor until the committed flag is set.

The descriptor that is being replaced has to be in the reusable
state and the related data must no longer be in the valid
lpos_oldest, lpos_newest range. In other words, nobody needs
the data from the old descriptor any longer.

The descriptor can be moved into reusable state only
when it was in the committed state before (atomic operation).

2. A writer reserves data by pushing rb->lpos_newest. From this
point, it has exclusive access to the data until the committed
flag is set.

Writers have to make enough space for the data by pushing
rb->lpos_oldest before they change lpos_newest. They use
the crosslinked "seq" to find the right descriptors and
set them reusable.

Again, only committed descriptors can be marked reusable.

3. Writers set the committed flag when everything is in place,
that is, when the crosslink and the data are set correctly.

From this point, they must not modify the descriptor or
the data any longer. They might get moved into the reusable
state at any time.

4. Readers have a simple life. They just try to access descriptors
in the seq_oldest, seq_newest range. They try to read data
only when the descriptor is in the committed state. They
double check the state after the data have been read (copied).

5. Finally, I believe that we do not need to be concerned about
overflows of seq or lpos numbers. All operations are done
with interrupts disabled. Therefore they can be interrupted
only by an NMI. And an NMI could not reserve an already
reserved descriptor before it gets committed.

In other words, I do not see how any CPU could rotate
lpos or seq numbers over the entire range while another
CPU is still looking for a reusable descriptor/data.

TODO when there is interest in this variant:

+ add printk_ringbuffer.h and Makefile
+ solve bootstrap (first added message)
+ add iterators
+ fix bugs (off-by-one and other ugly mistakes)

Heavily-based-on: John Ogness <[email protected]>
Signed-off-by: Petr Mladek <[email protected]>
---
lib/Makefile | 1 +
lib/printk_ringbuffer.c | 715 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 716 insertions(+)
create mode 100644 lib/printk_ringbuffer.c

diff --git a/lib/Makefile b/lib/Makefile
index fb7697031a79..150680b02605 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -37,6 +37,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o

+lib-$(CONFIG_PRINTK) += printk_ringbuffer.o
lib-$(CONFIG_PRINTK) += dump_stack.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/printk_ringbuffer.c b/lib/printk_ringbuffer.c
new file mode 100644
index 000000000000..27594806c8ab
--- /dev/null
+++ b/lib/printk_ringbuffer.c
@@ -0,0 +1,715 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/irqflags.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+/**
+ * struct prb_desc - A descriptor representing an entry in the ringbuffer.
+ * @dst: The state of the descriptor and associated data. It includes:
+ * - sequence number
+ * - flag set when the data are committed
+ * - flag set when the data are freed (buffer
+ * could get reused)
+ * @lpos: The logical position of the data for this entry.
+ * The location of the beginning of the data within the data array
+ *		can be determined from this value.
+ * @lpos_next: The logical position of the data next to this entry.
+ * This is used to determine the length of the data as well as
+ * identify where the next data begins.
+ *
+ * Descriptors are used to identify where the data for each entry is and
+ * also provide an ordering for readers. Entry ordering is based on the
+ * descriptor linked list (not the ordering of data in the data array).
+ */
+struct prb_desc {
+ /* private */
+ unsigned long dst;
+ unsigned long lpos;
+ unsigned long lpos_next;
+};
+
+/**
+ * struct printk_ringbuffer - The ringbuffer structure.
+ * @desc:	Descriptors array
+ * @data:	Data array
+ * @desc_size_bits: Size of the descriptors array as a power-of-2
+ * @data_size_bits: Size of the data array as a power-of-2
+ * @seq_oldest: Sequence number of the oldest valid descriptor
+ * @seq_newest: Sequence number of the newest valid descriptor
+ * @lpos_oldest: Logical position of the oldest valid data
+ * @lpos_newest: Logical position right behind the newest data
+ * @lost: Counter tracking how often writers failed to reserve data.
+ */
+struct printk_ringbuffer {
+ /* private */
+ struct prb_desc *desc;
+ char *data;
+
+ unsigned int desc_size_bits;
+ unsigned int data_size_bits;
+
+ unsigned long seq_oldest;
+ unsigned long seq_newest;
+
+ unsigned long lpos_oldest;
+ unsigned long lpos_newest;
+
+ atomic_t lost;
+};
+
+/**
+ * struct prb_reserved_entry - Used by writers to reserve/commit data.
+ * @rb: The printk ringbuffer used for reserve/commit.
+ * @seq: The sequence number of the reserved data.
+ * @irqflags: Local IRQs are disabled during the reserve/commit window.
+ *
+ * A writer provides this structure when reserving and committing data. The
+ * values of all the members are set on reserve and are only valid until
+ * commit.
+ */
+struct prb_reserved_entry {
+ /* private */
+ struct printk_ringbuffer *rb;
+ unsigned long seq;
+ unsigned long irqflags;
+};
+
+/**
+ * struct prb_data_block - A data block.
+ * @seq: Sequence number pointing to the related descriptor.
+ * @data: The data committed by the writer.
+ */
+struct prb_data_block {
+ unsigned long seq;
+ char data[0];
+};
+
+#define PRB_DST_BITS (sizeof(unsigned long) * 8)
+#define PRB_COMMITTED_MASK (1UL << (PRB_DST_BITS - 1))
+#define PRB_REUSE_MASK (1UL << (PRB_DST_BITS - 2))
+#define PRB_FLAGS_MASK (PRB_COMMITTED_MASK | PRB_REUSE_MASK)
+#define PRB_SEQ_MASK (~PRB_FLAGS_MASK)
+
+#define PRB_DESC_SIZE(rb) (1 << rb->desc_size_bits)
+#define PRB_DATA_SIZE(rb) (1 << rb->data_size_bits)
+
+#define PRB_DESC_MASK(rb) (PRB_DESC_SIZE(rb) - 1)
+#define PRB_DATA_MASK(rb) (PRB_DATA_SIZE(rb) - 1)
+
+#define PRB_LPOS_WRAP_CNT(rb, lpos) (lpos & ~PRB_DATA_MASK(rb))
+
+#define SEQ_TO_DESC(rb, seq) \
+ (&rb->desc[seq & PRB_DESC_MASK(rb)])
+#define LPOS_TO_DATAB(rb, lpos) \
+ ((struct prb_data_block *)&rb->data[lpos & PRB_DATA_MASK(rb)])
+
+static bool prb_lpos_in_use(struct printk_ringbuffer *rb,
+ unsigned long lpos)
+{
+ unsigned long lpos_oldest = READ_ONCE(rb->lpos_oldest);
+ unsigned long lpos_newest = READ_ONCE(rb->lpos_newest);
+
+ /*
+ * lpos_newest will be lpos for the next reserved data.
+ * It is right behind the in-use range.
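+	 *
+	 * The unsigned subtractions make the check wrap-safe. A worked
+	 * example with hypothetical values: lpos_oldest == 10 and
+	 * lpos_newest == 14 mean lpos 10..13 are in use; lpos == 12
+	 * gives 2 < 4 (in use), lpos == 14 gives 4 < 4 (not in use).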
+ */
+ return lpos - lpos_oldest < lpos_newest - lpos_oldest;
+}
+
+/*
+ * Return values from prb_desc_state(). They make it easier
+ * to handle the given state using switch() or if conditions.
+ *
+ * It describes what information is valid in the given
+ * descriptor:
+ *
+ * - miss: nothing is valid
+ * - reserved: nothing is valid
+ * - committed: everything is valid (descriptor, data)
+ * - reusable:  descriptor has valid lpos, lpos_next but
+ *              data are no longer valid
+ */
+enum prb_desc_state {
+ desc_miss,
+ desc_reserved,
+ desc_committed,
+ desc_reusable,
+};
+
+/* Return the state of the descriptor according to its life cycle. */
+static enum prb_desc_state
+prb_desc_state(unsigned long dst, unsigned long seq)
+{
+ if (seq != (dst & PRB_SEQ_MASK))
+ return desc_miss;
+
+ if (!(dst & PRB_COMMITTED_MASK))
+ return desc_reserved;
+
+ if (!(dst & PRB_REUSE_MASK))
+ return desc_committed;
+
+ return desc_reusable;
+}
+
+/*
+ * Read descriptor for the given sequence number into the given
+ * struct prb_desc buffer. Read lpos, lpos_next when it makes
+ * sense. Return enum prb_desc_state so that the caller knows what
+ * information is valid.
+ */
+static enum prb_desc_state
+prb_read_desc(struct printk_ringbuffer *rb,
+ unsigned long seq,
+ struct prb_desc *desc_out)
+{
+ struct prb_desc *desc = SEQ_TO_DESC(rb, seq);
+ enum prb_desc_state desc_state;
+
+ desc_out->dst = READ_ONCE(desc->dst);
+ desc_state = prb_desc_state(desc_out->dst, seq);
+
+ if (desc_state == desc_miss || desc_state == desc_reserved)
+ return desc_state;
+
+ /*
+ * Synchronize lpos, lpos_next vs. desc read. The values
+ * are set before PRB_COMMITTED_MASK is committed in prb_commit().
+ */
+ smp_rmb();
+	desc_out->lpos = READ_ONCE(desc->lpos);
+ desc_out->lpos_next = READ_ONCE(desc->lpos_next);
+
+ /*
+	 * Make sure that lpos, lpos_next still belong to this
+	 * sequence number. They might be modified once cmpxchg()
+	 * in prb_reserve_desc() assigns a new sequence number.
+ *
+ * Also make sure that data read before this function is called
+ * are still valid for the sequence number. They are invalidated
+	 * by setting the reuse flag in prb_make_desc_reusable().
+ */
+ smp_rmb();
+ desc_out->dst = READ_ONCE(desc->dst);
+
+ return prb_desc_state(desc_out->dst, seq);
+}
+
+/* Only committed descriptor can be made reusable. */
+static int prb_make_desc_reusable(struct printk_ringbuffer *rb,
+ unsigned long seq)
+{
+ struct prb_desc *desc = SEQ_TO_DESC(rb, seq);
+ unsigned long dst_committed = seq | PRB_COMMITTED_MASK;
+ unsigned long dst_reusable = dst_committed | PRB_REUSE_MASK;
+
+ /*
+	 * A successful exchange also works as a write barrier that
+	 * tells readers that the data are no longer valid.
+ * The related read barrier is in prb_read_desc().
+ */
+ if (cmpxchg(&desc->dst, dst_committed, dst_reusable) == dst_committed)
+ return 0;
+ return -EINVAL;
+}
+
+/*
+ * Mark all conflicting data entries as reusable.
+ *
+ * Fill desc_last with information from the descriptor
+ * related to the last data section.
+ *
+ * This function does not shuffle rb->lpos_oldest. It just
+ * marks the data reusable in the respective descriptors.
+ * Note that lpos_oldest is passed as parameter.
+ *
+ * Return 0 when all descriptors can be marked for reuse.
+ * Return -EBUSY when it fails to find the right descriptor
+ * or when it was not yet committed.
+ */
+static int prb_make_data_reusable(struct printk_ringbuffer *rb,
+ unsigned long lpos_oldest,
+ unsigned long lpos_min_new,
+ struct prb_desc *desc_last)
+{
+ unsigned long lpos = lpos_oldest;
+
+	/* Unsigned wrap-around means that lpos reached the required limit. */
+ while (lpos_min_new - lpos - 1 <= PRB_DATA_SIZE(rb)) {
+ struct prb_data_block *datab = LPOS_TO_DATAB(rb, lpos);
+ unsigned long seq;
+ enum prb_desc_state desc_state;
+
+ seq = READ_ONCE(datab->seq);
+ desc_state = prb_read_desc(rb, seq, desc_last);
+
+ switch (desc_state) {
+ case desc_miss:
+ /*
+ * Mismatching sequence number means that the data
+			 * are freshly reserved but the prb_data_block has not
+			 * been updated with the new seq yet. We might even
+			 * be in the middle of newly written data.
+ */
+ return -EBUSY;
+ case desc_reserved:
+ return -EBUSY;
+ case desc_committed:
+ /*
+ * Make sure that this descriptor really points to this
+			 * lpos. Otherwise, this data block has already been
+			 * rewritten by a parallel writer and we got a valid
+			 * sequence number only by chance.
+ */
+ if (desc_last->lpos != lpos)
+ return -EBUSY;
+
+ /*
+ * Descriptor is committed and can be freed. It is
+ * perfectly fine when a parallel writer is faster.
+ */
+ prb_make_desc_reusable(rb, seq);
+ break;
+ case desc_reusable:
+ /* nope */
+ break;
+		}
+
+		/* Advance behind the data block just processed. */
+		lpos = desc_last->lpos_next;
+ }
+
+ return 0;
+}
+
+/*
+ * Move lpos_oldest to lpos_min_oldest or beyond, where lpos_min_oldest
+ * might be an existing lpos or the minimal lpos needed to get enough
+ * free space.
+ *
+ * Return 0 when succeeded. Return -ENOMEM when some data were
+ * not committed yet and might still be accessed by parallel
+ * writers.
+ */
+static int prb_remove_data_oldest(struct printk_ringbuffer *rb,
+ unsigned long lpos_min_oldest)
+{
+ struct prb_desc desc_last;
+ unsigned long lpos_oldest;
+ int err;
+
+ /*
+ * Try until lpos_oldest is behind lpos_min_oldest.
+ * There might be several entries in between. Parallel
+ * writers might shift lpos_oldest for their needs.
+ */
+ do {
+ lpos_oldest = READ_ONCE(rb->lpos_oldest);
+
+ /* Is the new position actually still used? */
+ if (!prb_lpos_in_use(rb, lpos_min_oldest))
+ return 0;
+
+ /* OK, mark all conflicting data entries as freed. */
+ err = prb_make_data_reusable(rb, lpos_oldest, lpos_min_oldest,
+ &desc_last);
+ if (err) {
+ /*
+ * Data array might point to wrong descriptors
+ * when reused in the meantime. Try again in
+ * this case.
+ */
+ if (lpos_oldest != READ_ONCE(rb->lpos_oldest))
+ continue;
+
+ /*
+ * Bad luck. Some conflicting data have not been
+ * committed yet.
+ */
+ return -ENOMEM;
+ }
+ /*
+	 * We are here only when prb_make_data_reusable() had to process
+	 * at least one descriptor and desc_last.lpos_next contains
+	 * a valid lpos_next.
+ */
+ } while (cmpxchg(&rb->lpos_oldest, lpos_oldest, desc_last.lpos_next) != lpos_oldest);
+
+ return 0;
+}
+
+/*
+ * Must be called with the sequence number of the oldest descriptor.
+ * It is safe when it has already been removed in parallel.
+ *
+ * Return 0 when the descriptor and its associated data have
+ * been freed.
+ *
+ * Returns -ENOMEM when the data were not committed yet or
+ * when lpos_oldest could not get moved because there are
+ * not-yet-committed data from another descriptor on the way.
+ */
+static int prb_remove_desc_oldest(struct printk_ringbuffer *rb,
+ unsigned long seq_oldest)
+{
+ struct prb_desc desc;
+ enum prb_desc_state desc_state;
+ int err;
+
+ desc_state = prb_read_desc(rb, seq_oldest, &desc);
+ switch (desc_state) {
+ /*
+	 * Another seq means that the oldest descriptor has already been
+ * removed and reused. Return success in this case.
+ */
+ case desc_miss:
+ return 0;
+ /* Bad luck when still reserved but not yet committed. */
+ case desc_reserved:
+ return -ENOMEM;
+ case desc_committed:
+ /*
+ * It was committed => either we or another parallel
+ * writer marks it reusable. Anything is fine.
+ */
+ prb_make_desc_reusable(rb, seq_oldest);
+ /* fall through */
+ case desc_reusable:
+ /*
+		 * Might fail when there are other uncommitted data
+ * between lpos_oldest and desc.lpos_next.
+ */
+ err = prb_remove_data_oldest(rb, desc.lpos_next);
+ if (err)
+ return err;
+ break;
+ }
+
+ /* It does not matter who removed the oldest desc */
+ cmpxchg(&rb->seq_oldest, seq_oldest, seq_oldest + 1);
+ return 0;
+}
+
+/*
+ * Get exclusive write access to a descriptor together with
+ * a new sequence number.
+ *
+ * First, remove the conflicting descriptor and related data
+ * from the active ranges by pushing seq_oldest and lpos_oldest
+ * forward.
+ */
+static int prb_reserve_desc(struct prb_reserved_entry *entry)
+{
+ unsigned long seq, seq_newest, seq_prev_wrap;
+ struct printk_ringbuffer *rb = entry->rb;
+ struct prb_desc *desc;
+ int err;
+
+ /* Get descriptor for the next sequence number. */
+ do {
+ seq_newest = READ_ONCE(rb->seq_newest);
+ seq = (seq_newest + 1) & PRB_SEQ_MASK;
+ seq_prev_wrap = (seq - PRB_DESC_SIZE(rb)) & PRB_SEQ_MASK;
+
+ /*
+ * Remove conflicting descriptor from the previous wrap
+ * if ever used. It might fail when the related data
+ * have not been committed yet.
+ */
+ if (seq_prev_wrap == READ_ONCE(rb->seq_oldest)) {
+ err = prb_remove_desc_oldest(rb, seq_prev_wrap);
+ if (err)
+ return err;
+ }
+ } while (cmpxchg(&rb->seq_newest, seq_newest, seq) != seq_newest);
+
+ /*
+ * The descriptor is ours until the COMMITTED bit is set.
+ * Set its sequence number with all flags cleared.
+ */
+ desc = SEQ_TO_DESC(rb, seq);
+ WRITE_ONCE(desc->dst, seq);
+
+ /*
+ * Make sure that anyone sees the new dst/seq before
+ * lpos values and data are manipulated. It is related
+	 * to the read barrier in prb_read_desc().
+ */
+ smp_wmb();
+
+ entry->seq = seq;
+ return 0;
+}
+
+/*
+ * Return lpos_next for the given lpos and size of data.
+ * Never wrap the data.
+ */
+static unsigned long
+prb_get_lpos_next(struct printk_ringbuffer *rb,
+ unsigned long lpos,
+ unsigned int size)
+{
+ unsigned long lpos_idx = lpos & PRB_DATA_MASK(rb);
+
+ /*
+ * Wrap lpos when there is not enough space at the end of the buffer.
+ * Reserve space for extra prb_data_block when the next data block
+ * gets wrapped.
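+	 *
+	 * Worked example (hypothetical sizes): with a 64-byte data
+	 * array, lpos_idx == 48 and size == 16 give
+	 * 48 + 16 + sizeof(struct prb_data_block) >= 64, so lpos
+	 * jumps to the beginning of the next wrap.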
+ */
+ if (lpos_idx + size + sizeof(struct prb_data_block) >= PRB_DATA_SIZE(rb)) {
+ lpos = PRB_LPOS_WRAP_CNT(rb, lpos) + PRB_DATA_SIZE(rb);
+ }
+
+ return lpos + size;
+}
+
+/*
+ * Try to make the needed space but do not reserve it. The function can
+ * be used to check whether there is a chance to reserve the space.
+ */
+static int prb_make_space(struct printk_ringbuffer *rb,
+ unsigned int size)
+{
+ unsigned long lpos, lpos_next;
+
+ lpos = READ_ONCE(rb->lpos_newest);
+ lpos_next = prb_get_lpos_next(rb, lpos, size);
+
+ /*
+ * Move lpos_oldest behind the conflicting lpos. Bad luck
+	 * when some conflicting data have not been committed yet.
+ */
+ return prb_remove_data_oldest(rb, lpos_next - PRB_DATA_SIZE(rb));
+}
+
+/*
+ * Reserve a range of logical positions in the data array.
+ *
+ * First, mark all conflicting data entries and related
+ * descriptors reusable. Then move lpos_oldest and
+ * lpos_newest as needed.
+ */
+static int prb_reserve_data(struct prb_reserved_entry *entry,
+ unsigned int size)
+{
+ struct printk_ringbuffer *rb = entry->rb;
+ struct prb_desc *desc = SEQ_TO_DESC(rb, entry->seq);
+ unsigned long lpos, lpos_next;
+ struct prb_data_block *datab;
+ int err;
+
+ do {
+ lpos = READ_ONCE(rb->lpos_newest);
+ lpos_next = prb_get_lpos_next(rb, lpos, size);
+
+ /*
+ * Move lpos_oldest behind the conflicting lpos. Bad luck
+		 * when some conflicting data have not been committed yet.
+ */
+ err = prb_remove_data_oldest(rb, lpos_next - PRB_DATA_SIZE(rb));
+ if (err)
+ return err;
+ } while (cmpxchg(&rb->lpos_newest, lpos, lpos_next) != lpos);
+
+ /*
+ * Data range is reserved. Cross link the data block and descriptor.
+ *
+ * Use the original lpos_newest. The data might be stored into
+ * a wrapped lpos but it can be detected and computed anytime
+	 * later, see prb_lpos_data().
+ */
+ datab = LPOS_TO_DATAB(rb, lpos);
+ WRITE_ONCE(datab->seq, entry->seq);
+ desc->lpos = lpos;
+ desc->lpos_next = lpos_next;
+ /*
+ * No barrier is necessary here. Nobody will believe the reserved data
+ * and meta information until they are committed. Nobody could
+ * manipulate the descriptor until the data are committed and freed.
+ *
+ * Any mistake will get detected because the number of active
+ * descriptors is limited by the array. The committed and freed
+ * flags are always manipulated atomically with the sequence
+ * number. An overflow of sequence numbers is not realistic.
+ */
+ return 0;
+}
+
+/*
+ * This function is used when it was not possible to reserve
+ * data for the given descriptor. Allow this descriptor to be
+ * reused at the next possible occasion.
+ *
+ * The function is called when the caller has exclusive write
+ * access to the descriptor.
+ */
+static void prb_make_desc_unused(struct printk_ringbuffer *rb,
+ unsigned long seq)
+{
+ /* Use some currently used lpos that will be freed early. */
+ unsigned long lpos_oldest = READ_ONCE(rb->lpos_oldest);
+ struct prb_desc *desc = SEQ_TO_DESC(rb, seq);
+
+ desc->lpos = lpos_oldest;
+ desc->lpos_next = lpos_oldest;
+
+ /*
+	 * Make sure that prb_read_desc() sees valid lpos, lpos_next
+ * when the committed flag is set.
+ */
+ smp_wmb();
+ desc->dst |= PRB_COMMITTED_MASK;
+}
+
+/*
+ * Return the lpos of the data block where the data are written.
+ *
+ * It is the lpos of the beginning of the data array in the
+ * next wrap when desc->lpos and desc->lpos_next are from
+ * different wraps.
+ *
+ * Note that the struct prb_data_block with the crosslinked seq
+ * is written at the original desc->lpos.
+ */
+static unsigned long prb_lpos_data(struct printk_ringbuffer *rb,
+ struct prb_desc *desc)
+{
+ if (PRB_LPOS_WRAP_CNT(rb, desc->lpos) !=
+ PRB_LPOS_WRAP_CNT(rb, desc->lpos_next))
+ return PRB_LPOS_WRAP_CNT(rb, desc->lpos_next);
+
+ return desc->lpos;
+}
+
+/*
+ * Return pointer to the data for the given descriptor.
+ *
+ * The caller is responsible for passing a struct prb_desc
+ * that can't be modified in the meantime.
+ */
+static char *prb_data(struct printk_ringbuffer *rb,
+ struct prb_desc *desc)
+{
+ struct prb_data_block *datab;
+ unsigned long lpos_data;
+
+ lpos_data = prb_lpos_data(rb, desc);
+ datab = LPOS_TO_DATAB(rb, lpos_data);
+
+ return datab->data;
+}
+
+/*
+ * Return size of the reserved data buffer.
+ *
+ * The caller is responsible for passing a struct prb_desc
+ * that can't be modified in the meantime.
+ */
+static unsigned int prb_data_size(struct printk_ringbuffer *rb,
+ struct prb_desc *desc)
+{
+ char *data, *data_next;
+
+ /* Need to compare pointer to the data inside prb_data_block. */
+ data = prb_data(rb, desc);
+ data_next = rb->data + (desc->lpos_next & PRB_DATA_MASK(rb));
+
+ return data_next - data;
+}
+
+char *prb_reserve(struct prb_reserved_entry *entry,
+ struct printk_ringbuffer *rb,
+ unsigned int size)
+{
+ int err;
+
+ entry->rb = rb;
+
+ size += sizeof(struct prb_data_block);
+ size = ALIGN(size, sizeof(unsigned long));
+ /*
+ * We need one more prb_data_block when wrapping. Require small
+	 * enough entries so that we do not need to worry about such
+	 * details and other corner cases.
+ */
+ if (size > (PRB_DATA_SIZE(rb) / 4)) {
+ atomic_inc(&rb->lost);
+ return NULL;
+ }
+
+ local_irq_save(entry->irqflags);
+
+ /*
+ * Reserve descriptor only when there is a chance
+ * to reserve enough space in the data array.
+ */
+ err = prb_make_space(rb, size);
+ if (err)
+ goto err;
+
+ err = prb_reserve_desc(entry);
+ if (err)
+ goto err;
+
+ err = prb_reserve_data(entry, size);
+ if (err) {
+ prb_make_desc_unused(rb, entry->seq);
+ goto err;
+ }
+
+ /*
+ * Nobody could manipulate the reserved descriptor
+ * until the committed flag is set.
+ */
+ return prb_data(rb, SEQ_TO_DESC(rb, entry->seq));
+
+err:
+ atomic_inc(&rb->lost);
+ local_irq_restore(entry->irqflags);
+ return NULL;
+}
+
+void prb_commit(struct prb_reserved_entry *entry)
+{
+ struct prb_desc *desc = SEQ_TO_DESC(entry->rb, entry->seq);
+
+ /*
+	 * Make sure that the data, including the crosslink between
+	 * the descriptor and the data block, are written before
+	 * the committed flag is set.
+	 *
+	 * The respective read barrier is in prb_read_desc().
+ */
+ smp_wmb();
+
+ desc->dst |= PRB_COMMITTED_MASK;
+
+ local_irq_restore(entry->irqflags);
+}
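+
+/*
+ * Writer usage sketch (illustration only, not part of the
+ * proposed API surface):
+ *
+ *	struct prb_reserved_entry e;
+ *	char *text;
+ *
+ *	text = prb_reserve(&e, &rb, len);
+ *	if (text) {
+ *		memcpy(text, msg, len);
+ *		prb_commit(&e);
+ *	}
+ */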
+
+int prb_read(struct printk_ringbuffer *rb,
+ unsigned long seq,
+ char *buf,
+ unsigned int size)
+{
+ struct prb_desc desc;
+ enum prb_desc_state desc_state;
+ char *data;
+ unsigned int data_size;
+
+ desc_state = prb_read_desc(rb, seq, &desc);
+ if (desc_state != desc_committed)
+ return -EFAULT;
+
+ data = prb_data(rb, &desc);
+ data_size = prb_data_size(rb, &desc);
+
+ if (size > data_size)
+ size = data_size;
+
+	memcpy(buf, data, size);
+
+ desc_state = prb_read_desc(rb, seq, &desc);
+ if (desc_state != desc_committed)
+ return -EFAULT;
+
+ return size;
+}
--
2.16.4

2019-07-04 15:04:49

by John Ogness

[permalink] [raw]
Subject: Re: [PATCH POC] printk_ringbuffer: Alternative implementation of lockless printk ringbuffer

Hi Petr,

On 2019-07-04, Petr Mladek <[email protected]> wrote:
> This is POC that implements the lockless printk ringbuffer slightly
> different way. I believe that it is worth considering because it looks
> much easier to deal with. The reasons are:
>
> + The state of each entry is always clear.
>
> + The write access rights and validity of the data
> are clear from the state of the entry.
>
> + It seems that three barriers are enough to synchronize
> readers vs. writers. The rest is done implicitly
> using atomic operations.
>
> Of course, I might have missed some critical race that can't get
> solved by the new design easily. But I do not see it at the moment.

Two things jump out at me when looking at the implementation:

1. The code claims that the cmpxchg(seq_newest) in prb_reserve_desc()
guarantees that "The descriptor is ours until the COMMITTED bit is set."
This is not true if in that window seq_newest wraps, allowing another
writer to gain ownership of the same descriptor. For small descriptor
arrays (such as in my test module), this situation is quite easy to
reproduce.

This was one of the reasons I chose to use a linked list. When the
descriptor is atomically removed from the linked list, it can _never_ be
used (or even seen) by any other writer until the owning writer is done
with it.

I'm not yet sure how that could be fixed with this implementation. The
state information is in a separate variable from the head pointer for
the descriptor array (seq_newest). This means you cannot atomically
reserve descriptors.

2. Another issue is when prb_reserve() fails and sets the descriptor as
unused. As it is now, no reader can read beyond that descriptor until it
is recycled. Readers need to know that the descriptor is bad and can be
skipped over. It might be better to handle this the way I did: go ahead
and set the state to committed, but have invalid lpos/lpos_next values
(for example: lpos_next=lpos) so the reader knows it can skip over the
descriptor.

> The code compiles but it is not really tested. I wanted to send it
> ASAP in a good enough state before we spend more time on cleaning
> up either of the proposals.

I am glad to see you put together your implementation. If anything, it
shows you understand the design! If after seeing my next version (v3)
you are still convinced that using a linked list for the descriptors is
too complex, then I can help support your idea to move to an array.

Thank you for taking so much time for this!

John Ogness

2019-07-08 16:06:00

by Petr Mladek

[permalink] [raw]
Subject: Re: [PATCH POC] printk_ringbuffer: Alternative implementation of lockless printk ringbuffer

On Thu 2019-07-04 16:59:54, John Ogness wrote:
> Hi Petr,
>
> On 2019-07-04, Petr Mladek <[email protected]> wrote:
> > This is POC that implements the lockless printk ringbuffer slightly
> > different way. I believe that it is worth considering because it looks
> > much easier to deal with. The reasons are:
> >
> > + The state of each entry is always clear.
> >
> > + The write access rights and validity of the data
> > are clear from the state of the entry.
> >
> > + It seems that three barriers are enough to synchronize
> > readers vs. writers. The rest is done implicitly
> > using atomic operations.
> >
> > Of course, I might have missed some critical race that can't get
> > solved by the new design easily. But I do not see it at the moment.
>
> Two things jump out at me when looking at the implementation:
>
> 1. The code claims that the cmpxchg(seq_newest) in prb_reserve_desc()
> guarantees that "The descriptor is ours until the COMMITTED bit is set."
> This is not true if in that wind seq_newest wraps, allowing another
> writer to gain ownership of the same descriptor. For small descriptor
> arrays (such as in my test module), this situation is quite easy to
> reproduce.

I am not sure that I fully understand the problem. seq_newest
wraps at 2^30 (32-bit) and at 2^62 (64-bit). It takes a while
to reuse an existing one. And it does not depend on the size
of the array.

In addition, a new sequence number can get assigned only when
the descriptor with the conflicting (sharing the same struct
prb_desc) sequence number is in the reusable state. It means
that it has to have been committed before.

> This was one of the reasons I chose to use a linked list. When the
> descriptor is atomically removed from the linked list, it can _never_ be
> used (or even seen) by any other writer until the owning writer is done
> with it.
>
> I'm not yet sure how that could be fixed with this implementation. The
> state information is in a separate variable than the head pointer for
> the descriptor array (seq_newest). This means you cannot atomically
> reserve descriptors.

In my implementation, the sequence numbers are atomically reserved
in prb_reserve_desc() by

} while (cmpxchg(&rb->seq_newest, seq_newest, seq) != seq_newest);

where seq is always seq_newest + 1. We are here only when the
conflicting seq from the previous wrap is in reusable state
and the related datablock is moved outside valid lpos range.
This is ensured by prb_remove_desc_oldest(rb, seq_prev_wrap).

Now, the CPU that succeeded with cmpxchg() becomes
the exclusive owner of the respective descriptor. The sequence
number is written into this descriptor _after_ cmpxchg() succeeded.

It is safe because:

+ previous values are no longer used (the descriptor has been
marked as reusable and lpos of the related data block was
moved outside the valid range (lpos_oldest, lpos_newest)).

+ new values are ignored by readers and other writers until
the right sequence number and the committed flag are set
in the descriptor.


> 2. Another issue is when prb_reserve() fails and sets the descriptor as
> unused. As it is now, no reader can read beyond that descriptor until it
> is recycled. Readers need to know that the descriptor is bad and can be
> skipped over. It might be better to handle this the way I did: go ahead
> and set the state to committed, but have invalid lpos/lpos_next values
> (for example: lpos_next=lpos) so the reader knows it can skip over the
> descriptor.

This is exactly what the code does, see prb_make_desc_unused().
It marks the descriptor as committed so that it can get reused.
And it sets lpos and lpos_next to the same value so that
the situation can eventually get detected by readers.


> > The code compiles but it is not really tested. I wanted to send it
> > ASAP in a good enough state before we spend more time on cleaning
> > up either of the proposals.
>
> I am glad to see you put together your implementation. If anything, it
> shows you understand the design! If after seeing my next version (v3)
> you are still convinced that using a linked list for the descriptors is
> too complex, then I can help support your idea to move to an array.

I am definitely interested in seeing v3 of your approach. I believe
that it will be much easier to understand. Then it will be easier to
compare.


BTW: There is one potential problem with my alternative approach.

The descriptors and the related data blocks might get reserved
in a different order. Now, a descriptor might get reused only
when the related data block is moved outside the valid range.
This operation might also move other data blocks outside
the range and invalidate descriptors that were reserved later.
As a result we might need to invalidate more messages in
the log buffer than would be really necessary.

If I understand it properly, this problem does not exist with
the implementation using links. It is because the descriptors
are linked in the same order as the reserved data blocks.

I am not sure how big the problem, with more invalidated messages,
would be in reality. I am not sure if it would be worth
a more complicated implementation.

An alternative solution would be to somehow mix tricks from
both approaches and get something that is easier to deal
with and has fewer drawbacks. I am not sure it is possible.
Anyway, I still need to fully understand the linked approach
first.

Best Regards,
Petr

2019-07-09 01:35:28

by John Ogness

[permalink] [raw]
Subject: Re: [PATCH POC] printk_ringbuffer: Alternative implementation of lockless printk ringbuffer

On 2019-07-08, Petr Mladek <[email protected]> wrote:
>>> This is POC that implements the lockless printk ringbuffer slightly
>>> different way. I believe that it is worth considering because it looks
>>> much easier to deal with. The reasons are:
>>>
>>> + The state of each entry is always clear.
>>>
>>> + The write access rights and validity of the data
>>> are clear from the state of the entry.
>>>
>>> + It seems that three barriers are enough to synchronize
>>> readers vs. writers. The rest is done implicitly
>>> using atomic operations.
>>>
>>> Of course, I might have missed some critical race that can't get
>>> solved by the new design easily. But I do not see it at the moment.
>>
>> Two things jump out at me when looking at the implementation:
>>
>> 1. The code claims that the cmpxchg(seq_newest) in prb_reserve_desc()
>> guarantees that "The descriptor is ours until the COMMITTED bit is
>> set." This is not true if in that wind seq_newest wraps, allowing
>> another writer to gain ownership of the same descriptor. For small
>> descriptor arrays (such as in my test module), this situation is
>> quite easy to reproduce.
>
> I am not sure that I fully understand the problem. seq_newest
> wraps at 2^30 (32-bit) and at 2^62 (64-bit). It takes a while
> to reuse an existing one. And it does not depend on the size
> of the array.

I am not referring to unsigned long overflowing. I am referring to array
index wrapping. This _does_ depend on the size of the array.
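
To make the wrap concrete: with a power-of-2 descriptor array, two
sequence numbers that are one array generation apart select the same
slot. A sketch of the mapping (the exact macro shape is an assumption
here, for illustration only):

	/* e.g. with 8 descriptors, seq 16 and seq 24 both map
	 * to descs[0] */
	#define SEQ_TO_DESC(rb, seq) \
		(&(rb)->descs[(seq) & (PRB_DESC_SIZE(rb) - 1)])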

> In addition, a new sequence number can get assigned only when
> the descriptor with the conflicting (sharing the same struct
> prb_desc) sequence number is in the reusable state. It means
> that it has to be committed first.

Correct. But taking it _out_ of the reusable state is not atomic, which
opens the window I am referring to.

>> This was one of the reasons I chose to use a linked list. When the
>> descriptor is atomically removed from the linked list, it can _never_ be
>> used (or even seen) by any other writer until the owning writer is done
>> with it.
>>
>> I'm not yet sure how that could be fixed with this implementation. The
>> state information is in a separate variable from the head pointer for
>> the descriptor array (seq_newest). This means you cannot atomically
>> reserve descriptors.
>
> In my implementation, the sequence numbers are atomically reserved
> in prb_reserve_desc() by
>
> } while (cmpxchg(&rb->seq_newest, seq_newest, seq) != seq_newest);
>
> where seq is always seq_newest + 1. We are here only when the
> conflicting seq from the previous wrap is in the reusable state
> and the related datablock has been moved outside the valid lpos range.
> This is ensured by prb_remove_desc_oldest(rb, seq_prev_wrap).
>
> Now, the CPU that succeeded with cmpxchg() becomes
> the exclusive owner of the respective descriptor. The sequence
> number is written into this descriptor _after_ cmpxchg() succeeded.
>
> It is safe because:
>
> + the previous values are no longer used (the descriptor has been
> marked as reusable, and the lpos values from the related datablock
> were moved outside the valid range (lpos_oldest, lpos_newest)).
>
> + the new values are ignored by readers and other writers until
> the right sequence number and the committed flag are set
> in the descriptor.

Let me inline the function we are talking about and add commentary to
illustrate what I am saying:

static int prb_reserve_desc(struct prb_reserved_entry *entry)
{
	unsigned long seq, seq_newest, seq_prev_wrap;
	struct printk_ringbuffer *rb = entry->rb;
	struct prb_desc *desc;
	int err;

	/* Get descriptor for the next sequence number. */
	do {
		seq_newest = READ_ONCE(rb->seq_newest);
		seq = (seq_newest + 1) & PRB_SEQ_MASK;
		seq_prev_wrap = (seq - PRB_DESC_SIZE(rb)) & PRB_SEQ_MASK;

		/*
		 * Remove conflicting descriptor from the previous wrap
		 * if ever used. It might fail when the related data
		 * have not been committed yet.
		 */
		if (seq_prev_wrap == READ_ONCE(rb->seq_oldest)) {
			err = prb_remove_desc_oldest(rb, seq_prev_wrap);
			if (err)
				return err;
		}
	} while (cmpxchg(&rb->seq_newest, seq_newest, seq) != seq_newest);

I am referring to this point in the code, after the
cmpxchg(). seq_newest has been incremented but the descriptor is still
in the unused state and seq is still 1 wrap behind. If an NMI occurs
here and the NMI (or some other CPU) inserts enough entries to wrap the
descriptor array, this descriptor will be reserved again, even though it
has already been reserved.

	/*
	 * The descriptor is ours until the COMMITTED bit is set.
	 * Set its sequence number with all flags cleared.
	 */
	desc = SEQ_TO_DESC(rb, seq);
	WRITE_ONCE(desc->dst, seq);

	/*
	 * Make sure that anyone sees the new dst/seq before
	 * lpos values and data are manipulated. It is related
	 * to the read barrier in prb_read_desc().
	 */
	smp_wmb();

*Now* the descriptor is ours. Not before. And it is only exclusively
ours if the above-mentioned situation doesn't occur.

This window doesn't exist with the list approach because reserving a
descriptor simply involves removing the tail (oldest) of the committed
list, which is an atomic operation.

	entry->seq = seq;
	return 0;
}
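
To illustrate the contrast, here is roughly what claiming the oldest
committed descriptor looks like in the list approach (a simplified
sketch using the numlist names, ignoring the expiry and busy-reader
handling):

	unsigned long tail_id, next_id;
	struct nl_node *n;

	do {
		tail_id = atomic_long_read(&nl->tail_id);
		n = nl->node(tail_id, nl->node_arg);
		next_id = READ_ONCE(n->next_id);
	} while (atomic_long_cmpxchg(&nl->tail_id,
				     tail_id, next_id) != tail_id);

	/*
	 * The old tail node is now unreachable from the list. No
	 * other writer can see or reserve it until it is pushed
	 * back onto the list.
	 */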

>> 2. Another issue is when prb_reserve() fails and sets the descriptor
>> as unused. As it is now, no reader can read beyond that descriptor
>> until it is recycled. Readers need to know that the descriptor is bad
>> and can be skipped over. It might be better to handle this the way I
>> did: go ahead and set the state to committed, but have invalid
>> lpos/lpos_next values (for example: lpos_next=lpos) so the reader
>> knows it can skip over the descriptor.
>
> This is exactly what the code does, see prb_make_desc_unused().
> It marks the descriptor as committed so that it can get reused.
> And it sets lpos and lpos_next to the same value so that
> the situation eventually gets detected by readers.

Indeed. Sorry for the noise.

> BTW: There is one potential problem with my alternative approach.
>
> The descriptors and the related data blocks might get reserved
> in a different order. Now, a descriptor might get reused only
> when the related datablock is moved outside the valid range.
> This operation might also move other data blocks outside
> the range and invalidate descriptors that were reserved later.
> As a result we might need to invalidate more messages in
> the log buffer than would really be necessary.
>
> If I understand it properly, this problem does not exist with the
> implementation using links. It is because the descriptors are
> linked in the same order as the reserved data blocks.

Descriptors in the committed list are ordered in commit order (not the
reserve order). However, if there are not enough descriptors
(i.e. avgdatabits is higher than the true average) this problem exists
with the list approach as well.

> I am not sure how big a problem the extra invalidated messages
> would be in reality. I am not sure if it would be worth
> a more complicated implementation.

I am also not sure how big the problem is in a practical sense. However
to help avoid this issue, I will increase the descbits:avgdatabits ratio
for v3.

> Anyway, I still need to fully understand the linked approach
> first.

You may want to wait for v3. I've now split the ringbuffer into multiple
generic data structures (as unintentionally suggested[0] by PeterZ),
which helps to clarify the role of each data structure and also isolates
the memory barriers so that it is clear which data structure requires
which memory barriers.

John Ogness

[0] https://lkml.kernel.org/r/[email protected]

2019-07-09 09:11:50

by Petr Mladek

[permalink] [raw]
Subject: Re: [PATCH POC] printk_ringbuffer: Alternative implementation of lockless printk ringbuffer

On Tue 2019-07-09 03:34:43, John Ogness wrote:
> On 2019-07-08, Petr Mladek <[email protected]> wrote:
> >> 1. The code claims that the cmpxchg(seq_newest) in prb_reserve_desc()
> >> guarantees that "The descriptor is ours until the COMMITTED bit is
> >> set." This is not true if in that wind seq_newest wraps, allowing
> >> another writer to gain ownership of the same descriptor. For small
> >> descriptor arrays (such as in my test module), this situation is
> >> quite easy to reproduce.
> >
> Let me inline the function we are talking about and add commentary to
> illustrate what I am saying:
>
> static int prb_reserve_desc(struct prb_reserved_entry *entry)
> {
> unsigned long seq, seq_newest, seq_prev_wrap;
> struct printk_ringbuffer *rb = entry->rb;
> struct prb_desc *desc;
> int err;
>
> /* Get descriptor for the next sequence number. */
> do {
> seq_newest = READ_ONCE(rb->seq_newest);
> seq = (seq_newest + 1) & PRB_SEQ_MASK;
> seq_prev_wrap = (seq - PRB_DESC_SIZE(rb)) & PRB_SEQ_MASK;
>
> /*
> * Remove conflicting descriptor from the previous wrap
> * if ever used. It might fail when the related data
> * have not been committed yet.
> */
> if (seq_prev_wrap == READ_ONCE(rb->seq_oldest)) {
> err = prb_remove_desc_oldest(rb, seq_prev_wrap);
> if (err)
> return err;
> }
> } while (cmpxchg(&rb->seq_newest, seq_newest, seq) != seq_newest);
>
> I am referring to this point in the code, after the
> cmpxchg(). seq_newest has been incremented but the descriptor is still
> in the unused state and seq is still 1 wrap behind. If an NMI occurs
> here and the NMI (or some other CPU) inserts enough entries to wrap the
> descriptor array, this descriptor will be reserved again, even though it
> has already been reserved.

Not really, the NMI will not reach the cmpxchg() in this case.
prb_remove_desc_oldest() will return an error. It will not
be able to remove the conflicting descriptor because it will
still be occupied by a two-wraps-old descriptor.

BTW: I did run into these problems in some early variants. But
everything started working at some point. I always looked at how
you solved a particular situation in the link-based approach. Then I
somehow translated it into the pure-array variant.


> > BTW: There is one potential problem with my alternative approach.
> >
> > The descriptors and the related data blocks might get reserved
> > in a different order. Now, a descriptor might get reused only
> > when the related datablock is moved outside the valid range.
> > This operation might also move other data blocks outside
> > the range and invalidate descriptors that were reserved later.
> > As a result we might need to invalidate more messages in
> > the log buffer than would really be necessary.
> >
> > If I understand it properly, this problem does not exist with the
> > implementation using links. It is because the descriptors are
> > linked in the same order as the reserved data blocks.
>
> Descriptors in the committed list are ordered in commit order (not the
> reserve order). However, if there are not enough descriptors
> (i.e. avgdatabits is higher than the true average) this problem exists
> with the list approach as well.

Thanks for the explanation.

> > I am not sure how big a problem the extra invalidated messages
> > would be in reality. I am not sure if it would be worth
> > a more complicated implementation.
>
> I am also not sure how big the problem is in a practical sense. However
> to help avoid this issue, I will increase the descbits:avgdatabits ratio
> for v3.

Yup, it sounds reasonable. The number of reserved but not committed
descriptors is basically limited by the number of CPUs.

The only unknown variable is the length of the messages. Which brings
up another problem: we might need a solution for continuation lines.
People would want it. Also, storing one line across many entries would
be quite inefficient. But let's discuss this later when
printk() gets converted to the lockless ring buffer.


> > Anyway, I still need to fully understand the linked approach
> > first.
>
> You may want to wait for v3. I've now split the ringbuffer into multiple
> generic data structures (as unintentionally suggested[0] by PeterZ),
> which helps to clarify the role of each data structure and also isolates
> the memory barriers so that it is clear which data structure requires
> which memory barriers.

Sure, I am interested to see v3.

Best Regards,
Petr

2019-07-09 10:23:20

by John Ogness

[permalink] [raw]
Subject: Re: [PATCH POC] printk_ringbuffer: Alternative implementation of lockless printk ringbuffer

On 2019-07-09, Petr Mladek <[email protected]> wrote:
>>>> 1. The code claims that the cmpxchg(seq_newest) in
>>>> prb_reserve_desc() guarantees that "The descriptor is ours until
>>>> the COMMITTED bit is set." This is not true if in that window
>>>> seq_newest wraps, allowing another writer to gain ownership of the
>>>> same descriptor. For small descriptor arrays (such as in my test
>>>> module), this situation is quite easy to reproduce.
>>>
>> Let me inline the function we are talking about and add commentary to
>> illustrate what I am saying:
>>
>> static int prb_reserve_desc(struct prb_reserved_entry *entry)
>> {
>> unsigned long seq, seq_newest, seq_prev_wrap;
>> struct printk_ringbuffer *rb = entry->rb;
>> struct prb_desc *desc;
>> int err;
>>
>> /* Get descriptor for the next sequence number. */
>> do {
>> seq_newest = READ_ONCE(rb->seq_newest);
>> seq = (seq_newest + 1) & PRB_SEQ_MASK;
>> seq_prev_wrap = (seq - PRB_DESC_SIZE(rb)) & PRB_SEQ_MASK;
>>
>> /*
>> * Remove conflicting descriptor from the previous wrap
>> * if ever used. It might fail when the related data
>> * have not been committed yet.
>> */
>> if (seq_prev_wrap == READ_ONCE(rb->seq_oldest)) {
>> err = prb_remove_desc_oldest(rb, seq_prev_wrap);
>> if (err)
>> return err;
>> }
>> } while (cmpxchg(&rb->seq_newest, seq_newest, seq) != seq_newest);
>>
>> I am referring to this point in the code, after the
>> cmpxchg(). seq_newest has been incremented but the descriptor is
>> still in the unused state and seq is still 1 wrap behind. If an NMI
>> occurs here and the NMI (or some other CPU) inserts enough entries to
>> wrap the descriptor array, this descriptor will be reserved again,
>> even though it has already been reserved.
>
> Not really, the NMI will not reach the cmpxchg() in this case.
> prb_remove_desc_oldest() will return an error.

Why will prb_remove_desc_oldest() fail? IIUC, it will return success
because the descriptor is in the desc_miss state.

> It will not be able to remove the conflicting descriptor because
> it will still be occupied by a two-wraps-old descriptor.

Please explain why in more detail. Perhaps provide a function call
chain? Sorry if I'm missing the obvious here.

This is really the critical point that drove me to use lists: multiple
writers expiring and reserving the same descriptors.

John Ogness

2019-07-09 11:59:37

by Petr Mladek

[permalink] [raw]
Subject: Re: [PATCH POC] printk_ringbuffer: Alternative implementation of lockless printk ringbuffer

On Tue 2019-07-09 12:21:01, John Ogness wrote:
> On 2019-07-09, Petr Mladek <[email protected]> wrote:
> >>>> 1. The code claims that the cmpxchg(seq_newest) in
> >>>> prb_reserve_desc() guarantees that "The descriptor is ours until
> >>>> the COMMITTED bit is set." This is not true if in that window
> >>>> seq_newest wraps, allowing another writer to gain ownership of the
> >>>> same descriptor. For small descriptor arrays (such as in my test
> >>>> module), this situation is quite easy to reproduce.
> >>>
> >> Let me inline the function we are talking about and add commentary to
> >> illustrate what I am saying:
> >>
> >> static int prb_reserve_desc(struct prb_reserved_entry *entry)
> >> {
> >> unsigned long seq, seq_newest, seq_prev_wrap;
> >> struct printk_ringbuffer *rb = entry->rb;
> >> struct prb_desc *desc;
> >> int err;
> >>
> >> /* Get descriptor for the next sequence number. */
> >> do {
> >> seq_newest = READ_ONCE(rb->seq_newest);
> >> seq = (seq_newest + 1) & PRB_SEQ_MASK;
> >> seq_prev_wrap = (seq - PRB_DESC_SIZE(rb)) & PRB_SEQ_MASK;
> >>
> >> /*
> >> * Remove conflicting descriptor from the previous wrap
> >> * if ever used. It might fail when the related data
> >> * have not been committed yet.
> >> */
> >> if (seq_prev_wrap == READ_ONCE(rb->seq_oldest)) {
> >> err = prb_remove_desc_oldest(rb, seq_prev_wrap);
> >> if (err)
> >> return err;
> >> }
> >> } while (cmpxchg(&rb->seq_newest, seq_newest, seq) != seq_newest);
> >>
> >> I am referring to this point in the code, after the
> >> cmpxchg(). seq_newest has been incremented but the descriptor is
> >> still in the unused state and seq is still 1 wrap behind. If an NMI
> >> occurs here and the NMI (or some other CPU) inserts enough entries to
> >> wrap the descriptor array, this descriptor will be reserved again,
> >> even though it has already been reserved.
> >
> > Not really, the NMI will not reach the cmpxchg() in this case.
> > prb_remove_desc_oldest() will return an error.
>
> Why will prb_remove_desc_oldest() fail? IIUC, it will return success
> because the descriptor is in the desc_miss state.
>
> > It will not be able to remove the conflicting descriptor because
> > it will still be occupied by a two-wraps-old descriptor.

Ah, I see that this situation was not handled correctly.
But it can get fixed pretty easily, see an updated
prb_remove_desc_oldest() at the end of the mail.


> Please explain why in more detail. Perhaps provide a function call
> chain? Sorry if I'm missing the obvious here.

To be on the safe side, let's try it with real numbers.

Let's have an array with 8 descriptors filled with the following
sequence numbers pointing to committed messages:

descs[0..7]: 16 17 18 19 20 21 22 23
rb->seq_oldest = 16;
rb->seq_newest = 23;

then prb_reserve_desc() would do:

seq_newest = 23;
seq = 24;
seq_prev_wrap = 16;

prb_remove_desc_oldest(rb, 16);

// let's say that it succeeds and
// rb->seq_oldest == 17;

cmpxchg(&rb->seq_newest, 23, 24) == 23

// let's say that it succeeds and it is immediately
// interrupted by an NMI before desc[0]->dst is set to 24.
// So, we still have:

descs[0..7]: 16 17 18 19 20 21 22 23
rb->seq_oldest = 17;
rb->seq_newest = 24;

Let's say that an NMI tries to print 8 messages.
After 7 successfully reserved and committed messages
we could have:

descs[0..7]: 16 25 26 27 28 29 30 31
rb->seq_oldest = 24;
rb->seq_newest = 31;

descs[0] still holds the outdated information. Now, when we try to
reserve the 8th message, prb_reserve_desc() would do:

seq_newest = 31;
seq = 32;
seq_prev_wrap = 24;

prb_remove_desc_oldest(rb, 24);

desc_state = prb_read_desc(rb, 24, &desc);

// desc_state == desc_miss because the
// descriptor still points to the outdated
// seq = 16.

// prb_remove_desc_oldest(rb, 24) would continue with:
switch (desc_state) {
/*
* Another seq means that the oldest desciptor has already been
* removed and reused. Return success in this case.
*/
case desc_miss:
return 0;

BUG: This is obviously wrong!

But this is a special case that can get detected: the seq stored in
the descriptor (dst) is exactly one wrap behind the requested one.
The proper code would be:

static int prb_remove_desc_oldest(struct printk_ringbuffer *rb,
				  unsigned long seq_oldest)
{
	struct prb_desc desc;
	enum prb_desc_state desc_state;
	int err;

	desc_state = prb_read_desc(rb, seq_oldest, &desc);
	switch (desc_state) {
	case desc_miss: {
		unsigned long seq_prev_wrap =
			(seq_oldest - PRB_DESC_SIZE(rb)) &
			PRB_SEQ_MASK;

		if (desc.dst ==
		    (seq_prev_wrap |
		     PRB_COMMITED_MASK |
		     PRB_REUSABLE_MASK)) {
			/*
			 * Special case: A reusable descriptor from
			 * the previous wrap means that the current
			 * oldest descriptor is reserved but its dst
			 * has not been updated yet.
			 */
			return -ENOMEM;
		}

		/*
		 * Any other desc_miss means that the oldest
		 * descriptor has already been removed and reused
		 * by a newer sequence number. Return success in
		 * this case. The attempt to update rb->seq_oldest
		 * will fail.
		 */
		return 0;
	}

At this point, any prb_reserve() would fail exactly this way
until the NMI returns and the message with seq == 24 gets committed.

Best Regards,
Petr

2019-08-14 03:48:57

by John Ogness

[permalink] [raw]
Subject: Re: [PATCH POC] printk_ringbuffer: Alternative implementation of lockless printk ringbuffer

Hi Petr,

FWIW, I hacked a patch against my RFCv4[0] series to implement your
POC. Some parts of the patch are not particularly pretty, as I had to
"mold" it to fit numlist usage. And I was extreme with heavy memory
barrier usage to save time. But it boots, runs, and passes all my printk
interface tests. It is all behind a PETRM_POC macro, so toggling between
the two implementations is quite simple.

I varied from your POC by having the sequence number be separate from
the id. We need 64-bit sequence numbers for all architectures, but since
there are cmpxchg() calls on the id, the id probably should remain an
unsigned long. That wasn't a problem to implement because my ringbuffer
implementation has them separate as well. It also has the benefit that
no sequence bits are sacrificed for the state bits.
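
Roughly, the descriptor then has this shape (the field names here are
illustrative, not the actual RFCv4 definitions):

	struct prb_desc_sketch {
		atomic_long_t	id;	/* identity + state flags;
					 * cmpxchg()'d, so it must fit
					 * in an unsigned long */
		u64		seq;	/* full 64-bit sequence number
					 * on all architectures */
	};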

I would like to say that I am glad to see that this works. I was not
convinced that an array-based approach would work. And although I still
think it is too complex, it is not as bad as I expected.

Particularly interesting to note in the implementation is the use of
states. numlist.c:expire_oldest_node() provides the main piece of state
logic to handle the critical point of descriptor recycling. But also
note that readers (numlist.c:numlist_read()) need to watch for two
states (committed and reusable) as well as validating the data itself.

As for performance, my test_prb test module does not provide very good
results. On a 16-core ARM64 I usually only see about 10,000 to 1,000,000
records before a writer fails to reserve. The problem seems to be that a
writer has reserved but not yet committed a record and the descriptor
array has wrapped. With the array implementation this means no further
descriptors can be assigned until that writer has committed. So a new
writer has no choice but to fail.
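
Schematically (not actual code):

	/*
	 * CPU0:    prb_reserve() -> descriptor N reserved,
	 *          interrupted (or just slow) before prb_commit()
	 * CPU1-15: reserve + commit until the descriptor array
	 *          wraps back around to N
	 * any CPU: prb_reserve() -> fails, because N is reserved
	 *          but not committed and so cannot be recycled
	 */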

With the linked list approach this problem is avoided because writers
simply remove a descriptor from the committed list. With it removed, it
is less critical how long that writer takes to commit because it is not
preventing other writers from removing descriptors from the committed
list.

To compare, with the linked list implementation, the test_prb test
module ran 14 days on a 16-core ARM64 machine before I manually stopped
the test. In that time the 15 writers wrote a total of 936,341,256,222
records (average of 735,408 per second). The single reader never lost
more than 29 consecutive records (having lost a total of 0.02% of the
records). Keep in mind all of this is using a 4KB ringbuffer.

It could be argued that the test_prb module (with its massive writing
and small ringbuffer) is unrealistic. However, I would still argue that
using a linked list to manage the committed records is a simpler, more
straightforward, more robust approach that is easier to understand and
maintain. I hope with the splitting of the linked list (numlist) into
separate source files, you will also see that it is not so complicated.

John Ogness

[0] https://lkml.kernel.org/r/[email protected]

------ BEGIN PATCH -----
diff --git a/kernel/printk/numlist.h b/kernel/printk/numlist.h
index d4595fb9a3e9..09535e3fb055 100644
--- a/kernel/printk/numlist.h
+++ b/kernel/printk/numlist.h
@@ -2,6 +2,7 @@

#ifndef _KERNEL_PRINTK_NUMLIST_H
#define _KERNEL_PRINTK_NUMLIST_H
+#define PETRM_POC

#include <linux/atomic.h>

@@ -69,4 +70,25 @@ unsigned long numlist_read_tail(struct numlist *nl, u64 *seq,
bool numlist_read(struct numlist *nl, unsigned long id, u64 *seq,
unsigned long *next_id);

+#ifdef PETRM_POC
+unsigned long prb_descs_count(struct numlist *nl);
+
+enum desc_state {
+ desc_miss,
+ desc_reserved,
+ desc_committed,
+ desc_reusable,
+};
+
+#define DESC_DST_BITS (sizeof(long) * 8)
+#define DESC_COMMITTED_MASK (1UL << (DESC_DST_BITS - 1))
+#define DESC_REUSE_MASK (1UL << (DESC_DST_BITS - 2))
+#define DESC_FLAGS_MASK (DESC_COMMITTED_MASK | DESC_REUSE_MASK)
+#define DESC_ID_MASK (~DESC_FLAGS_MASK)
+
+#define DESC_ID(id) ((id) & DESC_ID_MASK)
+#define DESC_ID_PREV_WRAP(nl, id) \
+ DESC_ID(DESC_ID(id) - prb_descs_count(nl))
+#endif
+
#endif /* _KERNEL_PRINTK_NUMLIST_H */
diff --git a/kernel/printk/numlist.c b/kernel/printk/numlist.c
index 16c6ffa74b01..de3dbdfdb0a8 100644
--- a/kernel/printk/numlist.c
+++ b/kernel/printk/numlist.c
@@ -79,6 +79,27 @@
* Nodes can become invalid while being read by non-consuming readers.
*/

+#ifdef PETRM_POC
+unsigned long prb_desc_get_id(unsigned long id, void *arg);
+void prb_desc_set_id(unsigned long id, unsigned long new_id, void *arg);
+unsigned long prb_desc_cmpxchg_id(unsigned long id, unsigned long old_id,
+ unsigned long new_id, void *arg);
+
+enum desc_state get_desc_state(unsigned long id, unsigned long entry_state)
+{
+ if (id != (entry_state & DESC_ID_MASK))
+ return desc_miss;
+
+ if (!(entry_state & DESC_COMMITTED_MASK))
+ return desc_reserved;
+
+ if (!(entry_state & DESC_REUSE_MASK))
+ return desc_committed;
+
+ return desc_reusable;
+}
+#endif
+
/**
* numlist_read() - Read the information stored within a node.
*
@@ -111,6 +132,10 @@
bool numlist_read(struct numlist *nl, unsigned long id, u64 *seq,
unsigned long *next_id)
{
+#ifdef PETRM_POC
+ enum desc_state desc_state;
+ unsigned long entry_state;
+#endif
struct nl_node *n;

n = nl->node(id, nl->node_arg);
@@ -146,6 +171,28 @@ bool numlist_read(struct numlist *nl, unsigned long id, u64 *seq,
*/
smp_rmb();

+#ifdef PETRM_POC
+ entry_state = prb_desc_get_id(id, nl->node_arg);
+ desc_state = get_desc_state(id, entry_state);
+ if (desc_state != desc_committed && desc_state != desc_reusable)
+ return false;
+
+ if (next_id) {
+ /* mark as EOL if next is not ready */
+
+ if (*next_id != DESC_ID(id + 1)) {
+ *next_id = id;
+ } else {
+ entry_state = prb_desc_get_id(*next_id, nl->node_arg);
+ desc_state = get_desc_state(*next_id, entry_state);
+ if (desc_state != desc_committed &&
+ desc_state != desc_reusable) {
+ *next_id = id;
+ }
+ }
+ }
+#endif
+
return (nl->node(id, nl->node_arg) != NULL);
}

@@ -198,6 +245,12 @@ unsigned long numlist_read_tail(struct numlist *nl, u64 *seq,
* to update @next_id of the former head node to point to this one, which
* makes this node visible to any task that sees the former head node.
*/
+#ifdef PETRM_POC
+void numlist_push(struct numlist *nl, struct nl_node *n, unsigned long id)
+{
+ prb_desc_set_id(DESC_ID(id), id | DESC_COMMITTED_MASK, nl->node_arg);
+}
+#else
void numlist_push(struct numlist *nl, struct nl_node *n, unsigned long id)
{
unsigned long head_id;
@@ -312,6 +365,7 @@ void numlist_push(struct numlist *nl, struct nl_node *n, unsigned long id)
*/
smp_store_release(&n->next_id, id);
}
+#endif

/**
* numlist_pop() - Remove the oldest node from the list.
@@ -328,6 +382,76 @@ void numlist_push(struct numlist *nl, struct nl_node *n, unsigned long id)
*
* Return: The removed node or NULL if the tail node cannot be removed.
*/
+#ifdef PETRM_POC
+bool expire_oldest_node(struct numlist *nl, unsigned long tail_id)
+{
+ enum desc_state desc_state;
+ unsigned long entry_state;
+
+ entry_state = prb_desc_get_id(tail_id, nl->node_arg);
+ desc_state = get_desc_state(tail_id, entry_state);
+
+ switch (desc_state) {
+ case desc_miss:
+ if (DESC_ID(entry_state) == DESC_ID_PREV_WRAP(nl, tail_id))
+ return false;
+ return true;
+ case desc_reserved:
+ return false;
+ case desc_committed:
+ if (nl->busy(tail_id, nl->busy_arg))
+ return false;
+ prb_desc_cmpxchg_id(tail_id, entry_state,
+ entry_state | DESC_REUSE_MASK, nl->node_arg);
+ /* fall through */
+ case desc_reusable:
+ break;
+ }
+
+ atomic_long_cmpxchg(&nl->tail_id, tail_id, DESC_ID(tail_id + 1));
+ return true;
+}
+
+struct nl_node *numlist_pop(struct numlist *nl)
+{
+ unsigned long id_prev_wrap;
+ unsigned long head_id;
+ struct nl_node *n;
+ unsigned long id;
+ unsigned long r;
+
+ head_id = atomic_long_read(&nl->head_id);
+
+ for (;;) {
+ id = DESC_ID(head_id + 1);
+ id_prev_wrap = DESC_ID_PREV_WRAP(nl, id);
+
+ if (id_prev_wrap == atomic_long_read(&nl->tail_id)) {
+ if (!expire_oldest_node(nl, id_prev_wrap))
+ return NULL;
+ }
+
+ r = atomic_long_cmpxchg(&nl->head_id, head_id, id);
+ if (r == head_id)
+ break;
+
+ head_id = r;
+ }
+
+ n = nl->node(id, nl->node_arg);
+
+ /* set to reserved */
+ prb_desc_set_id(id, id, nl->node_arg);
+
+ if (!n->seq)
+ WRITE_ONCE(n->seq, id);
+ else
+ WRITE_ONCE(n->seq, READ_ONCE(n->seq) + prb_descs_count(nl));
+ WRITE_ONCE(n->next_id, DESC_ID(id + 1));
+
+ return n;
+}
+#else
struct nl_node *numlist_pop(struct numlist *nl)
{
unsigned long tail_id;
@@ -374,3 +498,4 @@ struct nl_node *numlist_pop(struct numlist *nl)

return nl->node(tail_id, nl->node_arg);
}
+#endif
diff --git a/kernel/printk/ringbuffer.h b/kernel/printk/ringbuffer.h
index 02b4c53e287e..3e28fbad5359 100644
--- a/kernel/printk/ringbuffer.h
+++ b/kernel/printk/ringbuffer.h
@@ -218,13 +218,23 @@ struct dr_desc *prb_getdesc(unsigned long id, void *arg);
* immediately available and initialized. It is an alternative to
* manually initializing a ringbuffer with prb_init().
*/
+#ifdef PETRM_POC
+#define ID0_INITIALIZER \
+ ATOMIC_LONG_INIT(DESC_COMMITTED_MASK|DESC_REUSE_MASK)
+#define NODE0_INITIALIZER { .seq = 0, .next_id = 1, }
+#else
+#define ID0_INITIALIZER ATOMIC_LONG_INIT(0)
+#define NODE0_INITIALIZER { .seq = 0, .next_id = 0 }
+#endif
+
#define DECLARE_PRINTKRB(name, avgdatabits, descbits, waitq) \
char _##name##_data[1 << ((avgdatabits) + (descbits))] \
__aligned(__alignof__(long)); \
struct prb_desc _##name##_descs[1 << (descbits)] = { \
{ \
- .id = ATOMIC_LONG_INIT(0), \
+ .id = ID0_INITIALIZER, \
.desc = __DR_DESC_INITIALIZER, \
+ .list = NODE0_INITIALIZER, \
}, \
}; \
struct printk_ringbuffer name = { \
@@ -281,7 +291,7 @@ struct prb_entry name = { \
*
* @name: The name for the entry structure variable.
*
- * This macro is declares and initializes an entry structure without any
+ * This macro declares and initializes an entry structure without any
* buffer. This is useful if an iterator is only interested in sequence
* numbers and so does not need to read the entry data. Also, because of
* its small size, it is safe to put on the stack.
diff --git a/kernel/printk/ringbuffer.c b/kernel/printk/ringbuffer.c
index e727d9d72f65..165b2128b34e 100644
--- a/kernel/printk/ringbuffer.c
+++ b/kernel/printk/ringbuffer.c
@@ -219,8 +219,10 @@ struct nl_node *prb_desc_node(unsigned long id, void *arg)
{
struct prb_desc *d = to_desc(arg, id);

+#ifndef PETRM_POC
if (id != atomic_long_read(&d->id))
return NULL;
+#endif

return &d->list;
}
@@ -263,7 +265,11 @@ bool prb_desc_busy(unsigned long id, void *arg)
smp_rmb();

/* hC: */
+#ifdef PETRM_POC
+ return (id == DESC_ID(atomic_long_read(&d->id)));
+#else
return (id == atomic_long_read(&d->id));
+#endif
}
EXPORT_SYMBOL(prb_desc_busy);

@@ -294,7 +300,11 @@ struct dr_desc *prb_getdesc(unsigned long id, void *arg)
* assign_desc(). The smp_rmb() issued by the caller after calling
* this function pairs with that _release(). See jB for details.
*/
+#ifdef PETRM_POC
+ if (id != DESC_ID(atomic_long_read(&d->id)))
+#else
if (id != atomic_long_read(&d->id))
+#endif
return NULL;

/* iB: */
@@ -302,6 +312,44 @@ struct dr_desc *prb_getdesc(unsigned long id, void *arg)
}
EXPORT_SYMBOL(prb_getdesc);

+#ifdef PETRM_POC
+unsigned long prb_descs_count(struct numlist *nl)
+{
+ struct printk_ringbuffer *rb =
+ container_of(nl, struct printk_ringbuffer, nl);
+ return DESCS_COUNT(rb);
+}
+
+unsigned long prb_desc_get_id(unsigned long id, void *arg)
+{
+ struct prb_desc *d = to_desc(arg, DESC_ID(id));
+ unsigned long ret;
+ smp_mb();
+ ret = atomic_long_read(&d->id);
+ smp_mb();
+ return ret;
+}
+
+void prb_desc_set_id(unsigned long id, unsigned long new_id, void *arg)
+{
+ struct prb_desc *d = to_desc(arg, DESC_ID(id));
+ smp_mb();
+ atomic_long_set_release(&d->id, new_id);
+ smp_mb();
+}
+
+unsigned long prb_desc_cmpxchg_id(unsigned long id, unsigned long old_id,
+ unsigned long new_id, void *arg)
+{
+ struct prb_desc *d = to_desc(arg, DESC_ID(id));
+ unsigned long ret;
+ smp_mb();
+ ret = atomic_long_cmpxchg(&d->id, old_id, new_id);
+ smp_mb();
+ return ret;
+}
+#endif
+
/**
* assign_desc() - Assign a descriptor to the caller.
*
@@ -326,7 +374,6 @@ static bool assign_desc(struct prb_reserved_entry *e)
struct printk_ringbuffer *rb = e->rb;
struct prb_desc *d;
struct nl_node *n;
- unsigned long i;

for (;;) {
/*
@@ -340,8 +387,11 @@ static bool assign_desc(struct prb_reserved_entry *e)
break;
}

+#ifndef PETRM_POC
/* Fallback to static never-used descriptors. */
if (atomic_read(&rb->desc_next_unused) < DESCS_COUNT(rb)) {
+ unsigned long i;
+
i = atomic_fetch_inc(&rb->desc_next_unused);
if (i < DESCS_COUNT(rb)) {
d = &rb->descs[i];
@@ -354,6 +404,7 @@ static bool assign_desc(struct prb_reserved_entry *e)
break;
}
}
+#endif

/*
* No descriptor available. Make one available for recycling
@@ -383,8 +434,10 @@ static bool assign_desc(struct prb_reserved_entry *e)
* matching
* RMB between dB->iA and dI
*/
+#ifndef PETRM_POC
atomic_long_set_release(&d->id, atomic_long_read(&d->id) +
DESCS_COUNT(rb));
+#endif

e->desc = d;
return true;
@@ -507,7 +560,11 @@ void prb_commit(struct prb_reserved_entry *e)
struct prb_desc *d = e->desc;
unsigned long id;

+#ifdef PETRM_POC
+ id = DESC_ID(atomic_long_read(&d->id));
+#else
id = atomic_long_read(&d->id);
+#endif

/*
* lA:
@@ -1015,7 +1072,12 @@ void prb_init(struct printk_ringbuffer *rb, char *data, int data_size_bits,

rb->desc_count_bits = desc_count_bits;
rb->descs = descs;
+#ifdef PETRM_POC
+ atomic_long_set(&descs[0].id, DESC_COMMITTED_MASK|DESC_REUSE_MASK);
+ descs[0].list.next_id = 1;
+#else
atomic_long_set(&descs[0].id, 0);
+#endif
descs[0].desc.begin_lpos = 1;
descs[0].desc.next_lpos = 1;
atomic_set(&rb->desc_next_unused, 1);