Since the ringbuffer is lockless, there is no need for it to be
protected by @logbuf_lock. Remove @logbuf_lock.
This means that printk_nmi_direct and printk_safe_flush_on_panic()
no longer need to acquire any lock to run.
The global variables @syslog_seq, @syslog_partial, @syslog_time
were also protected by @logbuf_lock. Introduce @syslog_lock to
protect these.
@console_seq, @exclusive_console_stop_seq, @console_dropped are
protected by @console_lock.
Without @logbuf_lock it is no longer possible to use the single
static buffer for temporarily sprint'ing the message. Instead,
use vsnprintf() to determine the length and perform the real
vscnprintf() using the area reserved from the ringbuffer. This
leads to suboptimal packing of the message data, but will
result in less wasted storage than multiple per-cpu buffers to
support lockless temporary sprint'ing.
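In simplified form (ignoring the log level handling and the
truncation fallback), the new flow is:

	va_copy(args2, args);

	/* Pass 1: determine the needed size (plus terminator). */
	text_len = vsnprintf(NULL, 0, fmt, args) + 1;

	prb_rec_init_wr(&r, text_len);
	if (prb_reserve(&e, prb, &r)) {
		/* Pass 2: format directly into the reserved block. */
		text_len = vscnprintf(&r.text_buf[0], r.text_buf_size,
				      fmt, args2);
		r.info->text_len = text_len;
		prb_commit(&e);
	}

	va_end(args2);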
Signed-off-by: John Ogness <[email protected]>
---
include/linux/printk.h | 1 +
kernel/printk/internal.h | 4 +-
kernel/printk/printk.c | 285 +++++++++++++++++++-----------------
kernel/printk/printk_safe.c | 18 +--
4 files changed, 157 insertions(+), 151 deletions(-)
diff --git a/include/linux/printk.h b/include/linux/printk.h
index fe7eb2351610..6d8f844bfdff 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -45,6 +45,7 @@ static inline const char *printk_skip_headers(const char *buffer)
}
#define CONSOLE_EXT_LOG_MAX 8192
+#define CONSOLE_LOG_MAX 1024
/* printk's without a loglevel use this.. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 3a8fd491758c..e7acc2888c8e 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -12,8 +12,6 @@
#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
-extern raw_spinlock_t logbuf_lock;
-
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -59,7 +57,7 @@ void defer_console_output(void);
__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
/*
- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
* printk-safe must preserve the existing local IRQ guarantees.
*/
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e9018c4e1b66..7385101210be 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -355,48 +355,18 @@ enum log_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
-/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
- */
-DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-/*
- * Helper macros to lock/unlock logbuf_lock and switch between
- * printk-safe/unsafe modes.
- */
-#define logbuf_lock_irq() \
- do { \
- printk_safe_enter_irq(); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irq() \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irq(); \
- } while (0)
-
-#define logbuf_lock_irqsave(flags) \
- do { \
- printk_safe_enter_irqsave(flags); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irqrestore(flags) \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irqrestore(flags); \
- } while (0)
+/* The syslog_lock protects syslog_* variables. */
+static DEFINE_SPINLOCK(syslog_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
+/* All 3 protected by @console_sem. */
/* the next printk record to write to the console */
static u64 console_seq;
static u64 exclusive_console_stop_seq;
@@ -410,7 +380,7 @@ static atomic64_t clear_seq = ATOMIC64_INIT(0);
#else
#define PREFIX_MAX 32
#endif
-#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
@@ -720,27 +690,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
- logbuf_lock_irq();
if (!prb_read_valid(prb, user->seq, r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- logbuf_unlock_irq();
goto out;
}
- logbuf_unlock_irq();
ret = wait_event_interruptible(log_wait,
prb_read_valid(prb, user->seq, r));
if (ret)
goto out;
- logbuf_lock_irq();
}
if (user->seq < prb_first_valid_seq(prb)) {
/* our last seen message is gone, return error and reset */
user->seq = prb_first_valid_seq(prb);
ret = -EPIPE;
- logbuf_unlock_irq();
goto out;
}
@@ -750,7 +715,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
&r->info->dev_info);
user->seq = r->info->seq + 1;
- logbuf_unlock_irq();
if (len > count) {
ret = -EINVAL;
@@ -785,7 +749,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
if (offset)
return -ESPIPE;
- logbuf_lock_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -806,7 +769,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
default:
ret = -EINVAL;
}
- logbuf_unlock_irq();
return ret;
}
@@ -820,7 +782,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
- logbuf_lock_irq();
if (prb_read_valid(prb, user->seq, NULL)) {
/* return error when data has vanished underneath us */
if (user->seq < prb_first_valid_seq(prb))
@@ -828,7 +789,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
else
ret = EPOLLIN|EPOLLRDNORM;
}
- logbuf_unlock_irq();
return ret;
}
@@ -861,9 +821,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));
- logbuf_lock_irq();
user->seq = prb_first_valid_seq(prb);
- logbuf_unlock_irq();
file->private_data = user;
return 0;
@@ -1071,7 +1029,6 @@ void __init setup_log_buf(int early)
struct printk_record r;
size_t new_descs_size;
size_t new_infos_size;
- unsigned long flags;
char *new_log_buf;
unsigned int free;
u64 seq;
@@ -1129,8 +1086,6 @@ void __init setup_log_buf(int early)
new_descs, ilog2(new_descs_count),
new_infos);
- logbuf_lock_irqsave(flags);
-
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
@@ -1146,8 +1101,6 @@ void __init setup_log_buf(int early)
*/
prb = &printk_rb_dynamic;
- logbuf_unlock_irqrestore(flags);
-
if (seq != prb_next_seq(&printk_rb_static)) {
pr_err("dropped %llu messages\n",
prb_next_seq(&printk_rb_static) - seq);
@@ -1423,9 +1376,9 @@ static int syslog_print(char __user *buf, int size)
size_t n;
size_t skip;
- logbuf_lock_irq();
+ spin_lock_irq(&syslog_lock);
if (!prb_read_valid(prb, syslog_seq, &r)) {
- logbuf_unlock_irq();
+ spin_unlock_irq(&syslog_lock);
break;
}
if (r.info->seq != syslog_seq) {
@@ -1454,7 +1407,7 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- logbuf_unlock_irq();
+ spin_unlock_irq(&syslog_lock);
if (!n)
break;
@@ -1479,6 +1432,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
struct printk_info info;
unsigned int line_count;
struct printk_record r;
+ u64 newest_seq;
u64 clr_seq;
char *text;
int len = 0;
@@ -1490,19 +1444,30 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return -ENOMEM;
time = printk_time;
- logbuf_lock_irq();
clr_seq = atomic64_read(&clear_seq);
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
+
prb_for_each_info(clr_seq, prb, seq, &info, &line_count)
len += get_record_print_text_size(&info, line_count, true, time);
- /* move first record forward until length fits into the buffer */
+ /*
+ * Keep track of the latest in case new records are coming in fast
+ * and overwriting the older records.
+ */
+ newest_seq = seq;
+
+ /*
+ * Move first record forward until length fits into the buffer. This
+ * is a best effort attempt. If @newest_seq is reached because the
+ * ringbuffer is wrapping too fast, just start filling the buffer
+ * from there.
+ */
prb_for_each_info(clr_seq, prb, seq, &info, &line_count) {
- if (len <= size)
+ if (len <= size || info.seq > newest_seq)
break;
len -= get_record_print_text_size(&info, line_count, true, time);
}
@@ -1520,12 +1485,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
break;
}
- logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- logbuf_lock_irq();
if (len < 0)
break;
@@ -1533,7 +1496,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
if (clear)
atomic64_set(&clear_seq, seq);
- logbuf_unlock_irq();
kfree(text);
return len;
@@ -1541,9 +1503,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
- logbuf_lock_irq();
atomic64_set(&clear_seq, prb_next_seq(prb));
- logbuf_unlock_irq();
}
int do_syslog(int type, char __user *buf, int len, int source)
@@ -1551,6 +1511,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
+ u64 seq;
error = check_syslog_permissions(type, source);
if (error)
@@ -1568,8 +1529,11 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
+ spin_lock_irq(&syslog_lock);
+ seq = syslog_seq;
+ spin_unlock_irq(&syslog_lock);
error = wait_event_interruptible(log_wait,
- prb_read_valid(prb, syslog_seq, NULL));
+ prb_read_valid(prb, seq, NULL));
if (error)
return error;
error = syslog_print(buf, len);
@@ -1617,7 +1581,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- logbuf_lock_irq();
+ spin_lock_irq(&syslog_lock);
if (syslog_seq < prb_first_valid_seq(prb)) {
/* messages are gone, move to first one */
syslog_seq = prb_first_valid_seq(prb);
@@ -1644,7 +1608,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
}
error -= syslog_partial;
}
- logbuf_unlock_irq();
+ spin_unlock_irq(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1847,6 +1811,65 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
}
}
+#ifdef CONFIG_PRINTK_NMI
+#define NUM_RECURSION_CTX 2
+#else
+#define NUM_RECURSION_CTX 1
+#endif
+
+struct printk_recursion {
+ char count[NUM_RECURSION_CTX];
+};
+
+static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion);
+static char printk_recursion_count[NUM_RECURSION_CTX];
+
+static char *get_printk_count(void)
+{
+ struct printk_recursion *rec;
+ char *count;
+
+ if (!printk_percpu_data_ready()) {
+ count = &printk_recursion_count[0];
+ } else {
+ rec = this_cpu_ptr(&percpu_printk_recursion);
+
+ count = &rec->count[0];
+ }
+
+#ifdef CONFIG_PRINTK_NMI
+ if (in_nmi())
+ count++;
+#endif
+
+ return count;
+}
+
+static bool printk_enter(unsigned long *flags)
+{
+ char *count;
+
+ local_irq_save(*flags);
+ count = get_printk_count();
+ /* Only 1 level of recursion allowed. */
+ if (*count > 1) {
+ local_irq_restore(*flags);
+ return false;
+ }
+ (*count)++;
+
+ return true;
+}
+
+static void printk_exit(unsigned long flags)
+{
+ char *count;
+
+ count = get_printk_count();
+ (*count)--;
+ local_irq_restore(flags);
+}
+
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
@@ -1867,40 +1890,75 @@ static inline u32 printk_caller_id(void)
0x80000000 + raw_smp_processor_id();
}
-/* Must be called under logbuf_lock. */
+static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags,
+ const char *fmt, va_list args)
+{
+ char *orig_text = text;
+ u16 text_len;
+
+ text_len = vscnprintf(text, size, fmt, args);
+
+ /* Mark and strip a trailing newline. */
+ if (text_len && text[text_len - 1] == '\n') {
+ text_len--;
+ *lflags |= LOG_NEWLINE;
+ }
+
+ /* Strip kernel syslog prefix. */
+ if (facility == 0) {
+ while (text_len >= 2 && printk_get_level(text)) {
+ text_len -= 2;
+ text += 2;
+ }
+
+ if (text != orig_text)
+ memmove(orig_text, text, text_len);
+ }
+
+ return text_len;
+}
+
+__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
const u32 caller_id = printk_caller_id();
- static char textbuf[LOG_LINE_MAX];
struct prb_reserved_entry e;
enum log_flags lflags = 0;
struct printk_record r;
+ unsigned long irqflags;
u16 trunc_msg_len = 0;
- char *text = textbuf;
+ char lvlbuf[8];
+ va_list args2;
u16 text_len;
+ int ret = 0;
u64 ts_nsec;
ts_nsec = local_clock();
+ if (!printk_enter(&irqflags))
+ return 0;
+
+ va_copy(args2, args);
+
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
*/
- text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+ text_len = vsnprintf(&lvlbuf[0], sizeof(lvlbuf), fmt, args) + 1;
+ if (text_len > CONSOLE_LOG_MAX)
+ text_len = CONSOLE_LOG_MAX;
- /* mark and strip a trailing newline */
- if (text_len && text[text_len-1] == '\n') {
- text_len--;
- lflags |= LOG_NEWLINE;
- }
-
- /* strip kernel syslog prefix and extract log level or control flags */
+ /* Extract log level or control flags. */
if (facility == 0) {
int kern_level;
+ int i;
- while ((kern_level = printk_get_level(text)) != 0) {
+ for (i = 0; i < sizeof(lvlbuf); i += 2) {
+ kern_level = printk_get_level(&lvlbuf[i]);
+ if (!kern_level)
+ break;
switch (kern_level) {
case '0' ... '7':
if (level == LOGLEVEL_DEFAULT)
@@ -1909,9 +1967,6 @@ int vprintk_store(int facility, int level,
case 'c': /* KERN_CONT */
lflags |= LOG_CONT;
}
-
- text_len -= 2;
- text += 2;
}
}
@@ -1924,7 +1979,8 @@ int vprintk_store(int facility, int level,
if (lflags & LOG_CONT) {
prb_rec_init_wr(&r, text_len);
if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
- memcpy(&r.text_buf[r.info->text_len], text, text_len);
+ text_len = printk_sprint(&r.text_buf[r.info->text_len], text_len,
+ facility, &lflags, fmt, args2);
r.info->text_len += text_len;
if (lflags & LOG_NEWLINE) {
@@ -1934,7 +1990,8 @@ int vprintk_store(int facility, int level,
prb_commit(&e);
}
- return text_len;
+ ret = text_len;
+ goto out;
}
}
@@ -1945,11 +2002,11 @@ int vprintk_store(int facility, int level,
prb_rec_init_wr(&r, text_len + trunc_msg_len);
if (!prb_reserve(&e, prb, &r))
- return 0;
+ goto out;
}
/* fill message */
- memcpy(&r.text_buf[0], text, text_len);
+ text_len = printk_sprint(&r.text_buf[0], text_len, facility, &lflags, fmt, args2);
if (trunc_msg_len)
memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
r.info->text_len = text_len + trunc_msg_len;
@@ -1967,7 +2024,11 @@ int vprintk_store(int facility, int level,
else
prb_final_commit(&e);
- return (text_len + trunc_msg_len);
+ ret = text_len + trunc_msg_len;
+out:
+ va_end(args2);
+ printk_exit(irqflags);
+ return ret;
}
asmlinkage int vprintk_emit(int facility, int level,
@@ -1976,7 +2037,6 @@ asmlinkage int vprintk_emit(int facility, int level,
{
int printed_len;
bool in_sched = false;
- unsigned long flags;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
@@ -1991,9 +2051,7 @@ asmlinkage int vprintk_emit(int facility, int level,
printk_delay();
/* This stops the holder of console_sem just where we want him */
- logbuf_lock_irqsave(flags);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- logbuf_unlock_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
@@ -2432,7 +2490,6 @@ void console_unlock(void)
size_t len;
printk_safe_enter_irqsave(flags);
- raw_spin_lock(&logbuf_lock);
skip:
if (!prb_read_valid(prb, console_seq, &r))
break;
@@ -2476,7 +2533,6 @@ void console_unlock(void)
console_msg_format & MSG_FORMAT_SYSLOG,
printk_time);
console_seq++;
- raw_spin_unlock(&logbuf_lock);
/*
* While actively printing out messages, if another printk()
@@ -2503,8 +2559,6 @@ void console_unlock(void)
console_locked = 0;
- raw_spin_unlock(&logbuf_lock);
-
up_console_sem();
/*
@@ -2513,9 +2567,7 @@ void console_unlock(void)
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
- raw_spin_lock(&logbuf_lock);
retry = prb_read_valid(prb, console_seq, NULL);
- raw_spin_unlock(&logbuf_lock);
printk_safe_exit_irqrestore(flags);
if (retry && console_trylock())
@@ -2579,13 +2631,8 @@ void console_flush_on_panic(enum con_flush_mode mode)
console_trylock();
console_may_schedule = 0;
- if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
-
- logbuf_lock_irqsave(flags);
+ if (mode == CONSOLE_REPLAY_ALL)
console_seq = prb_first_valid_seq(prb);
- logbuf_unlock_irqrestore(flags);
- }
console_unlock();
}
@@ -2809,11 +2856,7 @@ void register_console(struct console *newcon)
nr_ext_console_drivers++;
if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- */
- logbuf_lock_irqsave(flags);
+ spin_lock_irqsave(&syslog_lock, flags);
/*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
@@ -2826,7 +2869,7 @@ void register_console(struct console *newcon)
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
console_seq = syslog_seq;
- logbuf_unlock_irqrestore(flags);
+ spin_unlock_irqrestore(&syslog_lock, flags);
}
console_unlock();
console_sysfs_notify();
@@ -3190,7 +3233,6 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
void kmsg_dump(enum kmsg_dump_reason reason)
{
struct kmsg_dumper *dumper;
- unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
@@ -3210,10 +3252,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
/* initialize iterator with data about the stored records */
dumper->active = true;
- logbuf_lock_irqsave(flags);
- dumper->cur_seq = atomic64_read(&clear_seq);
- dumper->next_seq = prb_next_seq(prb);
- logbuf_unlock_irqrestore(flags);
+ kmsg_dump_rewind_nolock(dumper);
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason);
@@ -3300,14 +3339,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
char *line, size_t size, size_t *len)
{
- unsigned long flags;
- bool ret;
-
- logbuf_lock_irqsave(flags);
- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- logbuf_unlock_irqrestore(flags);
-
- return ret;
+ return kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
@@ -3336,7 +3368,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
struct printk_info info;
unsigned int line_count;
struct printk_record r;
- unsigned long flags;
u64 seq;
u64 next_seq;
size_t l = 0;
@@ -3348,17 +3379,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
if (!dumper->active || !buf || !size)
goto out;
- logbuf_lock_irqsave(flags);
if (dumper->cur_seq < prb_first_valid_seq(prb)) {
/* messages are gone, move to first available one */
dumper->cur_seq = prb_first_valid_seq(prb);
}
/* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
- logbuf_unlock_irqrestore(flags);
+ if (dumper->cur_seq >= dumper->next_seq)
goto out;
- }
/* calculate length of entire buffer */
seq = dumper->cur_seq;
@@ -3398,7 +3426,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
dumper->next_seq = next_seq;
ret = true;
- logbuf_unlock_irqrestore(flags);
out:
if (len)
*len = l;
@@ -3413,8 +3440,6 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
- *
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
*/
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
{
@@ -3432,11 +3457,7 @@ void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
*/
void kmsg_dump_rewind(struct kmsg_dumper *dumper)
{
- unsigned long flags;
-
- logbuf_lock_irqsave(flags);
kmsg_dump_rewind_nolock(dumper);
- logbuf_unlock_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index a0e6f746de6c..7248b6e3df6c 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -266,18 +266,6 @@ void printk_safe_flush(void)
*/
void printk_safe_flush_on_panic(void)
{
- /*
- * Make sure that we could access the main ring buffer.
- * Do not risk a double release when more CPUs are up.
- */
- if (raw_spin_is_locked(&logbuf_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&logbuf_lock);
- }
-
printk_safe_flush();
}
@@ -371,17 +359,15 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
* Try to use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
- raw_spin_trylock(&logbuf_lock)) {
+ if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
int len;
len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- raw_spin_unlock(&logbuf_lock);
defer_console_output();
return len;
}
- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
+ /* Use extra buffer in NMI or in safe mode. */
if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
return vprintk_nmi(fmt, args);
--
2.20.1
On (20/12/01 21:59), John Ogness wrote:
>
> +#ifdef CONFIG_PRINTK_NMI
> +#define NUM_RECURSION_CTX 2
> +#else
> +#define NUM_RECURSION_CTX 1
> +#endif
> +
> +struct printk_recursion {
> + char count[NUM_RECURSION_CTX];
> +};
> +
> +static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion);
> +static char printk_recursion_count[NUM_RECURSION_CTX];
> +
> +static char *get_printk_count(void)
A nit: I think get_foo() has some sort of special meaning and one would
expect that there should be put_foo() as well, and that those have
something to do with the ref counting.
> +{
> + struct printk_recursion *rec;
> + char *count;
> +
> + if (!printk_percpu_data_ready()) {
> + count = &printk_recursion_count[0];
> + } else {
> + rec = this_cpu_ptr(&percpu_printk_recursion);
> +
> + count = &rec->count[0];
> + }
> +
> +#ifdef CONFIG_PRINTK_NMI
> + if (in_nmi())
> + count++;
> +#endif
> +
> + return count;
> +}
> +
> +static bool printk_enter(unsigned long *flags)
This better explicitly state that it disables local IRQs
printk_enter_irqsave()
I'm not very certain that printk_enter/printk_exit are best names:
if (!printk_enter())
	return;
Maybe it can spell out why we couldn't enter printk so the function
name can contain recursion_limit or something.
> +{
> + char *count;
> +
> + local_irq_save(*flags);
> + count = get_printk_count();
> + /* Only 1 level of recursion allowed. */
> + if (*count > 1) {
> + local_irq_restore(*flags);
> + return false;
> + }
> + (*count)++;
> +
> + return true;
> +}
> +
> +static void printk_exit(unsigned long flags)
This enables local IRQs, so
printk_exit_irqrestore()
> +{
> + char *count;
> +
> + count = get_printk_count();
> + (*count)--;
> + local_irq_restore(flags);
> +}
-ss
On Tue 2020-12-01 21:59:41, John Ogness wrote:
> Since the ringbuffer is lockless, there is no need for it to be
> protected by @logbuf_lock. Remove @logbuf_lock.
I am going to split the feedback into a few mails. It might make sense
to also split this patch into a few more pieces that would remove the lock
from a particular interface.
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index e9018c4e1b66..7385101210be 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -785,7 +749,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
> if (offset)
> return -ESPIPE;
>
> - logbuf_lock_irq();
user->seq manipulation is no longer safe from the atomicity point of view.
One solution would be to use an atomic variable in struct devkmsg_user().
Another solution would be to synchronize it with user->lock like we do
in devkmsg_read().
user->lock looks like an overhead. But it actually would make sense to
prevent seek in the middle of a read.
> switch (whence) {
> case SEEK_SET:
> /* the first record */
> @@ -820,7 +782,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
>
> poll_wait(file, &log_wait, wait);
>
> - logbuf_lock_irq();
> if (prb_read_valid(prb, user->seq, NULL)) {
Same here. The atomicity of user->seq read/write is not guaranteed.
> /* return error when data has vanished underneath us */
> if (user->seq < prb_first_valid_seq(prb))
Best Regards,
Petr
On Tue 2020-12-01 21:59:41, John Ogness wrote:
> Since the ringbuffer is lockless, there is no need for it to be
> protected by @logbuf_lock. Remove @logbuf_lock.
>
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1490,19 +1444,30 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
> return -ENOMEM;
>
> time = printk_time;
> - logbuf_lock_irq();
> clr_seq = atomic64_read(&clear_seq);
>
> /*
> * Find first record that fits, including all following records,
> * into the user-provided buffer for this dump.
> */
> +
> prb_for_each_info(clr_seq, prb, seq, &info, &line_count)
> len += get_record_print_text_size(&info, line_count, true, time);
>
> - /* move first record forward until length fits into the buffer */
> + /*
> + * Keep track of the latest in case new records are coming in fast
> + * and overwriting the older records.
> + */
"overwriting the older records" sounds like the code is somehow able
to remove the overwritten records from "len". But it is not true.
> + newest_seq = seq;
> +
> + /*
> + * Move first record forward until length fits into the buffer. This
> + * is a best effort attempt. If @newest_seq is reached because the
> + * ringbuffer is wrapping too fast, just start filling the buffer
> + * from there.
> + */
It might be that I do not understand English well. But "start filling
the buffer from there" sounds like we start filling the buffer from
"newest_seq".
What about the following?
/*
 * Move first record forward until length fits into the buffer.
 * Ignore newest messages that were not counted in the above
 * cycle. Messages might appear and get lost in the meantime.
 * This is the best effort that prevents an infinite loop.
 */
newest_seq = seq;
> prb_for_each_info(clr_seq, prb, seq, &info, &line_count) {
> - if (len <= size)
> + if (len <= size || info.seq > newest_seq)
> break;
> len -= get_record_print_text_size(&info, line_count, true, time);
> }
> @@ -1568,8 +1529,11 @@ int do_syslog(int type, char __user *buf, int len, int source)
> return 0;
> if (!access_ok(buf, len))
> return -EFAULT;
> + spin_lock_irq(&syslog_lock);
> + seq = syslog_seq;
> + spin_unlock_irq(&syslog_lock);
It would deserve a comment that the locking is needed to guarantee
atomicity of the operation.
> error = wait_event_interruptible(log_wait,
> - prb_read_valid(prb, syslog_seq, NULL));
> + prb_read_valid(prb, seq, NULL));
> if (error)
> return error;
> error = syslog_print(buf, len);
> @@ -2809,11 +2856,7 @@ void register_console(struct console *newcon)
> nr_ext_console_drivers++;
>
> if (newcon->flags & CON_PRINTBUFFER) {
> - /*
> - * console_unlock(); will print out the buffered messages
> - * for us.
> - */
> - logbuf_lock_irqsave(flags);
> + spin_lock_irqsave(&syslog_lock, flags);
We should take the lock only around assigning syslog_seq. And add a
comment that it guarantees atomic update.
> /*
> * We're about to replay the log buffer. Only do this to the
> * just-registered console to avoid excessive message spam to
> @@ -2826,7 +2869,7 @@ void register_console(struct console *newcon)
> exclusive_console = newcon;
> exclusive_console_stop_seq = console_seq;
> console_seq = syslog_seq;
> - logbuf_unlock_irqrestore(flags);
> + spin_unlock_irqrestore(&syslog_lock, flags);
> }
> console_unlock();
> console_sysfs_notify();
Best Regards,
Petr
On Tue 2020-12-01 21:59:41, John Ogness wrote:
> Since the ringbuffer is lockless, there is no need for it to be
> protected by @logbuf_lock. Remove @logbuf_lock.
>
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1847,6 +1811,65 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
> }
> }
The recursion-related code needs some explanation or we should do it
another way. I spent quite some time on it and I am still not sure
that I understand it. Let me describe how I understand it.
> +#ifdef CONFIG_PRINTK_NMI
> +#define NUM_RECURSION_CTX 2
> +#else
> +#define NUM_RECURSION_CTX 1
> +#endif
OK, the number of contexts is limited because interrupts are disabled inside
printk_enter()/printk_exit(). It is basically the same reason why
we have only two printk_safe buffers (NMI + other contexts).
What is the exact reason to disable interrupts around the entire
vprintk_store(), please? It should get documented.
One reason is the use of per-cpu variables. Alternative solution would
be to store printk_context into task_struct. Well, I am not sure if
"current" task is available during early boot. But it might solve
problems with per-cpu variables that are not working during early boot.
That said, I am not sure if it is worth it.
> +
> +struct printk_recursion {
> + char count[NUM_RECURSION_CTX];
> +};
>
> +static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion);
> +static char printk_recursion_count[NUM_RECURSION_CTX];
This is pretty confusing. The array is hidden in a struct when per-cpu
variables are used. And a naked array is used for early boot.
Is the structure really needed? What about?
static DEFINE_PER_CPU(char [NUM_RECURSION_CTX], printk_count);
static char printk_count_early[NUM_RECURSION_CTX];
> +
> +static char *get_printk_count(void)
> +{
> + struct printk_recursion *rec;
> + char *count;
> +
> + if (!printk_percpu_data_ready()) {
> + count = &printk_recursion_count[0];
I see why you avoided per-cpu variables for early boot. I am just
curious how the printk_context variable works these days. It is used by
any printk(), including early code, see vprintk_func().
> + } else {
> + rec = this_cpu_ptr(&percpu_printk_recursion);
> +
> + count = &rec->count[0];
> + }
> +
> +#ifdef CONFIG_PRINTK_NMI
> + if (in_nmi())
> + count++;
> +#endif
This is extremely confusing. It is far from obvious that
the pointer and not the value is incremented.
If we really need this to avoid per-cpu variables during early boot
then a more clear implementation would be:
static char *get_printk_counter_by_ctx(void)
{
	int ctx = 0;

	if (in_nmi())
		ctx = 1;

	if (!printk_percpu_data_ready())
		return &printk_count_early[ctx];

	return this_cpu_ptr(&printk_count[ctx]);
}
> +
> + return count;
> +}
> +
> +static bool printk_enter(unsigned long *flags)
> +{
> + char *count;
> +
> + local_irq_save(*flags);
> + count = get_printk_count();
> + /* Only 1 level of recursion allowed. */
We should allow at least some level of recursion. Otherwise, we would
not see warnings printed from vsprintf code.
> + if (*count > 1) {
> + local_irq_restore(*flags);
> + return false;
> + }
> + (*count)++;
> +
> + return true;
> +}
This should be unified with printk_context, printk_nmi_enter(),
printk_nmi_exit(). It does not make sense to have two separate
printk context counters.
Or is there any plan to remove printk_safe and printk_context?
BTW: I prefer to use the bitmask approach. It allows to check
the normal context by a single operation (no bit is set).
There is no need to go through all counters.
Note that we need at least one more context for gdb.
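E.g. something like this (hypothetical names, untested):

#define PRINTK_CTX_RECURSION	BIT(0)
#define PRINTK_CTX_NMI		BIT(1)
#define PRINTK_CTX_KGDB		BIT(2)

static DEFINE_PER_CPU(unsigned long, printk_context);

/* the normal context check is then a single operation: */
static bool printk_normal_context(void)
{
	return !this_cpu_read(printk_context);
}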
Best Regards,
Petr
On Tue 2020-12-01 21:59:41, John Ogness wrote:
> Since the ringbuffer is lockless, there is no need for it to be
> protected by @logbuf_lock. Remove @logbuf_lock.
>
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1867,40 +1890,75 @@ static inline u32 printk_caller_id(void)
> 0x80000000 + raw_smp_processor_id();
> }
>
> -/* Must be called under logbuf_lock. */
> +static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags,
> + const char *fmt, va_list args)
> +{
> + char *orig_text = text;
> + u16 text_len;
> +
> + text_len = vscnprintf(text, size, fmt, args);
> +
> + /* Mark and strip a trailing newline. */
> + if (text_len && text[text_len - 1] == '\n') {
> + text_len--;
> + *lflags |= LOG_NEWLINE;
> + }
We reserve the space for '\n' anyway. It would make sense to just
store it and remove all these LOG_NEWLINE-specific hacks.
Well, let's leave it as is now. It is still possible that people
will not love this approach and we will need to format the message
into some temporary buffer first.
> + /* Strip kernel syslog prefix. */
Syslog actually uses the <level> format. We are skipping the log level
and control flags here.
> + if (facility == 0) {
> + while (text_len >= 2 && printk_get_level(text)) {
> + text_len -= 2;
> + text += 2;
> + }
We should avoid two completely different approaches
that handle printk_level prefix.
One solution is to implement something like:
static char *parse_prefix(text, &level, &flags)
That would return pointer to the text after the prefix.
And fill level and flags only when non-NULL pointers are passed.
Another solution would be to pass this information from
vprintk_store(). The prefix has already been parsed
after all.
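A rough sketch of such a helper (untested):

static char *parse_prefix(char *text, int *level,
			  enum log_flags *lflags)
{
	char prefix;

	while ((prefix = printk_get_level(text)) != 0) {
		switch (prefix) {
		case '0' ... '7':
			if (level && *level == LOGLEVEL_DEFAULT)
				*level = prefix - '0';
			break;
		case 'c':	/* KERN_CONT */
			if (lflags)
				*lflags |= LOG_CONT;
			break;
		}
		text += 2;
	}

	return text;
}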
> +
> + if (text != orig_text)
> + memmove(orig_text, text, text_len);
> + }
We should clear the freed space to make the ring buffer as
human readable as possible when someone just dumps the memory.
Sigh, I have to admit that I missed the problem with prefix and
trailing '\n' when I suggested to avoid the temporary buffers.
This memmove() and the space wasting is a pity.
Well, it is typically 3 bytes per message. And the copying would
be necessary even with the temporary buffer. So, I am less convinced
but I would still try to avoid the temporary buffers for now.
> +
> + return text_len;
> +}
> +
> +__printf(4, 0)
Best Regards,
Petr
On Tue 2020-12-01 21:59:41, John Ogness wrote:
> Since the ringbuffer is lockless, there is no need for it to be
> protected by @logbuf_lock. Remove @logbuf_lock.
>
> --- a/kernel/printk/internal.h
> +++ b/kernel/printk/internal.h
> @@ -59,7 +57,7 @@ void defer_console_output(void);
> __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
>
> /*
> - * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
> + * In !PRINTK builds we still export console_sem
> * semaphore and some of console functions (console_unlock()/etc.), so
> * printk-safe must preserve the existing local IRQ guarantees.
We should revisit whether it is still needed just for console_sem.
Well, I wonder why we need printk_safe at all.
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -2432,7 +2490,6 @@ void console_unlock(void)
> size_t len;
>
> printk_safe_enter_irqsave(flags);
Why do we actually need to use the printk_safe context here?
There is no longer a risk of deadlock caused by logbuf_lock.
All other recursions should be prevented by console_trylock()
in printk(). Am I missing anything?
Note that we still need to disable interrupts around
console_lock_spinning_enable();
console_lock_spinning_disable_and_check();
to make sure that printk() could busy wait for passing
the console lock.
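I.e. the hand-over window would still look something like this
(sketch; the current code gets this via printk_safe_enter/exit):

	local_irq_save(flags);
	console_lock_spinning_enable();
	call_console_drivers(ext_text, ext_len, text, len);
	handover = console_lock_spinning_disable_and_check();
	local_irq_restore(flags);
	if (handover)
		return;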
> - raw_spin_lock(&logbuf_lock);
> skip:
> if (!prb_read_valid(prb, console_seq, &r))
> break;
Best Regards,
Petr
On (20/12/04 17:10), Petr Mladek wrote:
[..]
> static char *get_printk_counter_by_ctx(void)
> {
> 	int ctx = 0;
>
> 	if (in_nmi())
> 		ctx = 1;
>
> 	if (!printk_percpu_data_ready())
> 		return &printk_count_early[ctx];
>
> 	return this_cpu_ptr(&printk_count[ctx]);
> }
>
> > +
> > + return count;
> > +}
> > +
> > +static bool printk_enter(unsigned long *flags)
> > +{
> > + char *count;
> > +
> > + local_irq_save(*flags);
> > + count = get_printk_count();
> > + /* Only 1 level of recursion allowed. */
>
> We should allow at least some level of recursion. Otherwise, we would
> not see warnings printed from vsprintf code.
One level of recursion seems reasonable on one hand, but on the other
hand I also wonder if we can get useful info from recursion levels 2
and higher. It would be great to maybe list possible scenarios. vsprintf()
can still re-enter printk() -- WARN_ONCE()-s in the code -- and external
format specifier handlers can, too. Would we need to allow two levels of
recursion (printk->vsprintf->printk->???->printk) or is one just enough?
It also would make sense to add a lost-messages counter to the per-CPU
recursion counter struct, to count the number of times we bail out
of printk due to the recursion limit. So that we'll at least have
"%d recursive printk messages lost" hints.
Overall...
I wonder where the "limit printk recursion" requirement comes from. printk_safe
doesn't impose any strict limits (printk_context is limited, but still)
and we've been running it for years now; have we ever seen any reports
of printk recursion overflows?
> > + if (*count > 1) {
> > + local_irq_restore(*flags);
> > + return false;
> > + }
> > + (*count)++;
> > +
> > + return true;
> > +}
>
> This should be unified with printk_context, printk_nmi_enter(),
> printk_nmi_exit(). It does not make sense to have two separate
> printk context counters.
Agreed.
> Or is there any plan to remove printk_safe and printk_context?
That's a good point. This patch set and printk_safe answer the
same question in different ways, as far as I understand it. The
question is "Why do we want to track printk recursion"? This patch
set merely wants to, correct me if I'm wrong, avoid the very deep
vprintk_store() recursion stacks (which is a subset of the printk()
recursion superset):
vprintk_store()
{
	if (!printk_enter())
		return;

	vsprintf/prb

	printk_exit();
}
And that's pretty much it, at least for the time being.
printk_safe()'s answer is - we don't want to re-enter parts of
the kernel that sit in the core, behind the scenes, and that are
not ready to be re-entered. Things like
printk()
 down_console_sem()
  down()
   raw_spin_lock_irqsave(&sem->lock)
    printk()
     down_console_sem()
      down()
       raw_spin_lock_irqsave(&sem->lock)
-ss
On (20/12/04 17:19), Petr Mladek wrote:
[..]
> > --- a/kernel/printk/printk.c
> > +++ b/kernel/printk/printk.c
> > @@ -2432,7 +2490,6 @@ void console_unlock(void)
> > size_t len;
> >
> > printk_safe_enter_irqsave(flags);
>
> Why do we actually need to use the printk_safe context here?
> There is no longer a risk of deadlock caused by logbuf_lock.
> All other recursions should be prevented by console_trylock()
> in printk().
All semaphore functions, including down_trylock(), acquire the
semaphore spin_lock; and some then call into the scheduler
(or other kernel core functions) under the semaphore's spin_lock.
For instance
up()
raw_spin_lock_irqsave(&sem->lock)
__up()
wake_up_process()
try_to_wake_up()
-ss
On (20/12/04 17:10), Petr Mladek wrote:
>
> One reason is the use of per-cpu variables. Alternative solution would
> be to store printk_context into task_struct.
We can keep it per-CPU, disable preemption and have counters for
every context (task, soft/hard irq, NMI). Shouldn't be a problem:

vprintk_emit()
{
	preempt_disable()
	vprintk_store()
	preempt_enable()

	preempt_disable()
	console_unlock()
	preempt_enable()
}
vprintk_store() is a small fraction of console_unlock() time wise.
-ss
On 2020-12-04, Sergey Senozhatsky <[email protected]> wrote:
>> +static char *get_printk_count(void)
>
> A nit: I think get_foo() has some sort of special meaning and one
> would expect that there should be put_foo() as well, and that those
> have something to do with the ref counting.
OK. How about:
static char *printk_recursion_counter(void)
?
>> +static bool printk_enter(unsigned long *flags)
>
> This better explicitly state that it disables local IRQs
>
> printk_enter_irqsave()
>
> I'm not very certain that printk_enter/printk_exit are best names:
>
> if (!printk_enter())
> 	return;
>
> Maybe it can spell out why we couldn't enter printk so the function
> name can contain recursion_limit or something.
printk_recurse_irqsave() ?
I would prefer printk_enter_irqsave() and just add comments that say it
is checking the recursion level. It is only used in one place.
>> +static void printk_exit(unsigned long flags)
>
> This enables local IRQs, so
>
> printk_exit_irqrestore()
Yes, I like that.
John Ogness
On 2020-12-04, Petr Mladek <[email protected]> wrote:
>> Since the ringbuffer is lockless, there is no need for it to be
>> protected by @logbuf_lock. Remove @logbuf_lock.
>
> It might make sense to also split this patch into a few more pieces that
> would remove the lock from a particular interface.
OK.
>> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
>> index e9018c4e1b66..7385101210be 100644
>> --- a/kernel/printk/printk.c
>> +++ b/kernel/printk/printk.c
>> @@ -785,7 +749,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
>> if (offset)
>> return -ESPIPE;
>>
>> - logbuf_lock_irq();
>
> user->seq manipulation is no longer safe from the atomicity point of
> view.
>
> One solution would be to use an atomic variable in struct devkmsg_user().
> Another solution would be to synchronize it with user->lock like we do
> in devkmsg_read().
>
> user->lock looks like an overhead. But it actually would make sense to
> prevent seek in the middle of a read.
I would prefer using atomic64_t. Using user->lock could introduce some
wacky regression.
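I.e. something like (sketch):

	struct devkmsg_user {
		atomic64_t seq;
		...
	};

	/* e.g. devkmsg_llseek(): */
	atomic64_set(&user->seq, prb_first_valid_seq(prb));

	/* e.g. devkmsg_poll(): */
	if (prb_read_valid(prb, atomic64_read(&user->seq), NULL)) {
		...
	}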
>> switch (whence) {
>> case SEEK_SET:
>> /* the first record */
>> @@ -820,7 +782,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
>>
>> poll_wait(file, &log_wait, wait);
>>
>> - logbuf_lock_irq();
>> if (prb_read_valid(prb, user->seq, NULL)) {
>
> Same here. The atomicity of user->seq read/write is not guaranteed.
Right.
John Ogness
On 2020-12-04, Petr Mladek <[email protected]> wrote:
> On Tue 2020-12-01 21:59:41, John Ogness wrote:
>> Since the ringbuffer is lockless, there is no need for it to be
>> protected by @logbuf_lock. Remove @logbuf_lock.
>>
>> --- a/kernel/printk/printk.c
>> +++ b/kernel/printk/printk.c
>> @@ -1490,19 +1444,30 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
>> return -ENOMEM;
>>
>> time = printk_time;
>> - logbuf_lock_irq();
>> clr_seq = atomic64_read(&clear_seq);
>>
>> /*
>> * Find first record that fits, including all following records,
>> * into the user-provided buffer for this dump.
>> */
>> +
>> prb_for_each_info(clr_seq, prb, seq, &info, &line_count)
>> len += get_record_print_text_size(&info, line_count, true, time);
>>
>> - /* move first record forward until length fits into the buffer */
>> + /*
>> + * Keep track of the latest in case new records are coming in fast
>> + * and overwriting the older records.
>> + */
Your suggestion to merge this and the next comment block is fine.
>> + newest_seq = seq;
>> +
>> + /*
>> + * Move first record forward until length fits into the buffer. This
>> + * is a best effort attempt. If @newest_seq is reached because the
>> + * ringbuffer is wrapping too fast, just start filling the buffer
>> + * from there.
>> + */
>
> It might be that I do not understand English well. But "start filling
> the buffer from there" sounds like we start filling the buffer from
> "newest_seq".
>
> What about the following?
>
> /*
>  * Move first record forward until length fits into the buffer.
>  * Ignore newest messages that were not counted in the above
>  * cycle. Messages might appear and get lost in the meantime.
>  * This is the best effort that prevents an infinite loop.
>  */
> newest_seq = seq;
OK.
>> prb_for_each_info(clr_seq, prb, seq, &info, &line_count) {
>> - if (len <= size)
>> + if (len <= size || info.seq > newest_seq)
>> break;
>> len -= get_record_print_text_size(&info, line_count, true, time);
>> }
>> @@ -1568,8 +1529,11 @@ int do_syslog(int type, char __user *buf, int len, int source)
>> return 0;
>> if (!access_ok(buf, len))
>> return -EFAULT;
>> + spin_lock_irq(&syslog_lock);
>> + seq = syslog_seq;
>> + spin_unlock_irq(&syslog_lock);
>
> It would deserve a comment that the locking is needed to guarantee
> atomicity of the operation.
OK.
>> error = wait_event_interruptible(log_wait,
>> - prb_read_valid(prb, syslog_seq, NULL));
>> + prb_read_valid(prb, seq, NULL));
>> if (error)
>> return error;
>> error = syslog_print(buf, len);
>> @@ -2809,11 +2856,7 @@ void register_console(struct console *newcon)
>> nr_ext_console_drivers++;
>>
>> if (newcon->flags & CON_PRINTBUFFER) {
>> - /*
>> - * console_unlock(); will print out the buffered messages
>> - * for us.
>> - */
>> - logbuf_lock_irqsave(flags);
>> + spin_lock_irqsave(&syslog_lock, flags);
>
> We should take the lock only around assigning syslog_seq. And add a
> comment that it guarantees atomic update.
OK. So you just want "exclusive_console = newcon;" moved outside the
critical section.
>> /*
>> * We're about to replay the log buffer. Only do this to the
>> * just-registered console to avoid excessive message spam to
>> @@ -2826,7 +2869,7 @@ void register_console(struct console *newcon)
>> exclusive_console = newcon;
>> exclusive_console_stop_seq = console_seq;
>> console_seq = syslog_seq;
>> - logbuf_unlock_irqrestore(flags);
>> + spin_unlock_irqrestore(&syslog_lock, flags);
>> }
>> console_unlock();
>> console_sysfs_notify();
John Ogness
On 2020-12-04, Petr Mladek <[email protected]> wrote:
> On Tue 2020-12-01 21:59:41, John Ogness wrote:
>> Since the ringbuffer is lockless, there is no need for it to be
>> protected by @logbuf_lock. Remove @logbuf_lock.
>>
>> --- a/kernel/printk/printk.c
>> +++ b/kernel/printk/printk.c
>> @@ -1847,6 +1811,65 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
>> }
>> }
>
> The recursion-related code needs some explanation or we should do it
> another way. I spent quite some time on it and I am still not sure
> that I understand it.
Sorry. :-/
> Let me describe how I understand it.
>
>> +#ifdef CONFIG_PRINTK_NMI
>> +#define NUM_RECURSION_CTX 2
>> +#else
>> +#define NUM_RECURSION_CTX 1
>> +#endif
>
> OK, the number of contexts is limited because interrupts are disabled
> inside printk_enter()/printk_exit(). It is basically the same reason
> why we have only two printk_safe buffers (NMI + other contexts).
Correct.
> What is the exact reason to disable interrupts around the entire
> vprintk_store(), please? It should get documented.
It simplifies the context tracking. Also, in mainline interrupts are
already disabled for all of vprintk_store(). AFAIK latencies due to
logbuf_lock contention were not an issue.
> One reason is the use of per-cpu variables. Alternative solution would
> be to store printk_context into task_struct. Well, I am not sure if
> "current" task is available during early boot. But it might solve
> problems with per-cpu variables that are not working during early
> boot.
>
> That said, I am not sure if it is worth it.
I really don't want to touch task_struct. IMHO the usefulness of that
struct is limited, considering that printk can be called from scheduling
and interrupting contexts.
>> +
>> +struct printk_recursion {
>> + char count[NUM_RECURSION_CTX];
>> +};
>>
>> +static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion);
>> +static char printk_recursion_count[NUM_RECURSION_CTX];
>
> This is pretty confusing. The array is hidden in a struct when per-cpu
> variables are used. And a naked array is used for early boot.
>
> Is the structure really needed? What about?
>
> static DEFINE_PER_CPU(char [NUM_RECURSION_CTX], printk_count);
> static char printk_count_early[NUM_RECURSION_CTX];
OK.
>> +
>> +static char *get_printk_count(void)
>> +{
>> + struct printk_recursion *rec;
>> + char *count;
>> +
>> + if (!printk_percpu_data_ready()) {
>> + count = &printk_recursion_count[0];
>
> I see why you avoided per-cpu variables for early boot. I am just
> curious how the printk_context variable works these days. It is used by
> any printk(), including early code, see vprintk_func().
IMO printk_context is serving a different purpose. With the existence of
logbuf_lock, printk_context exists for the sole purpose of making sure
logbuf_lock is not taken recursively or that the CPU does not spin on it
in NMI context. printk_context is simply gating calls to the safe
buffers.
For the lockless ringbuffer, there is no issue of taking a lock
recursively or dangers from NMI. There is no need for the printk_context
"gate". However, IMHO there is a real danger if a bug in printk (or its
ringbuffer) lead to infinite recursion. This new recursion counter is
offering safety against this scenario. Until now this scenario has been
ignored. So I suppose if we are comfortable with continuing to ignore
the scenario, then we don't need to track the recursion level.
To test, I triggered artificial WARNs in the vsnprintf() code used by printk. I
found it nice to be able to see the stack trace going into printk and at
the same time I was relieved that such a nested warning was not blasting
the system into infinite recursion.
>> + } else {
>> + rec = this_cpu_ptr(&percpu_printk_recursion);
>> +
>> + count = &rec->count[0];
>> + }
>> +
>> +#ifdef CONFIG_PRINTK_NMI
>> + if (in_nmi())
>> + count++;
>> +#endif
>
> This is extremely confusing. It is far from obvious that
> the pointer and not the value is incremented.
>
> If we really need this to avoid per-cpu variables during early boot
> then a more clear implementation would be:
>
> static char *get_printk_counter_by_ctx(void)
> {
> 	int ctx = 0;
>
> 	if (in_nmi())
> 		ctx = 1;
>
> 	if (!printk_percpu_data_ready())
> 		return &printk_count_early[ctx];
>
> 	return this_cpu_ptr(&printk_count[ctx]);
> }
Yes, much cleaner. Thank you.
>> +
>> + return count;
>> +}
>> +
>> +static bool printk_enter(unsigned long *flags)
>> +{
>> + char *count;
>> +
>> + local_irq_save(*flags);
>> + count = get_printk_count();
>> + /* Only 1 level of recursion allowed. */
>
> We should allow at least some level of recursion. Otherwise, we would
> not see warnings printed from vsprintf code.
With 1 level, you will see warnings from vsprintf code. I'm not sure it
makes sense to allow more than 1 level. It causes exponential logging.
>> + if (*count > 1) {
>> + local_irq_restore(*flags);
>> + return false;
>> + }
>> + (*count)++;
>> +
>> + return true;
>> +}
>
> This should be unified with printk_context, printk_nmi_enter(),
> printk_nmi_exit(). It does not make sense to have two separate
> printk context counters.
>
> Or is there any plan to remove printk_safe and printk_context?
Yes, I plan to remove the safe buffers, which also removes printk_safe.c
and the printk_context "gate".
> BTW: I prefer to use the bitmask approach. It allows to check
> the normal context by a single operation (no bit is set).
> There is no need to go through all counters.
OK.
> Note that we need at least one more context for gdb.
Ah yes, thank you.
John Ogness
On 2020-12-05, Sergey Senozhatsky <[email protected]> wrote:
>> We should allow at least some level of recursion. Otherwise, we would
>> not see warnings printed from vsprintf code.
>
> One level of recursion seems reasonable on one hand, but on the other
> hand I also wonder if we can get useful info from recursion levels 2
> and higher. It would be great to maybe list possible scenarios. vsprintf()
> can still re-enter printk() -- WARN_ONCE()-s in the code -- and external
> format specifier handlers can, too. Would we need to allow two levels of
> recursion (printk->vsprintf->printk->???->printk) or is one just enough?
>
> It also would make sense to add a lost-messages counter to the per-CPU
> recursion counter struct, to count the number of times we bail out
> of printk due to the recursion limit. So that we'll at least have
> "%d recursive printk messages lost" hints.
We do not need such a counter to be per-cpu. A global atomic_long_t
would suffice. Although I am not sure if that should be separate from
the @fail member of the ringbuffer.
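E.g. (sketch):

	static atomic_long_t printk_recursion_lost;

	/* in printk_enter(), when bailing out: */
	if (*count > 1) {
		atomic_long_inc(&printk_recursion_lost);
		local_irq_restore(*flags);
		return false;
	}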
> Overall...
> I wonder where the "limit printk recursion" requirement comes from. printk_safe
> doesn't impose any strict limits (printk_context is limited, but still)
> and we've been running it for years now; have we ever seen any reports
> of printk recursion overflows?
The printk code is undergoing some major changes and we have already had
bugs slip into the merge window. IMHO the additional code to track the
recursion does not add significant complexity or runtime overhead. I
would prefer to keep it until we are finished with this major rework.
John Ogness
On 2020-12-05, Sergey Senozhatsky <[email protected]> wrote:
>> One reason is the use of per-cpu variables. Alternative solution would
>> be to store printk_context into task_struct.
>
> We can keep per-CPU, disable preemption and have counters for
> every context (task, soft/hard irq, NMI). Shouldn't be a problem
These contexts are difficult to track correctly for all the preemption
models (especially when PREEMPT_RT is included). But I will look into
this idea. It would be nice to keep interrupts enabled for that first
vsnprintf() in vprintk_store().
John Ogness
On 2020-12-04, Petr Mladek <[email protected]> wrote:
>> + /* Strip kernel syslog prefix. */
>
> Syslog actually uses the <level> format. We are skipping the log level
> and control flags here.
OK.
>> + if (facility == 0) {
>> + while (text_len >= 2 && printk_get_level(text)) {
>> + text_len -= 2;
>> + text += 2;
>> + }
>
> We should avoid two completely different approaches
> that handle printk_level prefix.
>
> One solution is to implement something like:
>
> static char *parse_prefix(text, &level, &flags)
>
> That would return pointer to the text after the prefix.
> And fill level and flags only when non-NULL pointers are passed.
OK.
> Another solution would be to pass this information from
> vprintk_store(). The prefix has already been parsed
> after all.
Well, there is a vscnprintf() that happens in between and I don't think
we should trust the parsed offset from the first vsnprintf().
>> +
>> + if (text != orig_text)
>> + memmove(orig_text, text, text_len);
>> + }
>
> We should clear the freed space to make the ring buffer as
> human readable as possible when someone just dumps the memory.
Data blocks are currently padded and that padding is not cleared. So it
is already not perfectly human readable on a raw dump.
> Sigh, I have to admit that I missed the problem with prefix and
> trailing '\n' when I suggested to avoid the temporary buffers.
> This memmove() and the space wasting is a pity.
>
> Well, it is typically 3 bytes per message. And the copying would
> be necessary even with the temporary buffer. So, I am less convinced
> but I would still try to avoid the temporary buffers for now.
Agreed. I think this approach is better than the temporary buffers I
previously used. Also, if we add a trimming feature to the ringbuffer,
it will keep the ringbuffer mostly clean anyway. Something like this:
prb_rec_init_wr(&r, text_len);
prb_reserve(&e, prb, &r);
text_len = printk_sprint(&r.text_buf[0], text_len, ...);
r.info->text_len = text_len;
prb_trim_rec(&e, &r); <--- try to reduce datablock size to @text_len
prb_commit(&e);
I see no urgency to add such a feature. But I think we should keep it on
our radar.
John Ogness
On Sat 2020-12-05 13:39:53, Sergey Senozhatsky wrote:
> On (20/12/04 17:19), Petr Mladek wrote:
> [..]
> > > --- a/kernel/printk/printk.c
> > > +++ b/kernel/printk/printk.c
> > > @@ -2432,7 +2490,6 @@ void console_unlock(void)
> > > size_t len;
> > >
> > > printk_safe_enter_irqsave(flags);
> >
> > Why do we actually need to use the printk_safe context here?
> > There is no longer a risk of deadlock caused by logbuf_lock.
> > All other recursions should be prevented by console_trylock()
> > in printk().
>
> All semaphore functions, including down_trylock(), acquire the
> semaphore spin_lock;
This has a very easy solution. The patch adds code that counts
the recursion level. We just must not call console_trylock() from
inside a recursive printk.
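I.e. roughly (sketch; printk_in_recursion() is a hypothetical helper
built on top of the new recursion counter):

	/* in vprintk_emit(): */
	if (!in_sched && !printk_in_recursion()) {
		if (console_trylock_spinning())
			console_unlock();
	}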
printk_safe() has two functions:

  1. It allows storing messages in a lockless way. This is obsoleted
     by the lockless ringbuffer.

  2. It prevents calling consoles. We could move this check
     into vprintk_store(). We already have a similar check
     there for printk_deferred().
> and some then call into the scheduler
> (or other kernel core functions) under the semaphore's spin_lock.
> For instance
>
> up()
>  raw_spin_lock_irqsave(&sem->lock)
>  __up()
>   wake_up_process()
>    try_to_wake_up()
This problem is partly solved by printk_deferred(). In any
case, printk_safe() does not help here.
I still do _not_ see a reason to keep printk_safe()!
Best Regards,
Petr
On Sun 2020-12-06 21:57:46, John Ogness wrote:
> On 2020-12-04, Petr Mladek <[email protected]> wrote:
> >> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> >> index e9018c4e1b66..7385101210be 100644
> >> --- a/kernel/printk/printk.c
> >> +++ b/kernel/printk/printk.c
> >> @@ -785,7 +749,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
> >> if (offset)
> >> return -ESPIPE;
> >>
> >> - logbuf_lock_irq();
> >
> > user->seq manipulation is no longer safe from the atomicity point of
> > view.
> >
> > One solution would be to use atomic variable in struct devkmsg_user().
> > Another solution would be to synchronize it with user->lock like we do
> > in devkmsg_read().
> >
> > user->lock looks like overhead. But it would actually make sense to
> > prevent a seek in the middle of a read.
>
> I would prefer using atomic64_t. Using user->lock could introduce some
> wacky regression.
OK, fair enough. User space might do crazy stuff.
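Something like this then (a sketch):

        /* In struct devkmsg_user: */
        atomic64_t seq;

        /* And devkmsg_llseek() can update it without any lock: */
        case SEEK_SET:
                /* the first record */
                atomic64_set(&user->seq, prb_first_valid_seq(prb));
                break;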
Best Regards,
Petr
On Sun 2020-12-06 22:12:21, John Ogness wrote:
> On 2020-12-04, Petr Mladek <[email protected]> wrote:
> > On Tue 2020-12-01 21:59:41, John Ogness wrote:
> >> Since the ringbuffer is lockless, there is no need for it to be
> >> protected by @logbuf_lock. Remove @logbuf_lock.
> >>
> >> --- a/kernel/printk/printk.c
> >> +++ b/kernel/printk/printk.c
> >> @@ -2809,11 +2856,7 @@ void register_console(struct console *newcon)
> >> nr_ext_console_drivers++;
> >>
> >> if (newcon->flags & CON_PRINTBUFFER) {
> >> - /*
> >> - * console_unlock(); will print out the buffered messages
> >> - * for us.
> >> - */
> >> - logbuf_lock_irqsave(flags);
> >> + spin_lock_irqsave(&syslog_lock, flags);
> >
> > We should take the lock only around assigning syslog_seq. And add a
> > comment that it guarantees atomic update.
>
> OK. So you just want "exclusive_console = newcon;" moved outside the
> critical section.
Exactly. I would like to make it clear that it synchronizes only the
single assignment. Otherwise, people might get the wrong impression.
I know that there is a comment describing the scope of each lock.
But people might miss it or not search for it at all.
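I mean something like this (a sketch):

        if (newcon->flags & CON_PRINTBUFFER) {
                exclusive_console = newcon;
                exclusive_console_stop_seq = console_seq;

                /* Get a consistent copy of @syslog_seq. */
                spin_lock_irqsave(&syslog_lock, flags);
                console_seq = syslog_seq;
                spin_unlock_irqrestore(&syslog_lock, flags);
        }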
Best Regards,
Petr
On Sun 2020-12-06 22:50:54, John Ogness wrote:
> On 2020-12-04, Petr Mladek <[email protected]> wrote:
> > On Tue 2020-12-01 21:59:41, John Ogness wrote:
> >> Since the ringbuffer is lockless, there is no need for it to be
> >> protected by @logbuf_lock. Remove @logbuf_lock.
> >>
> >> --- a/kernel/printk/printk.c
> >> +++ b/kernel/printk/printk.c
> > What is the exact reason to disable interrupts around the entire
> > vprintk_store(), please? It should get documented.
>
> It simplifies the context tracking. Also, in mainline interrupts are
> already disabled for all of vprintk_store(). AFAIK latencies due to
> logbuf_lock contention were not an issue.
>
> I really don't want to touch task_struct. IMHO the usefulness of that
> struct is limited, considering that printk can be called from scheduling
> and interrupt contexts.
Fair enough. I am fine with the per-CPU variables and the disabled
interrupts around vprintk_store().
Note: We should also prevent calling console_trylock() for recursive
messages to avoid an infinite loop or even a deadlock in this part of the code.
> >> +
> >> +struct printk_recursion {
> >> + char count[NUM_RECURSION_CTX];
> >> +};
> >>
> >> +static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion);
> >> +static char printk_recursion_count[NUM_RECURSION_CTX];
> >
> > This is pretty confusing. The array is hidden in a struct when per-cpu
> > variables are used. And a naked array is used for early boot.
> >
> > Is the structure really needed? What about?
> >
> > static DEFINE_PER_CPU(char [NUM_RECURSION_CTX], printk_count);
> > static char printk_count_early[NUM_RECURSION_CTX];
>
> OK.
>
> >> +
> >> +static char *get_printk_count(void)
> >> +{
> >> + struct printk_recursion *rec;
> >> + char *count;
> >> +
> >> + if (!printk_percpu_data_ready()) {
> >> + count = &printk_recursion_count[0];
> >
> > I see why you avoided per-cpu variables for early boot. I am just
> > curious how the printk_context variable works these days. It is used by
> > every printk(), including early code; see vprintk_func().
>
> IMO printk_context serves a different purpose. With the existence of
> logbuf_lock, printk_context exists for the sole purpose of making sure
> logbuf_lock is not taken recursively or that the CPU does not spin on it
> in NMI context. printk_context is simply gating calls to the safe
> buffers.
Well, both printk_count and printk_context are able to count recursion
in different contexts. Both are used to decide how printk() will
behave...
Anyway, there is no need to fight over words. You write below that
the plan is to remove printk_safe, including printk_context. That will
solve my problem.
I am fine with having both of them for some transition period. I guess
it will make our lives easier from a coding and review point of view.
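For completeness, the flat-array variant could look like this (a sketch;
printk_percpu_data_ready() is the existing helper, and I assume the NMI
counter sits in the second slot):

        static DEFINE_PER_CPU(char [NUM_RECURSION_CTX], printk_count);
        static char printk_count_early[NUM_RECURSION_CTX];

        static char *get_printk_count(void)
        {
                char *count;

                if (printk_percpu_data_ready())
                        count = *this_cpu_ptr(&printk_count);
                else
                        count = printk_count_early;

                /* Each context owns one slot; NMI uses the second one. */
                if (in_nmi())
                        count++;

                return count;
        }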
> >> +static bool printk_enter(unsigned long *flags)
> >> +{
> >> + char *count;
> >> +
> >> + local_irq_save(*flags);
> >> + count = get_printk_count();
> >> + /* Only 1 level of recursion allowed. */
> >
> > We should allow at least some level of recursion. Otherwise, we would
> > not see warnings printed from vsprintf code.
>
> With 1 level, you will see warnings from vsprintf code. I'm not sure it
> makes sense to allow more than 1 level. It causes exponential logging.
Shame on me. I missed that 1 level was actually enabled.
Anyway, I would like to allow at least 2 levels of recursion. Something
like 5 levels would be even better.
We need to know when there is a problem storing the recursive/nested
message. And I would rather see the same message repeated 3 times than
not see it at all.
Note that the risk of infinite recursion is pretty low. We have most of
the code called from vprintk_emit() under control. There are many
printk_once() or WARN_ONCE() calls. The error messages use rather simple
and common formatting, so the risk of recursive errors in the
vsprintf() code is low.
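To be concrete, the limit could look like this (a sketch;
PRINTK_MAX_RECURSION is a made-up name and the exact value is up for
discussion):

        /* The initial call plus up to 2 nested levels. */
        #define PRINTK_MAX_RECURSION 3

        static bool printk_enter(unsigned long *flags)
        {
                char *count;

                local_irq_save(*flags);
                count = get_printk_count();

                /* Refuse to store the message once the limit is hit. */
                if (*count >= PRINTK_MAX_RECURSION) {
                        local_irq_restore(*flags);
                        return false;
                }
                (*count)++;

                return true;
        }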
> > Or is there any plan to remove printk_safe and printk_context?
I am relieved to read this.
Do not take me wrong: printk_safe() was a really great idea and served
its purpose well. But it is yet another piece of tricky lockless code.
There is additional complexity in flushing the temporary buffers and
handling panic(). It is nice that we can remove some of this complexity.
Best Regards,
Petr
On Sun 2020-12-06 23:36:53, John Ogness wrote:
> On 2020-12-04, Petr Mladek <[email protected]> wrote:
> >> + if (facility == 0) {
> >> + while (text_len >= 2 && printk_get_level(text)) {
> >> + text_len -= 2;
> >> + text += 2;
> >> + }
> >
> > We should avoid two completely different approaches
> > that handle printk_level prefix.
> >
> > One solution is to implement something like:
> >
> > static char *parse_prefix(text, &level, &flags)
> >
> > That would return pointer to the text after the prefix.
> > And fill level and flags only when non-NULL pointers are passed.
>
> OK.
>
> > Another solution would be to pass this information from
> > vprintk_store(). The prefix has already been parsed
> > after all.
>
> Well, there is a vscnprintf() that happens in between and I don't think
> we should trust the parsed offset from the first vsnprintf().
Good point!
> >> +
> >> + if (text != orig_text)
> >> + memmove(orig_text, text, text_len);
> >> + }
> >
> > We should clear the freed space to make the ring buffer as
> > human readable as possible when someone just dumps the memory.
>
> Data blocks are currently padded and that padding is not cleared. So it
> is already not perfectly human readable on a raw dump.
It would be nice to clean up the padding as well. But it is a cosmetic
improvement that might be done anytime later.
> > Sigh, I have to admit that I missed the problem with the prefix and
> > trailing '\n' when I suggested avoiding the temporary buffers.
> > This memmove() and the wasted space are a pity.
> >
> > Well, it is typically 3 bytes per message. And the copying would
> > be necessary even with the temporary buffer. So, I am less convinced
> > but I would still try to avoid the temporary buffers for now.
>
> Agreed. I think this approach is better than the temporary buffers I
> previously used.
Another motivation is that it makes handling recursion/nesting simple.
Otherwise, we would need temporary buffers for each allowed recursion
level, or some tricky code.
> Also, if we add a trimming feature to the ringbuffer,
> it will keep the ringbuffer mostly clean anyway. Something like this:
>
> prb_rec_init_wr(&r, text_len);
> prb_reserve(&e, prb, &r);
> text_len = printk_sprint(&r.text_buf[0], text_len, ...);
> r.info->text_len = text_len;
> prb_trim_rec(&e, &r); <--- try to reduce datablock size to @text_len
> prb_commit(&e);
>
> I see no urgency to add such a feature. But I think we should keep it on
> our radar.
Yup. I thought about it as well. I agree that it is not a priority.
Best Regards,
Petr
On (20/12/07 10:50), Petr Mladek wrote:
[..]
> > and then some of them call into the scheduler
> > (or other kernel core functions) under the semaphore's spin_lock.
> > For instance
> >
> > up()
> > raw_spin_lock_irqsave(&sem->lock)
> > __up()
> > wake_up_process()
> > try_to_wake_up()
>
> This problem is partly solved by printk_deferred(). In each
> case, printk_safe() does not help here.
printk_deferred() has never been used in all the critical code paths.
So I think printk_safe does help here; it takes care of all the
remaining cases that are not "partly solved by printk_deferred()".
> I still do _not_ see a reason to keep printk_safe()!
Not sure I'm following, sorry. To put it simply: we had better keep
printk_safe until "new recursion prevention" >= printk_safe().
In this patch set, "new recursion prevention" < printk_safe().
-ss