2015-06-10 12:55:22

by Peter Zijlstra

Subject: [RFC][PATCH] printk: Fixup the nmi printk mess

Hi,

I just stumbled upon:

a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs")

Which is not dissimilar to what I've proposed in the past. Except it's
squirreled away in some far and dark corner of one arch.

Let's fix that.

---
arch/Kconfig | 4 ++
arch/arm/Kconfig | 1 +
arch/avr32/Kconfig | 1 +
arch/blackfin/Kconfig | 1 +
arch/cris/Kconfig | 1 +
arch/powerpc/Kconfig | 1 +
arch/s390/Kconfig | 1 +
arch/sh/Kconfig | 1 +
arch/sparc/Kconfig | 1 +
arch/tile/Kconfig | 1 +
arch/x86/Kconfig | 1 +
arch/x86/kernel/apic/hw_nmi.c | 86 +----------------------------
include/linux/hardirq.h | 2 +
include/linux/percpu.h | 3 -
include/linux/printk.h | 10 +++-
init/Kconfig | 5 ++
init/main.c | 1 +
kernel/printk/printk.c | 124 ++++++++++++++++++++++++++++++++++++++----
18 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997..ad8db3979e21 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -179,7 +179,11 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool

+config HAVE_NMI
+ bool
+
config HAVE_NMI_WATCHDOG
+ depends on HAVE_NMI
bool
#
# An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..a26e83699df9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -61,6 +61,7 @@ config ARM
select HAVE_KRETPROBES if (HAVE_KPROBES)
select HAVE_MEMBLOCK
select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
+ select HAVE_NMI if (!CPU_V7)
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_OPTPROBES if !THUMB2_KERNEL
select HAVE_PERF_EVENTS
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b6878eb64884..9dc3e2b1180b 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -17,6 +17,7 @@ config AVR32
select GENERIC_CLOCKEVENTS
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
+ select HAVE_NMI
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index af76634f8d98..47c0a55acafd 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_NMI

config GENERIC_CSUM
def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 0314e325a669..2e053f709c4b 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -58,6 +58,7 @@ config CRIS
select CLKSRC_MMIO if ETRAX_ARCH_V32
select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+ select HAVE_NMI

config HZ
int
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..079f4fec9586 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,7 @@ config PPC
select NO_BOOTMEM
select HAVE_GENERIC_RCU_GUP
select HAVE_PERF_EVENTS_NMI if PPC64
+ select HAVE_NMI if PERF_EVENTS

config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b06dc3839268..1b84036437c0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -152,6 +152,7 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select HAVE_NMI

config SCHED_OMIT_FRAME_POINTER
def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..7803b6863c0e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
select OLD_SIGSUSPEND
select OLD_SIGACTION
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_NMI
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e49502acbab4..198abcb6e510 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -80,6 +80,7 @@ config SPARC64
select NO_BOOTMEM
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_NMI

config ARCH_DEFCONFIG
string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index a07e31b50d3f..1503ab5882cd 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -28,6 +28,7 @@ config TILE
select HAVE_DEBUG_STACKOVERFLOW
select ARCH_WANT_FRAME_POINTERS
select HAVE_CONTEXT_TRACKING
+ select HAVE_NMI if USE_PMC

# FIXME: investigate whether we need/want these options.
# select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 228aa35d7e89..67dbc7442499 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -118,6 +118,7 @@ config X86
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_NMI
select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..90eb8c385e60 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include <linux/seq_buf.h>

#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -30,35 +29,14 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);

/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;

-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
-}
-
void arch_trigger_all_cpu_backtrace(bool include_self)
{
- struct nmi_seq_buf *s;
- int len;
- int cpu;
- int i;
int this_cpu = get_cpu();
+ int i;

if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -73,16 +51,6 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
if (!include_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));

- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
@@ -97,58 +65,11 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
touch_softlockup_watchdog();
}

- /*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
- */
- for_each_cpu(cpu, &printtrace_mask) {
- int last_i = 0;
-
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
-
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
put_cpu();
}

-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
-}
-
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
@@ -157,13 +78,8 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
cpu = smp_processor_id();

if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
- this_cpu_write(printk_func, printk_func_save);

cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return NMI_HANDLED;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index dfd59d6bc6f0..a477e0766d2e 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -67,10 +67,12 @@ extern void irq_exit(void);
preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
rcu_nmi_enter(); \
trace_hardirq_enter(); \
+ printk_nmi_enter(); \
} while (0)

#define nmi_exit() \
do { \
+ printk_nmi_exit(); \
trace_hardirq_exit(); \
rcu_nmi_exit(); \
BUG_ON(!in_nmi()); \
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index caebf2a758dc..04c68b9f56f8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))

-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c9149..ebe017327e6e 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -120,7 +120,15 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif

-typedef int(*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+#else
+static inline void printk_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+#endif /* HAVE_NMI */

#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index b999fa381bf9..fe9a439c8843 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1431,6 +1431,11 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.

+config PRINTK_NMI
+ def_bool y
+ depends on PRINTK
+ depends on HAVE_NMI
+
config BUG
bool "BUG() support" if EXPERT
default y
diff --git a/init/main.c b/init/main.c
index 2115055faeac..4a28accaaa98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
timekeeping_init();
time_init();
sched_clock_postinit();
+ printk_init();
perf_event_init();
profile_init();
call_function_init();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..0149c41be95b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1821,13 +1821,125 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);

+#ifdef CONFIG_PRINTK_NMI
+
+typedef int(*printk_func_t)(const char *fmt, va_list args);
/*
* This allows printk to be diverted to another function per cpu.
* This is useful for calling printk functions from within NMI
* without worrying about race conditions that can lock up the
* box.
*/
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
+#include <linux/seq_buf.h>
+
+struct nmi_seq_buf {
+ struct seq_buf seq;
+ struct irq_work work;
+ unsigned char buffer[PAGE_SIZE -
+ sizeof(struct seq_buf) -
+ sizeof(struct irq_work)];
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk("%.*s", (end - start) + 1, buf);
+}
+
+static void __printk_nmi_flush(struct irq_work *work)
+{
+ struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ int len, last_i = 0, i = 0;
+
+again:
+ len = seq_buf_used(&s->seq);
+ if (!len)
+ return;
+
+ /* Print line by line. */
+ for (; i < len; i++) {
+ if (s->buffer[i] == '\n') {
+ print_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < len) {
+ print_seq_line(s, last_i, len - 1);
+ pr_cont("\n");
+ }
+
+ /*
+ * Another NMI could have come in while we were printing
+ * check if nothing has been added to the buffer.
+ */
+ if (cmpxchg_local(&s->seq.len, len, 0) != len)
+ goto again;
+}
+
+void printk_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+ init_irq_work(&s->work, __printk_nmi_flush);
+ seq_buf_init(&s->seq, s->buffer, sizeof(s->buffer));
+ }
+}
+
+/*
+ * It is not safe to call printk() directly from NMI handlers.
+ * It may be fine if the NMI detected a lock up and we have no choice
+ * but to do so, but doing a NMI on all other CPUs to get a back trace
+ * can be done with a sysrq-l. We don't want that to lock up, which
+ * can happen if the NMI interrupts a printk in progress.
+ *
+ * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
+ * the content into a per cpu seq_buf buffer. Then when the NMIs are
+ * all done, we can safely dump the contents of the seq_buf to a printk()
+ * from a non NMI context.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ unsigned int len = seq_buf_used(&s->seq);
+
+ irq_work_queue(&s->work);
+ seq_buf_vprintf(&s->seq, fmt, args);
+ return seq_buf_used(&s->seq) - len;
+}
+
+void printk_nmi_enter(void)
+{
+ this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+ this_cpu_write(printk_func, vprintk_default);
+}
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_default(fmt, args);
+}
+
+#endif /* PRINTK_NMI */

/**
* printk - print a kernel message
@@ -1852,21 +1964,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;

va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
-
va_end(args);

return r;
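
With this, any NMI handler can simply call printk() and have the output
land in the per-cpu buffer; e.g. a hypothetical handler (illustration
only, not part of the patch):

static int example_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	/*
	 * nmi_enter() has already switched printk_func to vprintk_nmi,
	 * so these calls fill nmi_print_seq and get flushed later from
	 * the irq_work, outside of NMI context.
	 */
	printk(KERN_WARNING "NMI on cpu %d\n", smp_processor_id());
	show_regs(regs);

	return NMI_HANDLED;
}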


2015-06-10 14:32:06

by Petr Mladek

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed 2015-06-10 14:55:09, Peter Zijlstra wrote:
> Hi,
>
> I just stumbled upon:
>
> a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs")
>
> Which is not dissimilar to what I've proposed in the past. Except it's
> squirreled away in some far and dark corner of one arch.
>
> Let's fix that.

It is a very interesting approach. But I see two possible races, see below.

> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index dfd59d6bc6f0..a477e0766d2e 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -67,10 +67,12 @@ extern void irq_exit(void);
> preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
> rcu_nmi_enter(); \
> trace_hardirq_enter(); \
> + printk_nmi_enter(); \
> } while (0)
>
> #define nmi_exit() \
> do { \
> + printk_nmi_exit(); \
> trace_hardirq_exit(); \
> rcu_nmi_exit(); \
> BUG_ON(!in_nmi()); \
> diff --git a/include/linux/percpu.h b/include/linux/percpu.h
> index caebf2a758dc..04c68b9f56f8 100644
> --- a/include/linux/percpu.h
> +++ b/include/linux/percpu.h
> @@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
> (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
> __alignof__(type))
>
> -/* To avoid include hell, as printk can not declare this, we declare it here */
> -DECLARE_PER_CPU(printk_func_t, printk_func);
> -
> #endif /* __LINUX_PERCPU_H */
> diff --git a/init/main.c b/init/main.c
> index 2115055faeac..4a28accaaa98 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
> timekeeping_init();
> time_init();
> sched_clock_postinit();
> + printk_init();
> perf_event_init();
> profile_init();
> call_function_init();
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index c099b082cd02..0149c41be95b 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1821,13 +1821,125 @@ int vprintk_default(const char *fmt, va_list args)
> }
> EXPORT_SYMBOL_GPL(vprintk_default);
>
> +#ifdef CONFIG_PRINTK_NMI
> +
> +typedef int(*printk_func_t)(const char *fmt, va_list args);
> /*
> * This allows printk to be diverted to another function per cpu.
> * This is useful for calling printk functions from within NMI
> * without worrying about race conditions that can lock up the
> * box.
> */
> -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
> +static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
> +
> +#include <linux/seq_buf.h>
> +
> +struct nmi_seq_buf {
> + struct seq_buf seq;
> + struct irq_work work;
> + unsigned char buffer[PAGE_SIZE -
> + sizeof(struct seq_buf) -
> + sizeof(struct irq_work)];
> +};
> +
> +/* Safe printing in NMI context */
> +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
> +
> +static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
> +{
> + const char *buf = s->buffer + start;
> +
> + printk("%.*s", (end - start) + 1, buf);
> +}
> +
> +static void __printk_nmi_flush(struct irq_work *work)
> +{
> + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
> + int len, last_i = 0, i = 0;
> +
> +again:
> + len = seq_buf_used(&s->seq);
> + if (!len)
> + return;
> +
> + /* Print line by line. */
> + for (; i < len; i++) {
> + if (s->buffer[i] == '\n') {
> + print_seq_line(s, last_i, i);
> + last_i = i + 1;
> + }
> + }
> + /* Check if there was a partial line. */
> + if (last_i < len) {
> + print_seq_line(s, last_i, len - 1);
> + pr_cont("\n");
> + }
> +
> + /*
> + * Another NMI could have come in while we were printing
> + * check if nothing has been added to the buffer.
> + */
> + if (cmpxchg_local(&s->seq.len, len, 0) != len)
> + goto again;

If another NMI comes at this point, it will start filling the buffer
from the beginning. If it is fast enough, it might overwrite the text
that we printed above.

Note that it is easy to be faster because the normal printk() is more
complex and might wait for the console.

Well, it is not that critical. It will just print a mess.

> +}
> +
> +void printk_init(void)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
> +
> + init_irq_work(&s->work, __printk_nmi_flush);
> + seq_buf_init(&s->seq, s->buffer, sizeof(s->buffer));
> + }
> +}
> +
> +/*
> + * It is not safe to call printk() directly from NMI handlers.
> + * It may be fine if the NMI detected a lock up and we have no choice
> + * but to do so, but doing a NMI on all other CPUs to get a back trace
> + * can be done with a sysrq-l. We don't want that to lock up, which
> + * can happen if the NMI interrupts a printk in progress.
> + *
> + * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
> + * the content into a per cpu seq_buf buffer. Then when the NMIs are
> + * all done, we can safely dump the contents of the seq_buf to a printk()
> + * from a non NMI context.
> + */
> +static int vprintk_nmi(const char *fmt, va_list args)
> +{
> + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
> + unsigned int len = seq_buf_used(&s->seq);
> +
> + irq_work_queue(&s->work);
> + seq_buf_vprintf(&s->seq, fmt, args);

This is more critical. seq_buf_vprintf() has the following code:

len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args);

The race might look like:

CPU0                                CPU1

__printk_nmi_flush()
  ...
                                    seq_buf_vprintf()

                                      s->buffer + s->len

  cmpxchg_local(&s->seq.len,
                len, 0)

                                      s->size - s->len

=> that seq_buf_vprintf() will start writing to the end of the seq_buf
but it will think that the entire buffer is available, which might
cause a buffer overflow.


Well, it is rather theoretical. s->len will most likely be read only
once (compiler optimization). Anyway, we must make sure that
all the used seq_buf operations are safe against such races.
It is not obvious and might be prone to regressions.

To be honest, I am not familiar with cmpxchg_local(). But I think that
it can't make much difference. The value has to be synced to the other
CPUs one day.


Note that Steven's solution is safe because the prints are serialized via
@backtrace_flag in arch_trigger_all_cpu_backtrace().


Best Regards,
Petr

2015-06-10 14:57:47

by Petr Mladek

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed 2015-06-10 16:31:55, Petr Mladek wrote:
> On Wed 2015-06-10 14:55:09, Peter Zijlstra wrote:
> > Hi,
> >
> > I just stumbled upon:
> >
> > a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs")
> >
> > Which is not dissimilar to what I've proposed in the past. Except it's
> > squirreled away in some far and dark corner of one arch.
> >
> > Let's fix that.
>
> It is a very interesting approach. But I see two possible races, see below.
>
> > diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> > index dfd59d6bc6f0..a477e0766d2e 100644
> > --- a/include/linux/hardirq.h
> > +++ b/include/linux/hardirq.h
> > @@ -67,10 +67,12 @@ extern void irq_exit(void);
> > preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
> > rcu_nmi_enter(); \
> > trace_hardirq_enter(); \
> > + printk_nmi_enter(); \
> > } while (0)
> >
> > #define nmi_exit() \
> > do { \
> > + printk_nmi_exit(); \
> > trace_hardirq_exit(); \
> > rcu_nmi_exit(); \
> > BUG_ON(!in_nmi()); \
> > diff --git a/include/linux/percpu.h b/include/linux/percpu.h
> > index caebf2a758dc..04c68b9f56f8 100644
> > --- a/include/linux/percpu.h
> > +++ b/include/linux/percpu.h
> > @@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
> > (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
> > __alignof__(type))
> >
> > -/* To avoid include hell, as printk can not declare this, we declare it here */
> > -DECLARE_PER_CPU(printk_func_t, printk_func);
> > -
> > #endif /* __LINUX_PERCPU_H */
> > diff --git a/init/main.c b/init/main.c
> > index 2115055faeac..4a28accaaa98 100644
> > --- a/init/main.c
> > +++ b/init/main.c
> > @@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
> > timekeeping_init();
> > time_init();
> > sched_clock_postinit();
> > + printk_init();
> > perf_event_init();
> > profile_init();
> > call_function_init();
> > diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> > index c099b082cd02..0149c41be95b 100644
> > --- a/kernel/printk/printk.c
> > +++ b/kernel/printk/printk.c
> > @@ -1821,13 +1821,125 @@ int vprintk_default(const char *fmt, va_list args)
> > }
> > EXPORT_SYMBOL_GPL(vprintk_default);
> >
> > +#ifdef CONFIG_PRINTK_NMI
> > +
> > +typedef int(*printk_func_t)(const char *fmt, va_list args);
> > /*
> > * This allows printk to be diverted to another function per cpu.
> > * This is useful for calling printk functions from within NMI
> > * without worrying about race conditions that can lock up the
> > * box.
> > */
> > -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
> > +static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
> > +
> > +#include <linux/seq_buf.h>
> > +
> > +struct nmi_seq_buf {
> > + struct seq_buf seq;
> > + struct irq_work work;
> > + unsigned char buffer[PAGE_SIZE -
> > + sizeof(struct seq_buf) -
> > + sizeof(struct irq_work)];
> > +};
> > +
> > +/* Safe printing in NMI context */
> > +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
> > +
> > +static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
> > +{
> > + const char *buf = s->buffer + start;
> > +
> > + printk("%.*s", (end - start) + 1, buf);
> > +}
> > +
> > +static void __printk_nmi_flush(struct irq_work *work)
> > +{
> > + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
> > + int len, last_i = 0, i = 0;
> > +
> > +again:
> > + len = seq_buf_used(&s->seq);
> > + if (!len)
> > + return;
> > +
> > + /* Print line by line. */
> > + for (; i < len; i++) {
> > + if (s->buffer[i] == '\n') {
> > + print_seq_line(s, last_i, i);
> > + last_i = i + 1;
> > + }
> > + }
> > + /* Check if there was a partial line. */
> > + if (last_i < len) {
> > + print_seq_line(s, last_i, len - 1);
> > + pr_cont("\n");
> > + }
> > +
> > + /*
> > + * Another NMI could have come in while we were printing
> > + * check if nothing has been added to the buffer.
> > + */
> > + if (cmpxchg_local(&s->seq.len, len, 0) != len)
> > + goto again;
>
> If another NMI comes at this point, it will start filling the buffer
> from the beginning. If it is fast enough, it might overwrite the text
> that we printed above.
>
> Note that it is easy to be faster because the normal printk() is more
> complex and might wait for the console.

Also note that show_regs() calls many separate printk()s, and the
irq_work is scheduled by the first one => it is quite likely that some
backtraces will get messed up.


Another problem is that __printk_nmi_flush() is per-CPU => more
instances might be running in parallel. I haven't tested this but
I guess that it will mix backtraces from different CPUs in
the main ring buffer.

Note that the backtraces used to be serialized via
static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED
in the past. See the commit
a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs")


A solution would be to block __printk_nmi_flush() until
printk_nmi_exit() is called via some bit and cmpxchg.

Also we could add one more bit and vprintk_nmi() could drop messages
until the buffer is printed to avoid any mess.
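
Something like this completely untested sketch (the flag bits and the
s->flags field are made up; they do not exist in the patch):

enum {
	NMI_PRINTK_WRITING,	/* set between printk_nmi_enter()/_exit() */
	NMI_PRINTK_FLUSHING,	/* buffer content waits to be printed */
};

static int vprintk_nmi(const char *fmt, va_list args)
{
	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
	unsigned int len = seq_buf_used(&s->seq);

	/* Drop messages until the queued content has been printed. */
	if (test_bit(NMI_PRINTK_FLUSHING, &s->flags))
		return 0;

	irq_work_queue(&s->work);
	seq_buf_vprintf(&s->seq, fmt, args);
	return seq_buf_used(&s->seq) - len;
}

static void __printk_nmi_flush(struct irq_work *work)
{
	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);

	/* Block until printk_nmi_exit(), which would re-queue the work. */
	if (test_bit(NMI_PRINTK_WRITING, &s->flags))
		return;

	set_bit(NMI_PRINTK_FLUSHING, &s->flags);
	/* ... print line by line and truncate the buffer, as in the patch ... */
	clear_bit(NMI_PRINTK_FLUSHING, &s->flags);
}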

Huh, it is getting more and more complicated.


Best Regards,
Petr

> > +}
> > +
> > +void printk_init(void)
> > +{
> > + int cpu;
> > +
> > + for_each_possible_cpu(cpu) {
> > + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
> > +
> > + init_irq_work(&s->work, __printk_nmi_flush);
> > + seq_buf_init(&s->seq, s->buffer, sizeof(s->buffer));
> > + }
> > +}
> > +
> > +/*
> > + * It is not safe to call printk() directly from NMI handlers.
> > + * It may be fine if the NMI detected a lock up and we have no choice
> > + * but to do so, but doing a NMI on all other CPUs to get a back trace
> > + * can be done with a sysrq-l. We don't want that to lock up, which
> > + * can happen if the NMI interrupts a printk in progress.
> > + *
> > + * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
> > + * the content into a per cpu seq_buf buffer. Then when the NMIs are
> > + * all done, we can safely dump the contents of the seq_buf to a printk()
> > + * from a non NMI context.
> > + */
> > +static int vprintk_nmi(const char *fmt, va_list args)
> > +{
> > + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
> > + unsigned int len = seq_buf_used(&s->seq);
> > +
> > + irq_work_queue(&s->work);
> > + seq_buf_vprintf(&s->seq, fmt, args);
>
> This is more critical. seq_buf_vprintf() has the following code:
>
> len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args);
>
> The race might look like:
>
> CPU0                                CPU1
>
> __printk_nmi_flush()
>   ...
>                                     seq_buf_vprintf()
>
>                                       s->buffer + s->len
>
>   cmpxchg_local(&s->seq.len,
>                 len, 0)
>
>                                       s->size - s->len
>
> => that seq_buf_vprintf() will start writing to the end of the seq_buf
> but it will think that the entire buffer is available, which might
> cause a buffer overflow.
>
>
> Well, it is rather theoretical. s->len will most likely be read only
> once (compiler optimization). Anyway, we must make sure that
> all the used seq_buf operations are safe against such races.
> It is not obvious and might be prone to regressions.
>
> To be honest, I am not familiar with cmpxchg_local(). But I think that
> it can't make much difference. The value has to be synced to the other
> CPUs one day.
>
>
> Note that Steven's solution is safe because the prints are serialized via
> @backtrace_flag in arch_trigger_all_cpu_backtrace().
>
>
> Best Regards,
> Petr

2015-06-10 15:36:45

by Peter Zijlstra

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, Jun 10, 2015 at 04:31:55PM +0200, Petr Mladek wrote:
> > +static void __printk_nmi_flush(struct irq_work *work)
> > +{
> > + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
> > + int len, last_i = 0, i = 0;
> > +
> > +again:
> > + len = seq_buf_used(&s->seq);
> > + if (!len)
> > + return;
> > +
> > + /* Print line by line. */
> > + for (; i < len; i++) {
> > + if (s->buffer[i] == '\n') {
> > + print_seq_line(s, last_i, i);
> > + last_i = i + 1;
> > + }
> > + }
> > + /* Check if there was a partial line. */
> > + if (last_i < len) {
> > + print_seq_line(s, last_i, len - 1);
> > + pr_cont("\n");
> > + }
> > +
> > + /*
> > + * Another NMI could have come in while we were printing
> > + * check if nothing has been added to the buffer.
> > + */
> > + if (cmpxchg_local(&s->seq.len, len, 0) != len)
> > + goto again;
>
> If another NMI comes at this point, it will start filling the buffer
> from the beginning. If it is fast enough, it might overwrite the text
> that we printed above.

How so? If the cmpxchg succeeded and len == 0, we flushed everything and
are done with it; if another NMI comes in and 'overwrites' it, that's
fine, right?

> > +static int vprintk_nmi(const char *fmt, va_list args)
> > +{
> > + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
> > + unsigned int len = seq_buf_used(&s->seq);
> > +
> > + irq_work_queue(&s->work);
> > + seq_buf_vprintf(&s->seq, fmt, args);
>
> This is more critical. seq_buf_vprintf() has the following code:
>
> len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args);
>
> The race might look like:
>
> CPU0                                CPU1
>

No, everything is strictly per cpu.

> To be honest, I am not familiar with cmpxchg_local(). But I think that
> it can't make much difference. The value has to be synced to the other
> CPUs one day.

Nope..

2015-06-10 15:33:09

by Peter Zijlstra

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess


You are aware that you can delete bits of the email that are not
relevant, right?

On Wed, Jun 10, 2015 at 04:57:37PM +0200, Petr Mladek wrote:

> Also note that show_regs() calls many separate printk()s, and the
> irq_work is scheduled by the first one => it is quite likely that some
> backtraces will get messed up.

The irq_work is only ever queued while we're inside an NMI, and the
irq_work will only ever execute once that NMI is done -- on that CPU.

How will that miss anything?

> Another problem is that __printk_nmi_flush() is per-CPU => more
> instances might be running in parallel. I haven't tested this but
> I guess that it will mix backtraces from different CPUs in
> the main ring buffer.

Correct, such is the life of printk().

> Note that the backtraces used to be serialized via
> static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED
> in the past. See the commit
> a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs")

We could easily add a static raw_spinlock_t to __printk_nmi_flush() and
serialize its invocations if people think that is important.
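
E.g. something along these lines (sketch only; the function body is
otherwise the same as in the patch):

static void __printk_nmi_flush(struct irq_work *work)
{
	static raw_spinlock_t lock = __RAW_SPIN_LOCK_INITIALIZER(lock);
	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
	int len, last_i = 0, i = 0;

again:
	len = seq_buf_used(&s->seq);
	if (!len)
		return;

	/* Serialize the actual printing to avoid interleaving lines. */
	raw_spin_lock(&lock);

	/* Print line by line. */
	for (; i < len; i++) {
		if (s->buffer[i] == '\n') {
			print_seq_line(s, last_i, i);
			last_i = i + 1;
		}
	}
	/* Check if there was a partial line. */
	if (last_i < len) {
		print_seq_line(s, last_i, len - 1);
		pr_cont("\n");
	}

	raw_spin_unlock(&lock);

	if (cmpxchg_local(&s->seq.len, len, 0) != len)
		goto again;
}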

2015-06-10 15:53:32

by Steven Rostedt

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, 10 Jun 2015 14:55:09 +0200
Peter Zijlstra <[email protected]> wrote:

> diff --git a/include/linux/printk.h b/include/linux/printk.h
> index 9b30871c9149..ebe017327e6e 100644
> --- a/include/linux/printk.h
> +++ b/include/linux/printk.h
> @@ -120,7 +120,15 @@ static inline __printf(1, 2) __cold
> void early_printk(const char *s, ...) { }
> #endif
>
> -typedef int(*printk_func_t)(const char *fmt, va_list args);
> +#ifdef CONFIG_PRINTK_NMI
> +extern void printk_init(void);
> +extern void printk_nmi_enter(void);
> +extern void printk_nmi_exit(void);
> +#else
> +static inline void printk_init(void) { }
> +static inline void printk_nmi_enter(void) { }
> +static inline void printk_nmi_exit(void) { }
> +#endif /* HAVE_NMI */

#endif /* PRINTK_NMI */

-- Steve

>
> #ifdef CONFIG_PRINTK
> asmlinkage __printf(5, 0)
> diff --git a/init/Kconfig b/init/Kconfig
> index b999fa381bf9..fe9a439c8843 100644


2015-06-10 15:54:35

by Steven Rostedt

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, 10 Jun 2015 17:32:53 +0200
Peter Zijlstra <[email protected]> wrote:

> > Note that the backtraces used to be serialized via
> > static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED
> > in the past. See the commit
> > a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs")
>
> We could easily add a static raw_spinlock_t to __printk_nmi_flush() and
> serialize its invocations if people think that is important.

Yes please. One thing about the original change is that it made the NMI
backtraces readable. I hated the interleaving of the backtraces, as it
made it almost impossible to debug.

-- Steve

2015-06-10 16:03:06

by Steven Rostedt

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, 10 Jun 2015 14:55:09 +0200
Peter Zijlstra <[email protected]> wrote:


> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1821,13 +1821,125 @@ int vprintk_default(const char *fmt, va_list args)
> }
> EXPORT_SYMBOL_GPL(vprintk_default);
>
> +#ifdef CONFIG_PRINTK_NMI
> +
> +typedef int(*printk_func_t)(const char *fmt, va_list args);
> /*
> * This allows printk to be diverted to another function per cpu.
> * This is useful for calling printk functions from within NMI
> * without worrying about race conditions that can lock up the
> * box.
> */
> -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
> +static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
> +
> +#include <linux/seq_buf.h>
> +
> +struct nmi_seq_buf {
> + struct seq_buf seq;
> + struct irq_work work;
> + unsigned char buffer[PAGE_SIZE -
> + sizeof(struct seq_buf) -
> + sizeof(struct irq_work)];
> +};
> +
> +/* Safe printing in NMI context */
> +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
> +
> +static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
> +{
> + const char *buf = s->buffer + start;
> +
> + printk("%.*s", (end - start) + 1, buf);
> +}
> +
> +static void __printk_nmi_flush(struct irq_work *work)
> +{
> + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
> + int len, last_i = 0, i = 0;
> +
> +again:
> + len = seq_buf_used(&s->seq);
> + if (!len)
> + return;
> +
> + /* Print line by line. */
> + for (; i < len; i++) {
> + if (s->buffer[i] == '\n') {
> + print_seq_line(s, last_i, i);
> + last_i = i + 1;
> + }
> + }
> + /* Check if there was a partial line. */
> + if (last_i < len) {
> + print_seq_line(s, last_i, len - 1);
> + pr_cont("\n");
> + }
> +
> + /*
> + * Another NMI could have come in while we were printing
> + * check if nothing has been added to the buffer.
> + */
> + if (cmpxchg_local(&s->seq.len, len, 0) != len)
> + goto again;
> +}
> +
> +void printk_init(void)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
> +
> + init_irq_work(&s->work, __printk_nmi_flush);
> + seq_buf_init(&s->seq, s->buffer, sizeof(s->buffer));
> + }
> +}
> +
> +/*
> + * It is not safe to call printk() directly from NMI handlers.
> + * It may be fine if the NMI detected a lock up and we have no choice
> + * but to do so, but doing a NMI on all other CPUs to get a back trace
> + * can be done with a sysrq-l. We don't want that to lock up, which
> + * can happen if the NMI interrupts a printk in progress.
> + *
> + * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
> + * the content into a per cpu seq_buf buffer. Then when the NMIs are
> + * all done, we can safely dump the contents of the seq_buf to a printk()
> + * from a non NMI context.
> + */
> +static int vprintk_nmi(const char *fmt, va_list args)
> +{
> + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
> + unsigned int len = seq_buf_used(&s->seq);
> +
> + irq_work_queue(&s->work);
> + seq_buf_vprintf(&s->seq, fmt, args);
> + return seq_buf_used(&s->seq) - len;
> +}
> +
> +void printk_nmi_enter(void)
> +{
> + this_cpu_write(printk_func, vprintk_nmi);
> +}
> +
> +void printk_nmi_exit(void)
> +{
> + this_cpu_write(printk_func, vprintk_default);
> +}
> +
> +static inline int vprintk_func(const char *fmt, va_list args)
> +{
> + return this_cpu_read(printk_func)(fmt, args);
> +}
> +
> +#else
> +
> +static inline int vprintk_func(const char *fmt, va_list args)
> +{
> + return vprintk_default(fmt, args);
> +}
> +
> +#endif /* PRINTK_NMI */

BTW, the printk.c file is getting rather big. Can we make a
kernel/printk/nmi.c file that does this work? We can add a local
printk_common.h that shares the global data structures, and this
would move most of the #ifdefs out of the C files.

obj-$(CONFIG_PRINTK_NMI) += nmi.o

And the header file could have:

#ifdef CONFIG_PRINTK_NMI
typedef int(*printk_func_t)(const char *fmt, va_list args);
DECLARE_PER_CPU(printk_func_t, printk_func);

static inline int vprintk_func(const char *fmt, va_list args)
{
return this_cpu_read(printk_func)(fmt, args);
}
#else
static inline int vprintk_func(const char *fmt, va_list args)
{
return vprintk_default(fmt, args);
}
#endif /* PRINTK_NMI */

-- Steve

>
> /**
> * printk - print a kernel message
> @@ -1852,21 +1964,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
> */
> asmlinkage __visible int printk(const char *fmt, ...)
> {
> - printk_func_t vprintk_func;
> va_list args;
> int r;
>
> va_start(args, fmt);
> -
> - /*
> - * If a caller overrides the per_cpu printk_func, then it needs
> - * to disable preemption when calling printk(). Otherwise
> - * the printk_func should be set to the default. No need to
> - * disable preemption here.
> - */
> - vprintk_func = this_cpu_read(printk_func);
> r = vprintk_func(fmt, args);
> -
> va_end(args);
>
> return r;

2015-06-10 16:19:27

by Joe Perches

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, 2015-06-10 at 12:02 -0400, Steven Rostedt wrote:

> BTW, the printk.c file is getting rather big.

No joke that.

> Can we make a
> kernel/printk/nmi.c file that does this work. We can add a local
> printk_common.h that can share the global data structures, and this
> would move most of the #ifdef out of the C files.

Yes please.

There are many things in printk that could be moved
into separate files.

The last time I tried this was before Kay's structured
printk logging stuff:

https://lkml.org/lkml/2012/10/24/748


2015-06-10 19:23:12

by Peter Zijlstra

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, Jun 10, 2015 at 05:29:17PM +0200, Peter Zijlstra wrote:
> No, everything is strictly per cpu.

Which has the problem that if the CPU is actually stuck with IRQs
disabled, the irq_work won't run and we'll miss the most important
backtrace.

Below is a version which does cross-CPU stuff to allow the
trigger_all*_cpu_backtrace() initiator to flush buffers on behalf of
other CPUs.

As a bonus it doesn't need that seq_buf stuff anymore ;-)

Compile tested only.

---
arch/Kconfig | 4 +
arch/arm/Kconfig | 1 +
arch/avr32/Kconfig | 1 +
arch/blackfin/Kconfig | 1 +
arch/cris/Kconfig | 1 +
arch/powerpc/Kconfig | 1 +
arch/s390/Kconfig | 1 +
arch/sh/Kconfig | 1 +
arch/sparc/Kconfig | 1 +
arch/tile/Kconfig | 1 +
arch/x86/Kconfig | 1 +
arch/x86/kernel/apic/hw_nmi.c | 89 ++-----------------
include/linux/hardirq.h | 2 +
include/linux/percpu.h | 3 -
include/linux/printk.h | 12 ++-
init/Kconfig | 5 ++
init/main.c | 1 +
kernel/printk/printk.c | 199 +++++++++++++++++++++++++++++++++++++++---
18 files changed, 226 insertions(+), 99 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997..ad8db3979e21 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -179,7 +179,11 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool

+config HAVE_NMI
+ bool
+
config HAVE_NMI_WATCHDOG
+ depends on HAVE_NMI
bool
#
# An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..a26e83699df9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -61,6 +61,7 @@ config ARM
select HAVE_KRETPROBES if (HAVE_KPROBES)
select HAVE_MEMBLOCK
select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
+ select HAVE_NMI if (!CPU_V7)
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_OPTPROBES if !THUMB2_KERNEL
select HAVE_PERF_EVENTS
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b6878eb64884..9dc3e2b1180b 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -17,6 +17,7 @@ config AVR32
select GENERIC_CLOCKEVENTS
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
+ select HAVE_NMI
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index af76634f8d98..47c0a55acafd 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_NMI

config GENERIC_CSUM
def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 0314e325a669..2e053f709c4b 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -58,6 +58,7 @@ config CRIS
select CLKSRC_MMIO if ETRAX_ARCH_V32
select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+ select HAVE_NMI

config HZ
int
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..079f4fec9586 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,7 @@ config PPC
select NO_BOOTMEM
select HAVE_GENERIC_RCU_GUP
select HAVE_PERF_EVENTS_NMI if PPC64
+ select HAVE_NMI if PERF_EVENTS

config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b06dc3839268..1b84036437c0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -152,6 +152,7 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select HAVE_NMI

config SCHED_OMIT_FRAME_POINTER
def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..7803b6863c0e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
select OLD_SIGSUSPEND
select OLD_SIGACTION
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_NMI
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e49502acbab4..198abcb6e510 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -80,6 +80,7 @@ config SPARC64
select NO_BOOTMEM
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_NMI

config ARCH_DEFCONFIG
string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index a07e31b50d3f..1503ab5882cd 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -28,6 +28,7 @@ config TILE
select HAVE_DEBUG_STACKOVERFLOW
select ARCH_WANT_FRAME_POINTERS
select HAVE_CONTEXT_TRACKING
+ select HAVE_NMI if USE_PMC

# FIXME: investigate whether we need/want these options.
# select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 228aa35d7e89..67dbc7442499 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -118,6 +118,7 @@ config X86
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_NMI
select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..6e633e020f55 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include <linux/seq_buf.h>

#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -30,35 +29,14 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);

/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;

-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
-}
-
void arch_trigger_all_cpu_backtrace(bool include_self)
{
- struct nmi_seq_buf *s;
- int len;
- int cpu;
- int i;
int this_cpu = get_cpu();
+ int i;

if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -73,16 +51,6 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
if (!include_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));

- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
@@ -98,57 +66,15 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
}

/*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
+ * Force flush any remote buffers that might be stuck in IRQ context
+ * and therefore could not run their irq_work.
*/
- for_each_cpu(cpu, &printtrace_mask) {
- int last_i = 0;
-
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
+ printk_nmi_flush();

- clear_bit(0, &backtrace_flag);
- smp_mb__after_atomic();
+ clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}

-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
-}
-
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
@@ -157,13 +83,8 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
cpu = smp_processor_id();

if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
- this_cpu_write(printk_func, printk_func_save);

cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return NMI_HANDLED;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index dfd59d6bc6f0..a477e0766d2e 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -67,10 +67,12 @@ extern void irq_exit(void);
preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
rcu_nmi_enter(); \
trace_hardirq_enter(); \
+ printk_nmi_enter(); \
} while (0)

#define nmi_exit() \
do { \
+ printk_nmi_exit(); \
trace_hardirq_exit(); \
rcu_nmi_exit(); \
BUG_ON(!in_nmi()); \
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index caebf2a758dc..04c68b9f56f8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))

-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c9149..05b7f6ed1663 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -120,7 +120,17 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif

-typedef int(*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+#else
+static inline void printk_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+#endif /* PRINTK_NMI */

#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index b999fa381bf9..fe9a439c8843 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1431,6 +1431,11 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.

+config PRINTK_NMI
+ def_bool y
+ depends on PRINTK
+ depends on HAVE_NMI
+
config BUG
bool "BUG() support" if EXPERT
default y
diff --git a/init/main.c b/init/main.c
index 2115055faeac..4a28accaaa98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
timekeeping_init();
time_init();
sched_clock_postinit();
+ printk_init();
perf_event_init();
profile_init();
call_function_init();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..99bfc1e3f32a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1821,13 +1821,200 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);

+#ifdef CONFIG_PRINTK_NMI
+
+typedef int(*printk_func_t)(const char *fmt, va_list args);
/*
* This allows printk to be diverted to another function per cpu.
+ *
* This is useful for calling printk functions from within NMI
* without worrying about race conditions that can lock up the
* box.
*/
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
+struct nmi_seq_buf {
+ atomic_t head, tail, read;
+ struct irq_work work;
+ unsigned char buffer[PAGE_SIZE -
+ 3*sizeof(atomic_t) -
+ sizeof(struct irq_work)];
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk("%.*s", (end - start) + 1, buf);
+}
+
+static void __printk_nmi_flush(struct irq_work *work)
+{
+ static raw_spinlock_t lock = __RAW_SPIN_LOCK_INITIALIZER(lock);
+ struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ int len, head, size, i, last_i;
+
+again:
+ /*
+ * vprintk_nmi() truncate
+ *
+ * [S] head [S] head
+ * wmb mb
+ * [S] tail [S] read
+ *
+ * therefore:
+ */
+ i = atomic_read(&s->read);
+ len = atomic_read(&s->tail); /* up to the tail is stable */
+ smp_rmb();
+ head = atomic_read(&s->head);
+
+ /*
+ * We cannot truncate tail because it could overwrite a store from
+ * vprintk_nmi(), however vprintk_nmi() will always update tail to the
+ * correct value.
+ *
+ * Therefore if head < tail, we missed a truncate and should do so now.
+ */
+ if (head < len)
+ len = 0;
+
+ if (len - i <= 0) /* nothing to do */
+ return;
+
+ /*
+ * 'Consume' this chunk, avoids concurrent callers printing the same
+ * stuff.
+ */
+ if (atomic_cmpxchg(&s->read, i, len) != i)
+ goto again;
+
+ /*
+ * Serialize the actual printing of the chunks
+ * to avoid interleaving lines.
+ */
+ raw_spin_lock(&lock);
+
+ size = min_t(int, len, sizeof(s->buffer));
+ last_i = i;
+
+ /* Print line by line. */
+ for (; i < size; i++) {
+ if (s->buffer[i] == '\n') {
+ print_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < size) {
+ print_seq_line(s, last_i, len - 1);
+ pr_cont("\n");
+ }
+
+ raw_spin_unlock(&lock);
+
+ /*
+ * Check nothing got added and truncate the buffer. If head doesn't
+ * match the irq_work is guaranteed to be queued.
+ */
+ if (atomic_cmpxchg(&s->head, len, 0) != len)
+ return; /* next irq_work run */
+
+ /*
+ * We cannot truncate tail; see above.
+ */
+
+ /*
+ * If we call __printk_nmi_flush() concurrently before this
+ * it will see s->read > s->len and not print anything.
+ */
+ atomic_set(&s->read, 0);
+}
+
+/*
+ * flush all per-cpu nmi buffers
+ */
+void printk_nmi_flush(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void printk_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+ init_irq_work(&s->work, __printk_nmi_flush);
+ }
+}
+
+/*
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ int add = -1, len;
+
+ irq_work_queue(&s->work);
+
+ /*
+ * head > tail; indicates someone is writing.
+ * avoids the buffer being truncated.
+ *
+ * we rely on the memory barrier to ensure the increment
+ * is visible before we start writing to the buffer.
+ */
+ len = atomic_inc_return(&s->head);
+
+ if (len < sizeof(s->buffer)) {
+ add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+ /*
+ * Fwd head to the right location; NMIs do not nest, therefore
+ * we can use regular stores.
+ */
+ atomic_set(&s->head, len + add);
+ /*
+ * Once the data is written and head is correct, update tail to
+ * match; indicating we're complete.
+ */
+ smp_wmb();
+ atomic_set(&s->tail, len + add);
+ }
+
+ return add;
+}
+
+void printk_nmi_enter(void)
+{
+ this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+ this_cpu_write(printk_func, vprintk_default);
+}
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_default(fmt, args);
+}
+
+#endif /* PRINTK_NMI */

/**
* printk - print a kernel message
@@ -1852,21 +2039,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;

va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
-
va_end(args);

return r;
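
For clarity, the intended roles of the three counters (summarizing the
comments in the code above):

/*
 * head : first free byte; the writer bumps it before writing, so
 *        head > tail means a write is in progress and the buffer
 *        must not be truncated.
 * tail : end of the last completed write; the flusher may only
 *        consume data up to tail.
 * read : how far the flusher has already consumed; keeps concurrent
 *        __printk_nmi_flush() callers from printing the same chunk.
 */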

2015-06-10 19:25:25

by Peter Zijlstra

Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, Jun 10, 2015 at 09:19:16AM -0700, Joe Perches wrote:
> On Wed, 2015-06-10 at 12:02 -0400, Steven Rostedt wrote:
>
> > BTW, the printk.c file is getting rather big.
>
> No joke that.

$ find kernel/ -type f -name \*.[ch] | while read file; do wc -l $file; done | sort -nr | awk '{ print ++i " " $0 }' | grep "\<printk\.c"
13 3245 kernel/printk/printk.c

so it's the 13th biggest file in kernel/, can't really be bothered by that.

2015-06-10 19:35:28

by Steven Rostedt

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, 10 Jun 2015 21:25:16 +0200
Peter Zijlstra <[email protected]> wrote:

> $ find kernel/ -type f -name \*.[ch] | while read file; do wc -l $file; done | sort -nr | awk '{ print ++i " " $0 }' | grep "\<printk\.c"
> 13 3245 kernel/printk/printk.c

Well, it also removes #ifdef from the C files.

>
> so it's the 13th biggest file in kernel/, can't really be bothered by that.

But you're different ;-) How many emails do you have in your INBOX?

-- Steve

2015-06-10 19:36:24

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, Jun 10, 2015 at 09:23:04PM +0200, Peter Zijlstra wrote:
> +static int vprintk_nmi(const char *fmt, va_list args)
> +{
> + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
> + int add = -1, len;
> +
> + irq_work_queue(&s->work);
> +
> + /*
> + * head > tail; indicates someone is writing.
> + * avoids the buffer being truncated.
> + *
> + * we rely on the memory barrier to ensure the increment
> + * is visible before we start writing to the buffer.
> + */
> + len = atomic_inc_return(&s->head);

That wants to be:

len = atomic_inc_return(&s->head) - 1;

we need the value before the increment, not after it.

> +
> + if (len < sizeof(s->buffer)) {
> + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
> + /*
> + * Fwd head to the right location; NMIs do not nest, therefore
> + * we can use regular stores.
> + */
> + atomic_set(&s->head, len + add);
> + /*
> + * Once the data is written and head is correct, update tail to
> + * match; indicating we're complete.
> + */
> + smp_wmb();
> + atomic_set(&s->tail, len + add);
> + }
> +
> + return add;
> +}
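
With that folded in, the function would read (untested; otherwise
identical to the hunk above):

static int vprintk_nmi(const char *fmt, va_list args)
{
	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
	int add = -1, len;

	irq_work_queue(&s->work);

	/* reserve the old offset; head > tail marks a write in progress */
	len = atomic_inc_return(&s->head) - 1;

	if (len < sizeof(s->buffer)) {
		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
				fmt, args);
		/* NMIs do not nest, so plain stores suffice */
		atomic_set(&s->head, len + add);
		/* publish: order the data before the tail update */
		smp_wmb();
		atomic_set(&s->tail, len + add);
	}

	return add;
}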

2015-06-10 19:36:48

by Steven Rostedt

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, 10 Jun 2015 21:23:04 +0200
Peter Zijlstra <[email protected]> wrote:

> As a bonus it doesn't need that seq_buf stuff anymore ;-)
>

That's because you created your own version.

-- Steve

2015-06-10 19:40:04

by Joe Perches

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed, 2015-06-10 at 21:25 +0200, Peter Zijlstra wrote:
> On Wed, Jun 10, 2015 at 09:19:16AM -0700, Joe Perches wrote:
> > On Wed, 2015-06-10 at 12:02 -0400, Steven Rostedt wrote:
> >
> > > BTW, the printk.c file is getting rather big.
> >
> > No joke that.
>
> $ find kernel/ -type f -name \*.[ch] | while read file; do wc -l $file; done | sort -nr | awk '{ print ++i " " $0 }' | grep "\<printk\.c"
> 13 3245 kernel/printk/printk.c
>
> so it's the 13th biggest file in kernel/, can't really be bothered by that.

printk.c is ~80k of fairly convoluted code.
Refactoring would help maintenance.

2015-06-11 09:57:12

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed 2015-06-10 17:29:17, Peter Zijlstra wrote:
> On Wed, Jun 10, 2015 at 04:31:55PM +0200, Petr Mladek wrote:
> > If another NMI comes at this point, it will start filling the buffer
> > from the beginning. If it is fast enough, it might override the text
> > that we print above.
>
> How so? If the cmpxchg succeeded and len == 0, we flushed everything and
> are done with it, if another NMI comes in and 'overwrites' it, that's
> fine, right?

Shame on me. Somehow I thought that there was xchg() and not cmpxchg().


> > > +static int vprintk_nmi(const char *fmt, va_list args)
> > > +{
> > > + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
> > > + unsigned int len = seq_buf_used(&s->seq);
> > > +
> > > + irq_work_queue(&s->work);
> > > + seq_buf_vprintf(&s->seq, fmt, args);
>
> No, everything is strictly per cpu.

I do not know why, but I expected that the irq_work could get processed on
any CPU. You are right. They are processed on the same one.

The same applies to the other mail. Sigh, I was too fast yesterday. I am
sorry for the noise.

Best Regards,
Petr

2015-06-11 14:55:57

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Wed 2015-06-10 21:23:04, Peter Zijlstra wrote:
> Below a version which does x-cpu stuff to allow the
> trigger_all*_cpu_backtrace() initiator to flush buffers on behalf of
> other CPUs.
>
> Compile tested only.

The output from "echo l >/proc/sysrq-trigger" looks reasonable.
It does not mix output from different CPUs. This is expected
because of the @lock.

The other observation is that it prints CPUs in _random_ order:
28, 24, 25, 1, 26, 2, 27, 3, ...

The order is fine when I disable the irq_work.

It means that irq_works are usually faster than printk_nmi_flush() =>
printk_nmi_flush() is not that useful => all the complexity with
the three atomic variables (head, tail, read) did not bring
much of a win.

Anyway, I think that the current solution is racy and it cannot be fixed
easily, see below.


> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index c099b082cd02..99bfc1e3f32a 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1821,13 +1821,200 @@ int vprintk_default(const char *fmt, va_list args)
> +static void __printk_nmi_flush(struct irq_work *work)
> +{
> + static raw_spinlock_t lock = __RAW_SPIN_LOCK_INITIALIZER(lock);
> + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
> + int len, head, size, i, last_i;
> +
> +again:
> + /*
> + * vprintk_nmi()            truncate
> + *
> + * [S] head                 [S] head
> + * wmb                      mb
> + * [S] tail                 [S] read

BTW, this is quite cryptic for me. Coffee did not help :-)

> + *
> + * therefore:
> + */
> + i = atomic_read(&s->read);
> + len = atomic_read(&s->tail); /* up to the tail is stable */
> + smp_rmb();
> + head = atomic_read(&s->head);
> +
> + /*
> + * We cannot truncate tail because it could overwrite a store from
> + * vprintk_nmi(), however vprintk_nmi() will always update tail to the
> + * correct value.
> + *
> + * Therefore if head < tail, we missed a truncate and should do so now.
> + */
> + if (head < len)
> + len = 0;

This is a bit confusing. It is a complicated way of making the next test return.

If I get this correctly, this might happen only inside
__printk_nmi_flush() called on another CPU (from
printk_nmi_flush()) when it interferes with the queued
irq_work. The irq_work is faster and truncates the buffer.

So, the return is fine after all because the irq_work printed
everything.


> + if (len - i <= 0) /* nothing to do */
> + return;
> + /*
> + * 'Consume' this chunk, avoids concurrent callers printing the same
> + * stuff.
> + */
> + if (atomic_cmpxchg(&s->read, i, len) != i)
> + goto again;

I think that this is racy:

CPU0                                    CPU7

printk_nmi_flush()

  __printk_nmi_flush(for CPU7)

    i = atomic_read(&s->read);    (100)
    len = atomic_read(&s->tail);  (200)
    head = atomic_read(&s->head); (200)

    if (atomic_cmpxchg(&s->read, i, len) != i)

    we pass, but we get interrupted or
    rescheduled on a preemptive kernel

                                        another vprintk_nmi()
                                        leaves: head(400), tail(400)

                                        __printk_nmi_flush() in irq_work

                                        prints the string between 200-400
                                        truncates the buffer: head(0), read(0)

                                        another vprintk_nmi()
                                        returns: head(150), tail(150)

    prints the string between (100-200) =>
    part of the new and part of the old
    message, and modifies @head and @read
    the wrong way

I think that such races are hard to avoid without indexing the printed
messages. But it would make the approach too complicated.

I think that ordering CPUs is not worth it. I would go back to the
first solution, add the @lock there, and double check races with
seq_buf().

I stop here with commenting the code for now.

Best Regards,
Petr

PS: I had two cups of coffee and hope that my comments are a smaller fiasco
than yesterday's.

2015-06-12 12:00:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Thu, Jun 11, 2015 at 04:55:47PM +0200, Petr Mladek wrote:
> The output from "echo l >/proc/sysrq-trigger" looks reasonable.
> It does not mix output from different CPUs. This is expected
> because of the @lock.
>
> The other observation is that it prints CPUs in _random_ order:
> 28, 24, 25, 1, 26, 2, 27, 3, ...
>
> The order is fine when I disable the irq_work.
>
> It means that irq_works are usually faster than printk_nmi_flush() =>
> printk_nmi_flush() is not that useful => all the complexity with
> the three atomic variables (head, tail, read) did not bring
> much of a win.

The reason I did the printk_nmi_flush() is that the irq_work might
_never_ happen if the cpu in question is stuck with interrupts disabled.

Do something like:

local_irq_disable();
for (;;)
;
local_irq_enable(); /* like this'll ever happen */

And you won't see the offending stacktrace that makes your machine a
useless piece of power sucking junk.

> > diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> > index c099b082cd02..99bfc1e3f32a 100644
> > --- a/kernel/printk/printk.c
> > +++ b/kernel/printk/printk.c
> > @@ -1821,13 +1821,200 @@ int vprintk_default(const char *fmt, va_list args)
> > +static void __printk_nmi_flush(struct irq_work *work)
> > +{
> > + static raw_spinlock_t lock = __RAW_SPIN_LOCK_INITIALIZER(lock);
> > + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
> > + int len, head, size, i, last_i;
> > +
> > +again:
> > + /*
> > + * vprintk_nmi()            truncate
> > + *
> > + * [S] head                 [S] head
> > + * wmb                      mb
> > + * [S] tail                 [S] read
>
> BTW, this is quite cryptic for me. Coffee did not help :-)

It states that those two sites store head first, then issue a barrier
and then store something else. This means we need to read the lot in the
reverse order and issue a matching barrier.
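
Schematically, with the names from the patch (the truncate side pairs
the same way against the @read load):

	writer (vprintk_nmi)               reader (__printk_nmi_flush)

	atomic_set(&s->head, len + add);   len  = atomic_read(&s->tail);
	smp_wmb();                         smp_rmb();
	atomic_set(&s->tail, len + add);   head = atomic_read(&s->head);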

> > + *
> > + * therefore:
> > + */
> > + i = atomic_read(&s->read);
> > + len = atomic_read(&s->tail); /* up to the tail is stable */
> > + smp_rmb();
> > + head = atomic_read(&s->head);
> > +
> > + /*
> > + * We cannot truncate tail because it could overwrite a store from
> > + * vprintk_nmi(), however vprintk_nmi() will always update tail to the
> > + * correct value.
> > + *
> > + * Therefore if head < tail, we missed a truncate and should do so now.
> > + */
> > + if (head < len)
> > + len = 0;
>
> This is a bit confusing. It is a complicated way of making the next test return.

Yes, but it's two separate pieces of logic. The above deals with the
truncate not being able to also reset tail, while the test below is a
simple 'do I actually have to do anything' test.
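
I.e., the two tests from the patch, annotated:

	if (head < len)		/* missed a truncate; the chunk is stale */
		len = 0;

	if (len - i <= 0)	/* nothing new since we last printed */
		return;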

> If I get this correctly, this might happen only inside
> __printk_nmi_flush() called on another CPU (from
> printk_nmi_flush()) when it interferes with the queued
> irq_work. The irq_work is faster and truncates the buffer.
>
> So, the return is fine after all because the irq_work printed
> everything.

Either that, or another NMI happened while we started the irq_work; this
will (re)enqueue the irq_work, and that run will in fact observe all data,
print the whole buffer, and leave an empty buffer behind.

Then the irq_work happens _again_, sees an empty buffer and figures
that's ok, nothing to do.

> > + if (len - i <= 0) /* nothing to do */
> > + return;
> > + /*
> > + * 'Consume' this chunk, avoids concurrent callers printing the same
> > + * stuff.
> > + */
> > + if (atomic_cmpxchg(&s->read, i, len) != i)
> > + goto again;
>
> I think that this is racy:

Indeed. I'll think a bit on that.

> I think that ordering CPUs is not worth it. I would go back to the
> first solution, add the @lock there, and double check races with
> seq_buf().

You mean, provide printk_nmi_flush() but completely screw concurrency?
And basically reserve it for a last ditch effort for getting crap out?

2015-06-12 14:28:09

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Fri 2015-06-12 14:00:37, Peter Zijlstra wrote:
> On Thu, Jun 11, 2015 at 04:55:47PM +0200, Petr Mladek wrote:
> > > + * 'Consume' this chunk, avoids concurrent callers printing the same
> > > + * stuff.
> > > + */
> > > + if (atomic_cmpxchg(&s->read, i, len) != i)
> > > + goto again;
> >
> > I think that this is racy:
>
> Indeed. I'll think a bit on that.
>
> > I think that ordering CPUs is not worth it. I would go back to the
> > first solution, add the @lock there, and double check races with
> > seq_buf().
>
> You mean, provide printk_nmi_flush() but completely screw concurrency?
> And basically reserve it for a last ditch effort for getting crap out?

This looks much easier to me. We could call this only when the system
is going down.

Another possibility would be to ignore this race because it is rather
theoretical and it is better to print some mess than nothing.


I wonder what Andrew thinks about it. I really admired the atomic
operations in the 2nd patch but I was also scared by them.

Note that I have already made two attempts to make printk() safe in
NMI. The first one was too precise, too complicated, and thus
rejected, see https://lkml.org/lkml/2014/6/10/388.

The second one was rather defensive and quite simple. It has not
been completely rejected but there is still some resistance to
accept non-trivial code for handling printk() in NMI, see
https://lkml.org/lkml/2015/5/29/699

I hope that your approach has a chance if we do not make it too
complicated. The big plus is that it generalizes the solution already
used for arch_trigger_all_cpu_backtrace(), and the
1st variant was rather simple but quite functional.

Best Regards,
Petr

2015-06-12 15:35:13

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Fri 2015-06-12 16:28:05, Petr Mladek wrote:
> On Fri 2015-06-12 14:00:37, Peter Zijlstra wrote:
> > On Thu, Jun 11, 2015 at 04:55:47PM +0200, Petr Mladek wrote:
> > > > + * 'Consume' this chunk, avoids concurrent callers printing the same
> > > > + * stuff.
> > > > + */
> > > > + if (atomic_cmpxchg(&s->read, i, len) != i)
> > > > + goto again;
> > >
> > > I think that this is racy:
> >
> > Indeed. I'll think a bit on that.
> >
> > > I think that ordering CPUs is not worth it. I would go back to the
> > > first solution, add the @lock there, and double check races with
> > > seq_buf().
> >
> > You mean, provide printk_nmi_flush() but completely screw concurrency?
> > And basically reserve it for a last ditch effort for getting crap out?
>
> This looks much easier to me. We could call this only when the system
> is going down.
>
> Another possibility would be to ignore this race because it is rather
> theoretical and it is better to print some mess than nothing.

I have got another idea. We already use a lock to synchronize
readers. It might be used to avoid the race and keep the global
flush().

I played with it, see below. It is based on 2nd Peter's patch
where I modified only struct nmi_seq_buf, __printk_nmi_flush(),
and vprintk_nmi().

It forces the reader to flush everything or nothing. It is less
effective than the previous code but much more simple.

It seems to work but it is just a proof-of-concept. Also I think that
it would be possible to use the normal seq_buf in the end.

---
arch/Kconfig | 4 ++
arch/arm/Kconfig | 1 +
arch/avr32/Kconfig | 1 +
arch/blackfin/Kconfig | 1 +
arch/cris/Kconfig | 1 +
arch/powerpc/Kconfig | 1 +
arch/s390/Kconfig | 1 +
arch/sh/Kconfig | 1 +
arch/sparc/Kconfig | 1 +
arch/tile/Kconfig | 1 +
arch/x86/Kconfig | 1 +
arch/x86/kernel/apic/hw_nmi.c | 89 ++---------------------
include/linux/hardirq.h | 2 +
include/linux/percpu.h | 3 -
include/linux/printk.h | 12 +++-
init/Kconfig | 5 ++
init/main.c | 1 +
kernel/printk/printk.c | 159 +++++++++++++++++++++++++++++++++++++++---
18 files changed, 186 insertions(+), 99 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997..ad8db3979e21 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -179,7 +179,11 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool

+config HAVE_NMI
+ bool
+
config HAVE_NMI_WATCHDOG
+ depends on HAVE_NMI
bool
#
# An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..a26e83699df9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -61,6 +61,7 @@ config ARM
select HAVE_KRETPROBES if (HAVE_KPROBES)
select HAVE_MEMBLOCK
select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
+ select HAVE_NMI if (!CPU_V7)
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_OPTPROBES if !THUMB2_KERNEL
select HAVE_PERF_EVENTS
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b6878eb64884..9dc3e2b1180b 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -17,6 +17,7 @@ config AVR32
select GENERIC_CLOCKEVENTS
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
+ select HAVE_NMI
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index af76634f8d98..47c0a55acafd 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_NMI

config GENERIC_CSUM
def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 0314e325a669..2e053f709c4b 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -58,6 +58,7 @@ config CRIS
select CLKSRC_MMIO if ETRAX_ARCH_V32
select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+ select HAVE_NMI

config HZ
int
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..079f4fec9586 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,7 @@ config PPC
select NO_BOOTMEM
select HAVE_GENERIC_RCU_GUP
select HAVE_PERF_EVENTS_NMI if PPC64
+ select HAVE_NMI if PERF_EVENTS

config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b06dc3839268..1b84036437c0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -152,6 +152,7 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select HAVE_NMI

config SCHED_OMIT_FRAME_POINTER
def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..7803b6863c0e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
select OLD_SIGSUSPEND
select OLD_SIGACTION
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_NMI
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e49502acbab4..198abcb6e510 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -80,6 +80,7 @@ config SPARC64
select NO_BOOTMEM
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_NMI

config ARCH_DEFCONFIG
string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index a07e31b50d3f..1503ab5882cd 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -28,6 +28,7 @@ config TILE
select HAVE_DEBUG_STACKOVERFLOW
select ARCH_WANT_FRAME_POINTERS
select HAVE_CONTEXT_TRACKING
+ select HAVE_NMI if USE_PMC

# FIXME: investigate whether we need/want these options.
# select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 226d5696e1d1..1d1a59e7eaac 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -38,6 +38,7 @@ config X86
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_IOREMAP_PROT
+ select HAVE_NMI
select HAVE_KPROBES
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..6e633e020f55 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include <linux/seq_buf.h>

#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -30,35 +29,14 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);

/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;

-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
-}
-
void arch_trigger_all_cpu_backtrace(bool include_self)
{
- struct nmi_seq_buf *s;
- int len;
- int cpu;
- int i;
int this_cpu = get_cpu();
+ int i;

if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -73,16 +51,6 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
if (!include_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));

- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
@@ -98,57 +66,15 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
}

/*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
+ * Force flush any remote buffers that might be stuck in IRQ context
+ * and therefore could not run their irq_work.
*/
- for_each_cpu(cpu, &printtrace_mask) {
- int last_i = 0;
-
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
+ printk_nmi_flush();

- clear_bit(0, &backtrace_flag);
- smp_mb__after_atomic();
+ clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}

-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
-}
-
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
@@ -157,13 +83,8 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
cpu = smp_processor_id();

if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
- this_cpu_write(printk_func, printk_func_save);

cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return NMI_HANDLED;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f4af03404b97..8c9040934ea0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -67,10 +67,12 @@ extern void irq_exit(void);
preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
rcu_nmi_enter(); \
trace_hardirq_enter(); \
+ printk_nmi_enter(); \
} while (0)

#define nmi_exit() \
do { \
+ printk_nmi_exit(); \
trace_hardirq_exit(); \
rcu_nmi_exit(); \
BUG_ON(!in_nmi()); \
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index caebf2a758dc..04c68b9f56f8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))

-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c9149..05b7f6ed1663 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -120,7 +120,17 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif

-typedef int(*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+#else
+static inline void printk_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+#endif /* PRINTK_NMI */

#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec60232..acab5c67d2a9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1445,6 +1445,11 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.

+config PRINTK_NMI
+ def_bool y
+ depends on PRINTK
+ depends on HAVE_NMI
+
config BUG
bool "BUG() support" if EXPERT
default y
diff --git a/init/main.c b/init/main.c
index 2115055faeac..4a28accaaa98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
timekeeping_init();
time_init();
sched_clock_postinit();
+ printk_init();
perf_event_init();
profile_init();
call_function_init();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..640460d881e5 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1821,13 +1821,160 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);

+#ifdef CONFIG_PRINTK_NMI
+
+typedef int(*printk_func_t)(const char *fmt, va_list args);
/*
* This allows printk to be diverted to another function per cpu.
+ *
* This is useful for calling printk functions from within NMI
* without worrying about race conditions that can lock up the
* box.
*/
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
+struct nmi_seq_buf {
+ atomic_t tail;
+ struct irq_work work;
+ unsigned char buffer[PAGE_SIZE -
+ sizeof(atomic_t) -
+ sizeof(struct irq_work)];
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk("%.*s", (end - start) + 1, buf);
+}
+
+static void __printk_nmi_flush(struct irq_work *work)
+{
+ static raw_spinlock_t read_lock = __RAW_SPIN_LOCK_INITIALIZER(read_lock);
+ struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ int len, size, i, last_i;
+
+ /*
+ * Serialize against the actual printing of the chunks
+ * to avoid interleaving lines and interleaving readers.
+ */
+ raw_spin_lock(&read_lock);
+
+ i = 0;
+more:
+ len = atomic_read(&s->tail);
+
+ /*
+ * we hold the read lock; @tail can only grow here, since only
+ * we can truncate it after printing something
+ */
+ WARN_ON(len && i >= len);
+
+ if (!len)
+ goto out; /* someone else already flushed the buffer */
+
+ /* make sure that the data has been written up to the @tail */
+ smp_rmb();
+
+ size = min_t(int, len, sizeof(s->buffer));
+ last_i = i;
+
+ /* Print line by line. */
+ for (; i < size; i++) {
+ if (s->buffer[i] == '\n') {
+ print_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < size) {
+ print_seq_line(s, last_i, size - 1);
+ pr_cont("\n");
+ }
+
+ /*
+ * Check nothing got added and truncate the buffer.
+ */
+ if (atomic_cmpxchg(&s->tail, len, 0) != len)
+ goto more;
+
+out:
+ raw_spin_unlock(&read_lock);
+}
+
+/*
+ * flush all per-cpu nmi buffers
+ */
+void printk_nmi_flush(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void printk_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+ init_irq_work(&s->work, __printk_nmi_flush);
+ }
+}
+
+/*
+ * printk() replacement for NMI context: append to this CPU's buffer
+ * and queue an irq_work to flush it from a safe context.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ int add = 0, len;
+
+ irq_work_queue(&s->work);
+
+again:
+ len = atomic_read(&s->tail);
+
+ if (len < sizeof(s->buffer)) {
+ va_list tmp;
+
+ /* @args may be consumed on each retry; work on a copy */
+ va_copy(tmp, args);
+ add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
+ fmt, tmp);
+ va_end(tmp);
+ /* write the data before updating the used buffer size */
+ smp_wmb();
+ /* try again if someone flushed the buffer in the meantime */
+ if (atomic_cmpxchg(&s->tail, len, len + add) != len)
+ goto again;
+ }
+ return add;
+}
+
+void printk_nmi_enter(void)
+{
+ this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+ this_cpu_write(printk_func, vprintk_default);
+}
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_default(fmt, args);
+}
+
+#endif /* PRINTK_NMI */

/**
* printk - print a kernel message
@@ -1852,21 +1999,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;

va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
-
va_end(args);

return r;
--
1.8.5.6

2015-06-25 18:15:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Fri, Jun 12, 2015 at 05:35:05PM +0200, Petr Mladek wrote:
> I have got another idea. We already use a lock to synchronize
> readers. It might be used to avoid the race and keep the global
> flush().
>
> I played with it, see below. It is based on Peter's 2nd patch,
> where I modified only struct nmi_seq_buf, __printk_nmi_flush(),
> and vprintk_nmi().
>
> It forces the reader to flush everything or nothing. It is less
> efficient than the previous code but much simpler.
>
> It seems to work but it is just a proof-of-concept. Also I think that
> it would be possible to use the normal seq_buf in the end.

Sorry, got sidetracked. Yes, this looks OK. Want to submit a proper
patch?

2015-06-26 08:06:21

by Petr Mladek

[permalink] [raw]
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess

On Thu 2015-06-25 20:14:59, Peter Zijlstra wrote:
> On Fri, Jun 12, 2015 at 05:35:05PM +0200, Petr Mladek wrote:
> > I have got another idea. We already use a lock to synchronize
> > readers. It might be used to avoid the race and keep the global
> > flush().
> >
> > I played with it, see below. It is based on Peter's 2nd patch,
> > where I modified only struct nmi_seq_buf, __printk_nmi_flush(),
> > and vprintk_nmi().
> >
> > It forces the reader to flush everything or nothing. It is less
> > efficient than the previous code but much simpler.
> >
> > It seems to work but it is just a proof-of-concept. Also I think that
> > it would be possible to use the normal seq_buf in the end.
>
> Sorry, got sidetracked. Yes, this looks OK. Want to submit a proper
> patch?

No problem. OK, I will prepare a proper patch the following week if
nothing breaks my plans ;-)

Thanks for feedback.

Best Regards,
Petr