Date: Wed, 10 Jun 2015 21:23:04 +0200
From: Peter Zijlstra <peterz@infradead.org>
To: Petr Mladek <pmladek@suse.cz>
Cc: Steven Rostedt <rostedt@goodmis.org>, linux-kernel@vger.kernel.org,
        jkosina@suse.cz, paulmck@linux.vnet.ibm.com,
        Ingo Molnar <mingo@kernel.org>, Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess
Message-ID: <20150610192304.GY18673@twins.programming.kicks-ass.net>
References: <20150610125509.GO19282@twins.programming.kicks-ass.net>
 <20150610143155.GD9409@pathway.suse.cz>
 <20150610152917.GI3644@twins.programming.kicks-ass.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20150610152917.GI3644@twins.programming.kicks-ass.net>
User-Agent: Mutt/1.5.21 (2012-12-30)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 17702
Lines: 627

On Wed, Jun 10, 2015 at 05:29:17PM +0200, Peter Zijlstra wrote:
> No, everything is strictly per cpu.

Which has the problem that if the CPU is actually stuck with IRQs
disabled, the irq_work won't run and we'll miss the most important
backtrace.

Below a version which does x-cpu stuff to allow the
trigger_all*_cpu_backtrace() initiator to flush buffers on behalf of
other CPUs.

As a bonus it doesn't need that seq_buf stuff anymore ;-)

Compile tested only.

---
 arch/Kconfig                  |   4 +
 arch/arm/Kconfig              |   1 +
 arch/avr32/Kconfig            |   1 +
 arch/blackfin/Kconfig         |   1 +
 arch/cris/Kconfig             |   1 +
 arch/powerpc/Kconfig          |   1 +
 arch/s390/Kconfig             |   1 +
 arch/sh/Kconfig               |   1 +
 arch/sparc/Kconfig            |   1 +
 arch/tile/Kconfig             |   1 +
 arch/x86/Kconfig              |   1 +
 arch/x86/kernel/apic/hw_nmi.c |  89 ++-----------------
 include/linux/hardirq.h       |   2 +
 include/linux/percpu.h        |   3 -
 include/linux/printk.h        |  12 ++-
 init/Kconfig                  |   5 ++
 init/main.c                   |   1 +
 kernel/printk/printk.c        | 199 +++++++++++++++++++++++++++++++++++++++---
 18 files changed, 226 insertions(+), 99 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997..ad8db3979e21 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -179,7 +179,11 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_NMI
+	bool
+
 config HAVE_NMI_WATCHDOG
+	depends on HAVE_NMI
 	bool
 #
 # An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..a26e83699df9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -61,6 +61,7 @@ config ARM
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
 	select HAVE_MEMBLOCK
 	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
+	select HAVE_NMI if (!CPU_V7)
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_OPTPROBES if !THUMB2_KERNEL
 	select HAVE_PERF_EVENTS
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b6878eb64884..9dc3e2b1180b 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -17,6 +17,7 @@ config AVR32
 	select GENERIC_CLOCKEVENTS
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
+	select HAVE_NMI
 	help
 	  AVR32 is a high-performance 32-bit RISC microprocessor core,
 	  designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index af76634f8d98..47c0a55acafd 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select HAVE_NMI
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 0314e325a669..2e053f709c4b 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -58,6 +58,7 @@ config CRIS
 	select CLKSRC_MMIO if ETRAX_ARCH_V32
 	select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
 	select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+	select HAVE_NMI
 
 config HZ
 	int
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..079f4fec9586 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,7 @@ config PPC
 	select NO_BOOTMEM
 	select HAVE_GENERIC_RCU_GUP
 	select HAVE_PERF_EVENTS_NMI if PPC64
+	select HAVE_NMI if PERF_EVENTS
 
 config GENERIC_CSUM
 	def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b06dc3839268..1b84036437c0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -152,6 +152,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
+	select HAVE_NMI
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..7803b6863c0e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_NMI
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e49502acbab4..198abcb6e510 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -80,6 +80,7 @@ config SPARC64
 	select NO_BOOTMEM
 	select HAVE_ARCH_AUDITSYSCALL
 	select ARCH_SUPPORTS_ATOMIC_RMW
+	select HAVE_NMI
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index a07e31b50d3f..1503ab5882cd 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -28,6 +28,7 @@ config TILE
 	select HAVE_DEBUG_STACKOVERFLOW
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_CONTEXT_TRACKING
+	select HAVE_NMI if USE_PMC
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 228aa35d7e89..67dbc7442499 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -118,6 +118,7 @@ config X86
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MIXED_BREAKPOINTS_REGS
+	select HAVE_NMI
 	select HAVE_OPROFILE
 	select HAVE_OPTPROBES
 	select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..6e633e020f55 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/seq_buf.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -30,35 +29,14 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
 #ifdef arch_trigger_all_cpu_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE		4096
-
-struct nmi_seq_buf {
-	unsigned char		buffer[NMI_BUF_SIZE];
-	struct seq_buf		seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
 
 /* "in progress" flag of arch_trigger_all_cpu_backtrace */
 static unsigned long backtrace_flag;
 
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
-	const char *buf = s->buffer + start;
-
-	printk("%.*s", (end - start) + 1, buf);
-}
-
 void arch_trigger_all_cpu_backtrace(bool include_self)
 {
-	struct nmi_seq_buf *s;
-	int len;
-	int cpu;
-	int i;
 	int this_cpu = get_cpu();
+	int i;
 
 	if (test_and_set_bit(0, &backtrace_flag)) {
 		/*
@@ -73,16 +51,6 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
 	if (!include_self)
 		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
 
-	cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-	/*
-	 * Set up per_cpu seq_buf buffers that the NMIs running on the other
-	 * CPUs will write to.
-	 */
-	for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
-		s = &per_cpu(nmi_print_seq, cpu);
-		seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
-	}
-
 	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
 		pr_info("sending NMI to %s CPUs:\n",
 			(include_self ? "all" : "other"));
@@ -98,57 +66,15 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
 	}
 
 	/*
-	 * Now that all the NMIs have triggered, we can dump out their
-	 * back traces safely to the console.
+	 * Force flush any remote buffers that might be stuck in IRQ context
+	 * and therefore could not run their irq_work.
 	 */
-	for_each_cpu(cpu, &printtrace_mask) {
-		int last_i = 0;
-
-		s = &per_cpu(nmi_print_seq, cpu);
-		len = seq_buf_used(&s->seq);
-		if (!len)
-			continue;
-
-		/* Print line by line. */
-		for (i = 0; i < len; i++) {
-			if (s->buffer[i] == '\n') {
-				print_seq_line(s, last_i, i);
-				last_i = i + 1;
-			}
-		}
-		/* Check if there was a partial line. */
-		if (last_i < len) {
-			print_seq_line(s, last_i, len - 1);
-			pr_cont("\n");
-		}
-	}
+	printk_nmi_flush();
 
-	clear_bit(0, &backtrace_flag);
-	smp_mb__after_atomic();
+	clear_bit_unlock(0, &backtrace_flag);
 	put_cpu();
 }
 
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
-	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	unsigned int len = seq_buf_used(&s->seq);
-
-	seq_buf_vprintf(&s->seq, fmt, args);
-	return seq_buf_used(&s->seq) - len;
-}
-
 static int
 arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
@@ -157,13 +83,8 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 	cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		printk_func_t printk_func_save = this_cpu_read(printk_func);
-
-		/* Replace printk to write into the NMI seq */
-		this_cpu_write(printk_func, nmi_vprintk);
 		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
 		show_regs(regs);
-		this_cpu_write(printk_func, printk_func_save);
 
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return NMI_HANDLED;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index dfd59d6bc6f0..a477e0766d2e 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -67,10 +67,12 @@ extern void irq_exit(void);
 		preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
 		rcu_nmi_enter();				\
 		trace_hardirq_enter();				\
+		printk_nmi_enter();				\
 	} while (0)
 
 #define nmi_exit()						\
 	do {							\
+		printk_nmi_exit();				\
 		trace_hardirq_exit();				\
 		rcu_nmi_exit();					\
 		BUG_ON(!in_nmi());				\
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index caebf2a758dc..04c68b9f56f8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 	(typeof(type) __percpu *)__alloc_percpu(sizeof(type),		\
 						__alignof__(type))
 
-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
 #endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c9149..05b7f6ed1663 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -120,7 +120,17 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
-typedef int(*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+#else
+static inline void printk_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+#endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index b999fa381bf9..fe9a439c8843 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1431,6 +1431,11 @@ config PRINTK
 	  very difficult to diagnose system problems, saying N here is
 	  strongly discouraged.
 
+config PRINTK_NMI
+	def_bool y
+	depends on PRINTK
+	depends on HAVE_NMI
+
 config BUG
 	bool "BUG() support" if EXPERT
 	default y
diff --git a/init/main.c b/init/main.c
index 2115055faeac..4a28accaaa98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
 	timekeeping_init();
 	time_init();
 	sched_clock_postinit();
+	printk_init();
 	perf_event_init();
 	profile_init();
 	call_function_init();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..99bfc1e3f32a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1821,13 +1821,200 @@ int vprintk_default(const char *fmt, va_list args)
 }
 EXPORT_SYMBOL_GPL(vprintk_default);
 
+#ifdef CONFIG_PRINTK_NMI
+
+typedef int(*printk_func_t)(const char *fmt, va_list args);
 /*
  * This allows printk to be diverted to another function per cpu.
+ *
  * This is useful for calling printk functions from within NMI
  * without worrying about race conditions that can lock up the
  * box.
  */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
+struct nmi_seq_buf {
+	atomic_t		head, tail, read;
+	struct irq_work		work;
+	unsigned char		buffer[PAGE_SIZE -
+					3*sizeof(atomic_t) -
+					sizeof(struct irq_work)];
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk("%.*s", (end - start) + 1, buf);
+}
+
+static void __printk_nmi_flush(struct irq_work *work)
+{
+	static raw_spinlock_t lock = __RAW_SPIN_LOCK_INITIALIZER(lock);
+	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+	int len, head, size, i, last_i;
+
+again:
+	/*
+	 * vprintk_nmi()	truncate
+	 *
+	 * [S] head		[S] head
+	 *     wmb		mb
+	 * [S] tail		[S] read
+	 *
+	 * therefore:
+	 */
+	i = atomic_read(&s->read);
+	len = atomic_read(&s->tail); /* up to the tail is stable */
+	smp_rmb();
+	head = atomic_read(&s->head);
+
+	/*
+	 * We cannot truncate tail because it could overwrite a store from
+	 * vprintk_nmi(), however vprintk_nmi() will always update tail to the
+	 * correct value.
+	 *
+	 * Therefore if head < tail, we missed a truncate and should do so now.
+	 */
+	if (head < len)
+		len = 0;
+
+	if (len - i <= 0) /* nothing to do */
+		return;
+
+	/*
+	 * 'Consume' this chunk, avoids concurrent callers printing the same
+	 * stuff.
+	 */
+	if (atomic_cmpxchg(&s->read, i, len) != i)
+		goto again;
+
+	/*
+	 * Serialize the actual printing of the chunks
+	 * to avoid interleaving lines.
+	 */
+	raw_spin_lock(&lock);
+
+	size = min_t(int, len, sizeof(s->buffer));
+	last_i = i;
+
+	/* Print line by line. */
+	for (; i < size; i++) {
+		if (s->buffer[i] == '\n') {
+			print_seq_line(s, last_i, i);
+			last_i = i + 1;
+		}
+	}
+	/* Check if there was a partial line. */
+	if (last_i < size) {
+		print_seq_line(s, last_i, len - 1);
+		pr_cont("\n");
+	}
+
+	raw_spin_unlock(&lock);
+
+	/*
+	 * Check nothing got added and truncate the buffer. If head doesn't
+	 * match the irq_work is guaranteed to be queued.
+	 */
+	if (atomic_cmpxchg(&s->head, len, 0) != len)
+		return; /* next irq_work run */
+
+	/*
+	 * We cannot truncate tail; see above.
+	 */
+
+	/*
+	 * If we call __printk_nmi_flush() concurrently before this
+	 * it will see s->read > s->len and not print anything.
+	 */
+	atomic_set(&s->read, 0);
+}
+
+/*
+ * flush all per-cpu nmi buffers
+ */
+void printk_nmi_flush(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void printk_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+		init_irq_work(&s->work, __printk_nmi_flush);
+	}
+}
+
+/*
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+	int add = -1, len;
+
+	irq_work_queue(&s->work);
+
+	/*
+	 * head > tail; indicates someone is writing.
+	 * avoids the buffer being truncated.
+	 *
+	 * we rely on the memory barrier to ensure the increment
+	 * is visible before we start writing to the buffer.
+	 */
+	len = atomic_inc_return(&s->head);
+
+	if (len < sizeof(s->buffer)) {
+		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+		/*
+		 * Fwd head to the right location; NMIs do not nest, therefore
+		 * we can use regular stores.
+		 */
+		atomic_set(&s->head, len + add);
+		/*
+		 * Once the data is written and head is correct, update tail to
+		 * match; indicating we're complete.
+		 */
+		smp_wmb();
+		atomic_set(&s->tail, len + add);
+	}
+
+	return add;
+}
+
+void printk_nmi_enter(void)
+{
+	this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+	this_cpu_write(printk_func, vprintk_default);
+}
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+	return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+	return vprintk_default(fmt, args);
+}
+
+#endif /* PRINTK_NMI */
 
 /**
  * printk - print a kernel message
@@ -1852,21 +2039,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
  */
 asmlinkage __visible int printk(const char *fmt, ...)
 {
-	printk_func_t vprintk_func;
 	va_list args;
 	int r;
 
 	va_start(args, fmt);
-
-	/*
-	 * If a caller overrides the per_cpu printk_func, then it needs
-	 * to disable preemption when calling printk(). Otherwise
-	 * the printk_func should be set to the default. No need to
-	 * disable preemption here.
-	 */
-	vprintk_func = this_cpu_read(printk_func);
 	r = vprintk_func(fmt, args);
-
 	va_end(args);
 
 	return r;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/