Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754406AbbFJTXM (ORCPT ); Wed, 10 Jun 2015 15:23:12 -0400 Received: from casper.infradead.org ([85.118.1.10]:33972 "EHLO casper.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752444AbbFJTXL (ORCPT ); Wed, 10 Jun 2015 15:23:11 -0400 Date: Wed, 10 Jun 2015 21:23:04 +0200 From: Peter Zijlstra To: Petr Mladek Cc: Steven Rostedt , linux-kernel@vger.kernel.org, jkosina@suse.cz, paulmck@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess Message-ID: <20150610192304.GY18673@twins.programming.kicks-ass.net> References: <20150610125509.GO19282@twins.programming.kicks-ass.net> <20150610143155.GD9409@pathway.suse.cz> <20150610152917.GI3644@twins.programming.kicks-ass.net> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20150610152917.GI3644@twins.programming.kicks-ass.net> User-Agent: Mutt/1.5.21 (2012-12-30) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 17702 Lines: 627 On Wed, Jun 10, 2015 at 05:29:17PM +0200, Peter Zijlstra wrote: > No, everything is strictly per cpu. Which has the problem that if the CPU is actually stuck with IRQs disabled, the irq_work won't run and we'll miss the most important backtrace. Below a version which does x-cpu stuff to allow the trigger_all*_cpu_backtrace() initiator to flush buffers on behalf of other CPUs. As a bonus it doesn't need that seq_buf stuff anymore ;-) Compile tested only. --- arch/Kconfig | 4 + arch/arm/Kconfig | 1 + arch/avr32/Kconfig | 1 + arch/blackfin/Kconfig | 1 + arch/cris/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 89 ++----------------- include/linux/hardirq.h | 2 + include/linux/percpu.h | 3 - include/linux/printk.h | 12 ++- init/Kconfig | 5 ++ init/main.c | 1 + kernel/printk/printk.c | 199 +++++++++++++++++++++++++++++++++++++++--- 18 files changed, 226 insertions(+), 99 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index a65eafb24997..ad8db3979e21 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -179,7 +179,11 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_NMI + bool + config HAVE_NMI_WATCHDOG + depends on HAVE_NMI bool # # An arch should select this if it provides all these things: diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 45df48ba0b12..a26e83699df9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -61,6 +61,7 @@ config ARM select HAVE_KRETPROBES if (HAVE_KPROBES) select HAVE_MEMBLOCK select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND + select HAVE_NMI if (!CPU_V7) select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_OPTPROBES if !THUMB2_KERNEL select HAVE_PERF_EVENTS diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index b6878eb64884..9dc3e2b1180b 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -17,6 +17,7 @@ config AVR32 select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA + select HAVE_NMI help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index af76634f8d98..47c0a55acafd 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -40,6 +40,7 @@ config BLACKFIN select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW + select HAVE_NMI config GENERIC_CSUM def_bool y diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 0314e325a669..2e053f709c4b 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -58,6 +58,7 @@ config CRIS select CLKSRC_MMIO if ETRAX_ARCH_V32 select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32 select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32 + select HAVE_NMI config HZ int diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 190cc48abc0c..079f4fec9586 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -153,6 +153,7 @@ config PPC select NO_BOOTMEM select HAVE_GENERIC_RCU_GUP select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_NMI if PERF_EVENTS config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b06dc3839268..1b84036437c0 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -152,6 +152,7 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select HAVE_NMI config SCHED_OMIT_FRAME_POINTER def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 50057fed819d..7803b6863c0e 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -44,6 +44,7 @@ config SUPERH select OLD_SIGSUSPEND select OLD_SIGACTION select HAVE_ARCH_AUDITSYSCALL + select HAVE_NMI help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e49502acbab4..198abcb6e510 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -80,6 +80,7 @@ config SPARC64 select NO_BOOTMEM select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_NMI config ARCH_DEFCONFIG string diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index a07e31b50d3f..1503ab5882cd 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -28,6 +28,7 @@ config TILE select HAVE_DEBUG_STACKOVERFLOW select ARCH_WANT_FRAME_POINTERS select HAVE_CONTEXT_TRACKING + select HAVE_NMI if USE_PMC # FIXME: investigate whether we need/want these options. # select HAVE_IOREMAP_PROT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 228aa35d7e89..67dbc7442499 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -118,6 +118,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6873ab925d00..6e633e020f55 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,7 +18,6 @@ #include #include #include -#include #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) @@ -30,35 +29,14 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) -{ - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); -} - void arch_trigger_all_cpu_backtrace(bool include_self) { - struct nmi_seq_buf *s; - int len; - int cpu; - int i; int this_cpu = get_cpu(); + int i; if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -73,16 +51,6 @@ void arch_trigger_all_cpu_backtrace(bool include_self) if (!include_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("sending NMI to %s CPUs:\n", (include_self ? "all" : "other")); @@ -98,57 +66,15 @@ void arch_trigger_all_cpu_backtrace(bool include_self) } /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. */ - for_each_cpu(cpu, &printtrace_mask) { - int last_i = 0; - - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } + printk_nmi_flush(); - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; -} - static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { @@ -157,13 +83,8 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); - this_cpu_write(printk_func, printk_func_save); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return NMI_HANDLED; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index dfd59d6bc6f0..a477e0766d2e 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -67,10 +67,12 @@ extern void irq_exit(void); preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ trace_hardirq_enter(); \ + printk_nmi_enter(); \ } while (0) #define nmi_exit() \ do { \ + printk_nmi_exit(); \ trace_hardirq_exit(); \ rcu_nmi_exit(); \ BUG_ON(!in_nmi()); \ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index caebf2a758dc..04c68b9f56f8 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) -/* To avoid include hell, as printk can not declare this, we declare it here */ -DECLARE_PER_CPU(printk_func_t, printk_func); - #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 9b30871c9149..05b7f6ed1663 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -120,7 +120,17 @@ static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } #endif -typedef int(*printk_func_t)(const char *fmt, va_list args); +#ifdef CONFIG_PRINTK_NMI +extern void printk_init(void); +extern void printk_nmi_enter(void); +extern void printk_nmi_exit(void); +extern void printk_nmi_flush(void); +#else +static inline void printk_init(void) { } +static inline void printk_nmi_enter(void) { } +static inline void printk_nmi_exit(void) { } +static inline void printk_nmi_flush(void) { } +#endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK asmlinkage __printf(5, 0) diff --git a/init/Kconfig b/init/Kconfig index b999fa381bf9..fe9a439c8843 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1431,6 +1431,11 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. +config PRINTK_NMI + def_bool y + depends on PRINTK + depends on HAVE_NMI + config BUG bool "BUG() support" if EXPERT default y diff --git a/init/main.c b/init/main.c index 2115055faeac..4a28accaaa98 100644 --- a/init/main.c +++ b/init/main.c @@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void) timekeeping_init(); time_init(); sched_clock_postinit(); + printk_init(); perf_event_init(); profile_init(); call_function_init(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c099b082cd02..99bfc1e3f32a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1821,13 +1821,200 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); +#ifdef CONFIG_PRINTK_NMI + +typedef int(*printk_func_t)(const char *fmt, va_list args); /* * This allows printk to be diverted to another function per cpu. + * * This is useful for calling printk functions from within NMI * without worrying about race conditions that can lock up the * box. */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; + +struct nmi_seq_buf { + atomic_t head, tail, read; + struct irq_work work; + unsigned char buffer[PAGE_SIZE - + 3*sizeof(atomic_t) - + sizeof(struct irq_work)]; +}; + +/* Safe printing in NMI context */ +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t lock = __RAW_SPIN_LOCK_INITIALIZER(lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + int len, head, size, i, last_i; + +again: + /* + * vprintk_nmi() truncate + * + * [S] head [S] head + * wmb mb + * [S] tail [S] read + * + * therefore: + */ + i = atomic_read(&s->read); + len = atomic_read(&s->tail); /* up to the tail is stable */ + smp_rmb(); + head = atomic_read(&s->head); + + /* + * We cannot truncate tail because it could overwrite a store from + * vprintk_nmi(), however vprintk_nmi() will always update tail to the + * correct value. + * + * Therefore if head < tail, we missed a truncate and should do so now. + */ + if (head < len) + len = 0; + + if (len - i <= 0) /* nothing to do */ + return; + + /* + * 'Consume' this chunk, avoids concurrent callers printing the same + * stuff. + */ + if (atomic_cmpxchg(&s->read, i, len) != i) + goto again; + + /* + * Serialize the actual printing of the chunks + * to avoid interleaving lines. + */ + raw_spin_lock(&lock); + + size = min_t(int, len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. */ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + print_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + print_seq_line(s, last_i, len - 1); + pr_cont("\n"); + } + + raw_spin_unlock(&lock); + + /* + * Check nothing got added and truncate the buffer. If head doesn't + * match the irq_work is guaranteed to be queued. + */ + if (atomic_cmpxchg(&s->head, len, 0) != len) + return; /* next irq_work run */ + + /* + * We cannot truncate tail; see above. + */ + + /* + * If we call __printk_nmi_flush() concurrently before this + * it will see s->read > s->len and not print anything. + */ + atomic_set(&s->read, 0); +} + +/* + * flush all per-cpu nmi buffers + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +void printk_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } +} + +/* + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = -1, len; + + irq_work_queue(&s->work); + + /* + * head > tail; indicates someone is writing. + * avoids the buffer being truncated. + * + * we rely on the memory barrier to ensure the increment + * is visible before we start writing to the buffer. + */ + len = atomic_inc_return(&s->head); + + if (len < sizeof(s->buffer)) { + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + /* + * Fwd head to the right location; NMIs do not nest, therefore + * we can use regular stores. + */ + atomic_set(&s->head, len + add); + /* + * Once the data is written and head is correct, update tail to + * match; indicating we're complete. + */ + smp_wmb(); + atomic_set(&s->tail, len + add); + } + + return add; +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} + +static inline int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +#else + +static inline int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +#endif /* PRINTK_NMI */ /** * printk - print a kernel message @@ -1852,21 +2039,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/