Date: Fri, 12 Jun 2015 17:35:05 +0200
From: Petr Mladek
To: Peter Zijlstra
Cc: Steven Rostedt, linux-kernel@vger.kernel.org, jkosina@suse.cz,
	paulmck@linux.vnet.ibm.com, Ingo Molnar, Thomas Gleixner
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess
Message-ID: <20150612153505.GJ9409@pathway.suse.cz>
References: <20150610125509.GO19282@twins.programming.kicks-ass.net>
	<20150610143155.GD9409@pathway.suse.cz>
	<20150610152917.GI3644@twins.programming.kicks-ass.net>
	<20150610192304.GY18673@twins.programming.kicks-ass.net>
	<20150611145547.GA3234@dhcp128.suse.cz>
	<20150612120037.GR3644@twins.programming.kicks-ass.net>
	<20150612142805.GI9409@pathway.suse.cz>
In-Reply-To: <20150612142805.GI9409@pathway.suse.cz>
User-Agent: Mutt/1.5.21 (2010-09-15)

On Fri 2015-06-12 16:28:05, Petr Mladek wrote:
> On Fri 2015-06-12 14:00:37, Peter Zijlstra wrote:
> > On Thu, Jun 11, 2015 at 04:55:47PM +0200, Petr Mladek wrote:
> > > > +	 * 'Consume' this chunk, avoids concurrent callers printing the same
> > > > +	 * stuff.
> > > > +	 */
> > > > +	if (atomic_cmpxchg(&s->read, i, len) != i)
> > > > +		goto again;
> > >
> > > I think that this is racy:
> >
> > Indeed. I'll think a bit on that.
> >
> > > I think that ordering CPUs is not worth it. I would go back to the
> > > first solution, add the @lock there, and double check races with
> > > seq_buf().
> >
> > You mean, provide printk_nmi_flush() but completely screw concurrency?
> > And basically reserve it for a last ditch effort for getting crap out?
>
> This looks much easier to me. We could call this only when the system
> is going down.
>
> Another possibility would be to ignore this race because it is rather
> theoretical and it is better to print some mess than nothing.

I have got another idea. We already use a lock to synchronize readers.
It might be used to avoid the race and keep the global flush().

I played with it, see below. It is based on Peter's 2nd patch, where I
modified only struct nmi_seq_buf, __printk_nmi_flush(), and
vprintk_nmi(). It forces the reader to flush everything or nothing.
It is less efficient than the previous code but much simpler.

It seems to work, but it is just a proof of concept. Also I think that
it would be possible to use the normal seq_buf in the end.
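To make the intent easier to see before diving into the diff, the core
of the protocol boils down to the sketch below. This is only an
illustration, not part of the patch: nmi_write(), nmi_flush() and
print_lines() are made-up names, struct nmi_seq_buf is the one from the
patch, and the irq_work and per-CPU plumbing are left out.

/* The readers' lock; in the patch it lives inside __printk_nmi_flush(). */
static DEFINE_RAW_SPINLOCK(read_lock);

/* Writer side (NMI context), lock-less; claims space via cmpxchg: */
static int nmi_write(struct nmi_seq_buf *s, const char *fmt, va_list args)
{
	va_list cargs;
	int len, add;

again:
	len = atomic_read(&s->tail);
	if (len >= sizeof(s->buffer))
		return 0;
	/* va_copy() so that a retry can walk the arguments again */
	va_copy(cargs, args);
	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, cargs);
	va_end(cargs);
	smp_wmb();	/* order the data before the @tail update */
	/* a reader truncated the buffer in between; redo at the new tail */
	if (atomic_cmpxchg(&s->tail, len, len + add) != len)
		goto again;
	return add;
}

/* Reader side, serialized by @read_lock; flushes everything or nothing: */
static void nmi_flush(struct nmi_seq_buf *s)
{
	int len;

	raw_spin_lock(&read_lock);
more:
	len = atomic_read(&s->tail);
	if (len) {
		smp_rmb();	/* see the data behind the @tail we read */
		/* print_lines() is a made-up helper: prints buffer[0..len) */
		print_lines(s, min_t(int, len, sizeof(s->buffer)));
		/* truncate; if a writer appended meanwhile, flush that too */
		if (atomic_cmpxchg(&s->tail, len, 0) != len)
			goto more;
	}
	raw_spin_unlock(&read_lock);
}

The point is that the reader resets @tail to zero under the lock, so a
concurrent writer can only race by appending, and both cmpxchg loops
catch exactly that case.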
---
 arch/Kconfig                  |   4 ++
 arch/arm/Kconfig              |   1 +
 arch/avr32/Kconfig            |   1 +
 arch/blackfin/Kconfig         |   1 +
 arch/cris/Kconfig             |   1 +
 arch/powerpc/Kconfig          |   1 +
 arch/s390/Kconfig             |   1 +
 arch/sh/Kconfig               |   1 +
 arch/sparc/Kconfig            |   1 +
 arch/tile/Kconfig             |   1 +
 arch/x86/Kconfig              |   1 +
 arch/x86/kernel/apic/hw_nmi.c |  89 ++---------------------
 include/linux/hardirq.h       |   2 +
 include/linux/percpu.h        |   3 -
 include/linux/printk.h        |  12 +++-
 init/Kconfig                  |   5 ++
 init/main.c                   |   1 +
 kernel/printk/printk.c        | 159 +++++++++++++++++++++++++++++++++++++++---
 18 files changed, 186 insertions(+), 99 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997..ad8db3979e21 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -179,7 +179,11 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_NMI
+	bool
+
 config HAVE_NMI_WATCHDOG
+	depends on HAVE_NMI
 	bool
 #
 # An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..a26e83699df9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -61,6 +61,7 @@ config ARM
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
 	select HAVE_MEMBLOCK
 	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
+	select HAVE_NMI if (!CPU_V7)
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_OPTPROBES if !THUMB2_KERNEL
 	select HAVE_PERF_EVENTS
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b6878eb64884..9dc3e2b1180b 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -17,6 +17,7 @@ config AVR32
 	select GENERIC_CLOCKEVENTS
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
+	select HAVE_NMI
 	help
 	  AVR32 is a high-performance 32-bit RISC microprocessor core,
 	  designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index af76634f8d98..47c0a55acafd 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select HAVE_NMI
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 0314e325a669..2e053f709c4b 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -58,6 +58,7 @@ config CRIS
 	select CLKSRC_MMIO if ETRAX_ARCH_V32
 	select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
 	select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+	select HAVE_NMI
 
 config HZ
 	int
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..079f4fec9586 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,7 @@ config PPC
 	select NO_BOOTMEM
 	select HAVE_GENERIC_RCU_GUP
 	select HAVE_PERF_EVENTS_NMI if PPC64
+	select HAVE_NMI if PERF_EVENTS
 
 config GENERIC_CSUM
 	def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b06dc3839268..1b84036437c0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -152,6 +152,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
+	select HAVE_NMI
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..7803b6863c0e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_NMI
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e49502acbab4..198abcb6e510 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -80,6 +80,7 @@ config SPARC64
 	select NO_BOOTMEM
 	select HAVE_ARCH_AUDITSYSCALL
 	select ARCH_SUPPORTS_ATOMIC_RMW
+	select HAVE_NMI
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index a07e31b50d3f..1503ab5882cd 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -28,6 +28,7 @@ config TILE
 	select HAVE_DEBUG_STACKOVERFLOW
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_CONTEXT_TRACKING
+	select HAVE_NMI if USE_PMC
 
 # FIXME: investigate whether we need/want these options.
 # select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 226d5696e1d1..1d1a59e7eaac 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -38,6 +38,7 @@ config X86
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
 	select HAVE_IOREMAP_PROT
+	select HAVE_NMI
 	select HAVE_KPROBES
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..6e633e020f55 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/seq_buf.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -30,35 +29,14 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
 #ifdef arch_trigger_all_cpu_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE		4096
-
-struct nmi_seq_buf {
-	unsigned char		buffer[NMI_BUF_SIZE];
-	struct seq_buf		seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
 
 /* "in progress" flag of arch_trigger_all_cpu_backtrace */
 static unsigned long backtrace_flag;
 
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
-	const char *buf = s->buffer + start;
-
-	printk("%.*s", (end - start) + 1, buf);
-}
-
 void arch_trigger_all_cpu_backtrace(bool include_self)
 {
-	struct nmi_seq_buf *s;
-	int len;
-	int cpu;
-	int i;
 	int this_cpu = get_cpu();
+	int i;
 
 	if (test_and_set_bit(0, &backtrace_flag)) {
 		/*
@@ -73,16 +51,6 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
 	if (!include_self)
 		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
 
-	cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-	/*
-	 * Set up per_cpu seq_buf buffers that the NMIs running on the other
-	 * CPUs will write to.
-	 */
-	for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
-		s = &per_cpu(nmi_print_seq, cpu);
-		seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
-	}
-
 	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
 		pr_info("sending NMI to %s CPUs:\n",
 			(include_self ? "all" : "other"));
@@ -98,57 +66,15 @@
 	}
 
 	/*
-	 * Now that all the NMIs have triggered, we can dump out their
-	 * back traces safely to the console.
+	 * Force flush any remote buffers that might be stuck in IRQ context
+	 * and therefore could not run their irq_work.
 	 */
-	for_each_cpu(cpu, &printtrace_mask) {
-		int last_i = 0;
-
-		s = &per_cpu(nmi_print_seq, cpu);
-		len = seq_buf_used(&s->seq);
-		if (!len)
-			continue;
-
-		/* Print line by line. */
-		for (i = 0; i < len; i++) {
-			if (s->buffer[i] == '\n') {
-				print_seq_line(s, last_i, i);
-				last_i = i + 1;
-			}
-		}
-		/* Check if there was a partial line. */
-		if (last_i < len) {
-			print_seq_line(s, last_i, len - 1);
-			pr_cont("\n");
-		}
-	}
+	printk_nmi_flush();
 
-	clear_bit(0, &backtrace_flag);
-	smp_mb__after_atomic();
+	clear_bit_unlock(0, &backtrace_flag);
 	put_cpu();
 }
 
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
-	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	unsigned int len = seq_buf_used(&s->seq);
-
-	seq_buf_vprintf(&s->seq, fmt, args);
-	return seq_buf_used(&s->seq) - len;
-}
-
 static int
 arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
@@ -157,13 +83,8 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 	cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		printk_func_t printk_func_save = this_cpu_read(printk_func);
-
-		/* Replace printk to write into the NMI seq */
-		this_cpu_write(printk_func, nmi_vprintk);
 		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
 		show_regs(regs);
-		this_cpu_write(printk_func, printk_func_save);
 
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return NMI_HANDLED;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f4af03404b97..8c9040934ea0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -67,10 +67,12 @@ extern void irq_exit(void);
 		preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
 		rcu_nmi_enter();				\
 		trace_hardirq_enter();				\
+		printk_nmi_enter();				\
 	} while (0)
 
 #define nmi_exit()						\
 	do {							\
+		printk_nmi_exit();				\
 		trace_hardirq_exit();				\
 		rcu_nmi_exit();					\
 		BUG_ON(!in_nmi());				\
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index caebf2a758dc..04c68b9f56f8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 	(typeof(type) __percpu *)__alloc_percpu(sizeof(type),		\
 						__alignof__(type))
 
-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
 #endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c9149..05b7f6ed1663 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -120,7 +120,17 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
-typedef int(*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+#else
+static inline void printk_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+#endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec60232..acab5c67d2a9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1445,6 +1445,11 @@ config PRINTK
 	  very difficult to diagnose system problems, saying N here is
 	  strongly discouraged.
 
+config PRINTK_NMI
+	def_bool y
+	depends on PRINTK
+	depends on HAVE_NMI
+
 config BUG
 	bool "BUG() support" if EXPERT
 	default y
diff --git a/init/main.c b/init/main.c
index 2115055faeac..4a28accaaa98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
 	timekeeping_init();
 	time_init();
 	sched_clock_postinit();
+	printk_init();
 	perf_event_init();
 	profile_init();
 	call_function_init();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..640460d881e5 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1821,13 +1821,160 @@ int vprintk_default(const char *fmt, va_list args)
 }
 EXPORT_SYMBOL_GPL(vprintk_default);
 
+#ifdef CONFIG_PRINTK_NMI
+
+typedef int(*printk_func_t)(const char *fmt, va_list args);
 /*
  * This allows printk to be diverted to another function per cpu.
+ *
  * This is useful for calling printk functions from within NMI
  * without worrying about race conditions that can lock up the
  * box.
  */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
+struct nmi_seq_buf {
+	atomic_t		tail;
+	struct irq_work		work;
+	unsigned char		buffer[PAGE_SIZE -
+				       3*sizeof(atomic_t) -
+				       sizeof(struct irq_work)];
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk("%.*s", (end - start) + 1, buf);
+}
+
+static void __printk_nmi_flush(struct irq_work *work)
+{
+	static raw_spinlock_t read_lock = __RAW_SPIN_LOCK_INITIALIZER(read_lock);
+	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+	int len, size, i, last_i;
+
+	/*
+	 * Serialize against the actual printing of the chunks
+	 * to avoid interleaving lines and interleaving readers.
+	 */
+	raw_spin_lock(&read_lock);
+
+	i = 0;
+more:
+	len = atomic_read(&s->tail);
+
+	/*
+	 * we have the read lock; @tail must only increase if
+	 * we printed something
+	 */
+	WARN_ON(len && i >= len);
+
+	if (!len)
+		goto out; /* someone else already flushed the buffer */
+
+	/* make sure that data has been written up to the @tail */
+	smp_rmb();
+
+	size = min_t(int, len, sizeof(s->buffer));
+	last_i = i;
+
+	/* Print line by line. */
+	for (; i < size; i++) {
+		if (s->buffer[i] == '\n') {
+			print_seq_line(s, last_i, i);
+			last_i = i + 1;
+		}
+	}
+	/* Check if there was a partial line. */
+	if (last_i < size) {
+		print_seq_line(s, last_i, len - 1);
+		pr_cont("\n");
+	}
+
+	/*
+	 * Check nothing got added and truncate the buffer.
+	 */
+	if (atomic_cmpxchg(&s->tail, len, 0) != len)
+		goto more;
+
+out:
+	raw_spin_unlock(&read_lock);
+}
+
+/*
+ * flush all per-cpu nmi buffers
+ */
+void printk_nmi_flush(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void printk_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+		init_irq_work(&s->work, __printk_nmi_flush);
+	}
+}
+
+/*
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+	int add = 0, len;
+
+	irq_work_queue(&s->work);
+
+again:
+	len = atomic_read(&s->tail);
+
+	if (len < sizeof(s->buffer)) {
+		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
+				fmt, args);
+		/* write the data before updating the used buffer size */
+		smp_wmb();
+		/* once again if someone flushed the buffer in the meantime */
+		if (atomic_cmpxchg(&s->tail, len, len + add) != len)
+			goto again;
+
+	}
+	return add;
+}
+
+void printk_nmi_enter(void)
+{
+	this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+	this_cpu_write(printk_func, vprintk_default);
+}
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+	return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+	return vprintk_default(fmt, args);
+}
+
+#endif /* PRINTK_NMI */
 
 /**
  * printk - print a kernel message
@@ -1852,21 +1999,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
  */
 asmlinkage __visible int printk(const char *fmt, ...)
 {
-	printk_func_t vprintk_func;
 	va_list args;
 	int r;
 
 	va_start(args, fmt);
-
-	/*
-	 * If a caller overrides the per_cpu printk_func, then it needs
-	 * to disable preemption when calling printk(). Otherwise
-	 * the printk_func should be set to the default. No need to
-	 * disable preemption here.
-	 */
-	vprintk_func = this_cpu_read(printk_func);
 	r = vprintk_func(fmt, args);
-
 	va_end(args);
 
 	return r;
-- 
1.8.5.6