From: "Steven Rostedt (Red Hat)" <[email protected]>
When trigger_all_cpu_backtrace() is called on x86, it will trigger an
NMI on each CPU and call show_regs(). But this can lead to a hard lockup
if the NMI comes in while another printk() is in progress.
In order to avoid this, when the NMI triggers, it switches the printk
routine for that CPU to call an NMI-safe printk function that records the
printk in a per_cpu seq_buf descriptor. After all the NMIs have finished
recording their data, the seq_bufs are printed in a safe context.
Link: http://lkml.kernel.org/p/[email protected]
Link: http://lkml.kernel.org/r/[email protected]
Tested-by: Jiri Kosina <[email protected]>
Acked-by: Jiri Kosina <[email protected]>
Acked-by: Paul E. McKenney <[email protected]>
Reviewed-by: Petr Mladek <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86/kernel/apic/hw_nmi.c | 91 ++++++++++++++++++++++++++++++++++++++++---
1 file changed, 86 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6a1e71bde323..c95c3e9ce196 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,6 +18,7 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
+#include <linux/seq_buf.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -29,14 +30,35 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+static cpumask_var_t printtrace_mask;
+
+#define NMI_BUF_SIZE 4096
+
+struct nmi_seq_buf {
+ unsigned char buffer[NMI_BUF_SIZE];
+ struct seq_buf seq;
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk("%.*s", (end - start) + 1, buf);
+}
+
void arch_trigger_all_cpu_backtrace(bool include_self)
{
+ struct nmi_seq_buf *s;
+ int len;
+ int cpu;
int i;
- int cpu = get_cpu();
+ int this_cpu = get_cpu();
if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -49,7 +71,17 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
if (!include_self)
- cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+ cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
+
+ cpumask_copy(printtrace_mask, to_cpumask(backtrace_mask));
+ /*
+ * Set up per_cpu seq_buf buffers that the NMIs running on the other
+ * CPUs will write to.
+ */
+ for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
+ s = &per_cpu(nmi_print_seq, cpu);
+ seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
+ }
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("sending NMI to %s CPUs:\n",
@@ -65,11 +97,58 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
touch_softlockup_watchdog();
}
+ /*
+ * Now that all the NMIs have triggered, we can dump out their
+ * back traces safely to the console.
+ */
+ for_each_cpu(cpu, printtrace_mask) {
+ int last_i = 0;
+
+ s = &per_cpu(nmi_print_seq, cpu);
+ len = seq_buf_used(&s->seq);
+ if (!len)
+ continue;
+
+ /* Print line by line. */
+ for (i = 0; i < len; i++) {
+ if (s->buffer[i] == '\n') {
+ print_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < len) {
+ print_seq_line(s, last_i, len - 1);
+ pr_cont("\n");
+ }
+ }
+
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
put_cpu();
}
+/*
+ * It is not safe to call printk() directly from NMI handlers.
+ * It may be fine if the NMI detected a lock up and we have no choice
+ * but to do so, but doing a NMI on all other CPUs to get a back trace
+ * can be done with a sysrq-l. We don't want that to lock up, which
+ * can happen if the NMI interrupts a printk in progress.
+ *
+ * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
+ * the content into a per cpu seq_buf buffer. Then when the NMIs are
+ * all done, we can safely dump the contents of the seq_buf to a printk()
+ * from a non NMI context.
+ */
+static int nmi_vprintk(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ unsigned int len = seq_buf_used(&s->seq);
+
+ seq_buf_vprintf(&s->seq, fmt, args);
+ return seq_buf_used(&s->seq) - len;
+}
+
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
@@ -78,12 +157,14 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+ printk_func_t printk_func_save = this_cpu_read(printk_func);
- arch_spin_lock(&lock);
+ /* Replace printk to write into the NMI seq */
+ this_cpu_write(printk_func, nmi_vprintk);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
- arch_spin_unlock(&lock);
+ this_cpu_write(printk_func, printk_func_save);
+
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return NMI_HANDLED;
}
--
2.1.1
On Tue, Nov 18, 2014 at 11:39:20PM -0500, Steven Rostedt wrote:
> static int
> arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
> {
> @@ -78,12 +157,14 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
> cpu = smp_processor_id();
>
> if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
> - static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
> + printk_func_t printk_func_save = this_cpu_read(printk_func);
>
> - arch_spin_lock(&lock);
> + /* Replace printk to write into the NMI seq */
> + this_cpu_write(printk_func, nmi_vprintk);
> printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
> show_regs(regs);
> - arch_spin_unlock(&lock);
> + this_cpu_write(printk_func, printk_func_save);
I'm wondering if this could be used in a generic manner throughout code
where we could say "ok, I'm in an NMI context, so lemme switch printk's
and do some printing" so that NMI and NMI-like atomic contexts could use
printk. Lemme do an mce example:
do_machine_check(..)
{
printk_func_t printk_func_save = this_cpu_read(printk_func);
...
/* in #MC handler, switch printks */
this_cpu_write(printk_func, nmi_vprintk);
printk("This is a hw error, details: ...\n");
/* more bla */
this_cpu_write(printk_func, printk_func_save);
}
or should we change that in entry.S, before we call the handler?
Because, if we could do something like that, then we finally get to use
printk in an NMI context which would be a good thing.
:-)
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
On Wed, 19 Nov 2014, Borislav Petkov wrote:
> I'm wondering if this could be used in a generic manner throughout code
> where we could say "ok, I'm in an NMI context, so lemme switch printk's
> and do some printing" so that NMI and NMI-like atomic contexts could use
> printk. Lemme do an mce example:
>
> do_machine_check(..)
> {
> printk_func_t printk_func_save = this_cpu_read(printk_func);
>
> ...
>
> /* in #MC handler, switch printks */
> this_cpu_write(printk_func, nmi_vprintk);
>
> printk("This is a hw error, details: ...\n");
>
> /* more bla */
>
> this_cpu_write(printk_func, printk_func_save);
> }
>
> or should we change that in entry.S, before we call the handler?
If we are going down this path, do_nmi() should be early enough to do it,
no need to pollute NMI assembly code with this.
--
Jiri Kosina
SUSE Labs
On Wed, Nov 19, 2014 at 11:44:56AM +0100, Jiri Kosina wrote:
> If we are going down this path, do_nmi() should be early enough to do
> it, no need to pollute NMI assembly code with this.
do_nmi() doesn't cover all, like do_machine_check(), do_double_fault(),
and the rest of the handlers. I'm suggesting to do it before any handler
gets run so that you can use printk in the handler itself.
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
On Wed 2014-11-19 11:41:14, Borislav Petkov wrote:
> On Tue, Nov 18, 2014 at 11:39:20PM -0500, Steven Rostedt wrote:
> > static int
> > arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
> > {
> > @@ -78,12 +157,14 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
> > cpu = smp_processor_id();
> >
> > if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
> > - static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
> > + printk_func_t printk_func_save = this_cpu_read(printk_func);
> >
> > - arch_spin_lock(&lock);
> > + /* Replace printk to write into the NMI seq */
> > + this_cpu_write(printk_func, nmi_vprintk);
> > printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
> > show_regs(regs);
> > - arch_spin_unlock(&lock);
> > + this_cpu_write(printk_func, printk_func_save);
>
> I'm wondering if this could be used in a generic manner throughout code
> where we could say "ok, I'm in an NMI context, so lemme switch printk's
> and do some printing" so that NMI and NMI-like atomic contexts could use
> printk. Lemme do an mce example:
>
> do_machine_check(..)
> {
> printk_func_t printk_func_save = this_cpu_read(printk_func);
>
> ...
>
> /* in #MC handler, switch printks */
> this_cpu_write(printk_func, nmi_vprintk);
>
> printk("This is a hw error, details: ...\n");
>
> /* more bla */
>
> this_cpu_write(printk_func, printk_func_save);
> }
>
> or should we change that in entry.S, before we call the handler?
>
> Because, if we could do something like that, then we finally get to use
> printk in an NMI context which would be a good thing.
Unfortunately, the problem is more complicated. The alternative
printk_func() just stores the string into the seq_buf. Then someone
needs to move the data to the main ring buffer, see print_seq_line()
and the related cycle in this patch.
The copying needs to be done in the normal context because it will
need to take the lock for the main ring buffer. Then it will need to
somehow protect the seq_buf against other accesses from NMI context.
In other words, any serious generic solution would end up implementing
an alternative ring buffer and lockless operations. It would be
something similar to my older patch set, see
https://lkml.org/lkml/2014/5/9/118. It was not
accepted and it triggered this solution for NMI backtraces.
Note that this patch works because arch_trigger_all_cpu_backtrace() is
called from normal context and any further calls are ignored until the
current call is finished. So, there is a safe place to copy the data.
I have another idea for the generic solution. Linus was against using
printk() in NMI context, see https://lkml.org/lkml/2014/6/10/388.
The backtraces are the only serious usage. Other than
that there are only a few warnings. My proposal is to handle the
warnings in a similar way to the recursive printk() warning: the recursive
printk() just sets a flag, the message is dropped, and a warning about the
lost message is printed within the next printk() call.
Best Regards,
Petr