Subject: [tip:perf/core] x86/nmi: Push duration printk() to irq context

Commit-ID: e90c78535283dd0ded3bf2e484890d14dba2d527
Gitweb: http://git.kernel.org/tip/e90c78535283dd0ded3bf2e484890d14dba2d527
Author: Peter Zijlstra <[email protected]>
AuthorDate: Mon, 3 Feb 2014 18:02:09 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Sun, 9 Feb 2014 13:17:22 +0100

x86/nmi: Push duration printk() to irq context

Calling printk() from NMI context is bad (TM), so move it to IRQ
context.

In doing so we slightly change (probably wreck) the debugfs
nmi_longest_ns thingy, in that it doesn't update to reflect the
longest, nor does writing to it reset the count.

Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Don Zickus <[email protected]>
Cc: Dave Hansen <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/nmi.h | 3 +++
arch/x86/kernel/nmi.c | 37 ++++++++++++++++++++++++-------------
2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 86f9301..5f2fc44 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_NMI_H
#define _ASM_X86_NMI_H

+#include <linux/irq_work.h>
#include <linux/pm.h>
#include <asm/irq.h>
#include <asm/io.h>
@@ -38,6 +39,8 @@ typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
struct nmiaction {
struct list_head list;
nmi_handler_t handler;
+ u64 max_duration;
+ struct irq_work irq_work;
unsigned long flags;
const char *name;
};
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 6fcb49c..b4872b9 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -87,6 +87,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])

static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
static int __init nmi_warning_debugfs(void)
{
debugfs_create_u64("nmi_longest_ns", 0644,
@@ -95,6 +96,20 @@ static int __init nmi_warning_debugfs(void)
}
fs_initcall(nmi_warning_debugfs);

+static void nmi_max_handler(struct irq_work *w)
+{
+ struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
+ int remainder_ns, decimal_msecs;
+ u64 whole_msecs = ACCESS_ONCE(a->max_duration);
+
+ remainder_ns = do_div(whole_msecs, (1000 * 1000));
+ decimal_msecs = remainder_ns / 1000;
+
+ printk_ratelimited(KERN_INFO
+ "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+ a->handler, whole_msecs, decimal_msecs);
+}
+
static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
@@ -110,26 +125,20 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
* to handle those situations.
*/
list_for_each_entry_rcu(a, &desc->head, list) {
- u64 before, delta, whole_msecs;
- int remainder_ns, decimal_msecs, thishandled;
+ int thishandled;
+ u64 delta;

- before = sched_clock();
+ delta = sched_clock();
thishandled = a->handler(type, regs);
handled += thishandled;
- delta = sched_clock() - before;
+ delta = sched_clock() - delta;
trace_nmi_handler(a->handler, (int)delta, thishandled);

- if (delta < nmi_longest_ns)
+ if (delta < nmi_longest_ns || delta < a->max_duration)
continue;

- nmi_longest_ns = delta;
- whole_msecs = delta;
- remainder_ns = do_div(whole_msecs, (1000 * 1000));
- decimal_msecs = remainder_ns / 1000;
- printk_ratelimited(KERN_INFO
- "INFO: NMI handler (%ps) took too long to run: "
- "%lld.%03d msecs\n", a->handler, whole_msecs,
- decimal_msecs);
+ a->max_duration = delta;
+ irq_work_queue(&a->irq_work);
}

rcu_read_unlock();
@@ -146,6 +155,8 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
if (!action->handler)
return -EINVAL;

+ init_irq_work(&action->irq_work, nmi_max_handler);
+
spin_lock_irqsave(&desc->lock, flags);

/*


2014-02-10 16:46:21

by Dave Hansen

[permalink] [raw]
Subject: Re: [tip:perf/core] x86/nmi: Push duration printk() to irq context

On 02/10/2014 05:29 AM, tip-bot for Peter Zijlstra wrote:
> x86/nmi: Push duration printk() to irq context
>
> Calling printk() from NMI context is bad (TM), so move it to IRQ
> context.

Bad since the I/O device that we're doing it to may be slow and make the
NMI painfully long?

I can see why it might be a bad idea, but I'm unsold that it is
*universally* a bad idea.

> In doing so we slightly change (probably wreck) the debugfs
> nmi_longest_ns thingy, in that it doesn't update to reflect the
> longest, nor does writing to it reset the count.

The reason I coded this up was that NMIs were firing off so fast that
nothing else was getting a chance to run. With this patch, at least the
printk() would come out and I'd have some idea what was going on.

2014-02-10 17:05:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:perf/core] x86/nmi: Push duration printk() to irq context

On Mon, Feb 10, 2014 at 08:45:16AM -0800, Dave Hansen wrote:
> On 02/10/2014 05:29 AM, tip-bot for Peter Zijlstra wrote:
> > x86/nmi: Push duration printk() to irq context
> >
> > Calling printk() from NMI context is bad (TM), so move it to IRQ
> > context.
>
> Bad since the I/O device that we're doing it to may be slow and make the
> NMI painfully long?
>
> I can see why it might be a bad idea, but I'm unsold that it is
> *universally* a bad idea.

printk() takes locks; an NMI can interrupt a CPU while it holds the printk
lock and then try to printk itself. You get to keep the pieces.



> > In doing so we slightly change (probably wreck) the debugfs
> > nmi_longest_ns thingy, in that it doesn't update to reflect the
> > longest, nor does writing to it reset the count.
>
> The reason I coded this up was that NMIs were firing off so fast that
> nothing else was getting a chance to run. With this patch, at least the
> printk() would come out and I'd have some idea what was going on.

Ah indeed, in which case you don't get to run the irq_work. But you
really shouldn't do printk() from NMI and esp. not in a case that
regularly and reliably triggers for everyone and is not a hardware
error.

2014-02-11 15:01:29

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:perf/core] x86/nmi: Push duration printk() to irq context

On Mon, Feb 10, 2014 at 08:45:16AM -0800, Dave Hansen wrote:
> The reason I coded this up was that NMIs were firing off so fast that
> nothing else was getting a chance to run. With this patch, at least the
> printk() would come out and I'd have some idea what was going on.

How's something like this on top?

It will start spewing to early_printk() (which is a lot nicer to use
from NMI context too) when it fails to queue the IRQ-work because it's
already enqueued.

It does have a false positive for when two CPUs trigger the warning
concurrently, but that should be rare and some extra clutter on the
early printk shouldn't be a problem.

---
include/linux/irq_work.h | 2 +-
kernel/events/core.c | 9 +++++++--
kernel/irq_work.c | 6 ++++--
3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index add13c8..19ae05d 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -32,7 +32,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))

#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }

-void irq_work_queue(struct irq_work *work);
+bool irq_work_queue(struct irq_work *work);
void irq_work_run(void);
void irq_work_sync(struct irq_work *work);

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2067cbb..45e5543 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -243,7 +243,7 @@ static void perf_duration_warn(struct irq_work *w)
printk_ratelimited(KERN_WARNING
"perf interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns,
+ avg_local_sample_len, allowed_ns >> 1,
sysctl_perf_event_sample_rate);
}

@@ -283,7 +283,12 @@ void perf_sample_event_took(u64 sample_len_ns)

update_perf_cpu_limits();

- irq_work_queue(&perf_duration_work);
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
}

static atomic64_t perf_event_id;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6..a82170e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void)
*
* Can be re-enqueued while the callback is still in progress.
*/
-void irq_work_queue(struct irq_work *work)
+bool irq_work_queue(struct irq_work *work)
{
/* Only queue if not already pending */
if (!irq_work_claim(work))
- return;
+ return false;

/* Queue the entry and raise the IPI if needed. */
preempt_disable();
@@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work)
}

preempt_enable();
+
+ return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);

2014-02-11 16:44:31

by Dave Hansen

[permalink] [raw]
Subject: Re: [tip:perf/core] x86/nmi: Push duration printk() to irq context

On 02/11/2014 07:01 AM, Peter Zijlstra wrote:
> - irq_work_queue(&perf_duration_work);
> + if (!irq_work_queue(&perf_duration_work)) {
> + early_printk("perf interrupt took too long (%lld > %lld), lowering "
> + "kernel.perf_event_max_sample_rate to %d\n",
> + avg_local_sample_len, allowed_ns >> 1,
> + sysctl_perf_event_sample_rate);
> + }
> }

That's a cool trick. Works for me.

Subject: [tip:perf/core] perf/x86: Warn to early_printk() in case irq_work is too slow

Commit-ID: cd578abb24aa67ce468c427d3356c08ea32cf768
Gitweb: http://git.kernel.org/tip/cd578abb24aa67ce468c427d3356c08ea32cf768
Author: Peter Zijlstra <[email protected]>
AuthorDate: Tue, 11 Feb 2014 16:01:16 +0100
Committer: Thomas Gleixner <[email protected]>
CommitDate: Fri, 21 Feb 2014 21:49:07 +0100

perf/x86: Warn to early_printk() in case irq_work is too slow

On Mon, Feb 10, 2014 at 08:45:16AM -0800, Dave Hansen wrote:
> The reason I coded this up was that NMIs were firing off so fast that
> nothing else was getting a chance to run. With this patch, at least the
> printk() would come out and I'd have some idea what was going on.

It will start spewing to early_printk() (which is a lot nicer to use
from NMI context too) when it fails to queue the IRQ-work because it's
already enqueued.

It does have a false positive for when two CPUs trigger the warning
concurrently, but that should be rare and some extra clutter on the
early printk shouldn't be a problem.

Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: Dave Hansen <[email protected]>
Cc: [email protected]
Fixes: 6a02ad66b2c4 ("perf/x86: Push the duration-logging printk() to IRQ context")
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
include/linux/irq_work.h | 2 +-
kernel/events/core.c | 9 +++++++--
kernel/irq_work.c | 6 ++++--
3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index add13c8..19ae05d 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -32,7 +32,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))

#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }

-void irq_work_queue(struct irq_work *work);
+bool irq_work_queue(struct irq_work *work);
void irq_work_run(void);
void irq_work_sync(struct irq_work *work);

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2067cbb..45e5543 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -243,7 +243,7 @@ static void perf_duration_warn(struct irq_work *w)
printk_ratelimited(KERN_WARNING
"perf interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns,
+ avg_local_sample_len, allowed_ns >> 1,
sysctl_perf_event_sample_rate);
}

@@ -283,7 +283,12 @@ void perf_sample_event_took(u64 sample_len_ns)

update_perf_cpu_limits();

- irq_work_queue(&perf_duration_work);
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
}

static atomic64_t perf_event_id;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6..a82170e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void)
*
* Can be re-enqueued while the callback is still in progress.
*/
-void irq_work_queue(struct irq_work *work)
+bool irq_work_queue(struct irq_work *work)
{
/* Only queue if not already pending */
if (!irq_work_claim(work))
- return;
+ return false;

/* Queue the entry and raise the IPI if needed. */
preempt_disable();
@@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work)
}

preempt_enable();
+
+ return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);