I hit an irq flood bug on a powerpc platform, and two years ago, on an x86
platform. When the bug happens, the kernel is totally occupied by irqs.
Currently, there may be nothing, or just a soft lockup warning, shown on the
console. It is better to warn users with irq flood info.
In the kdump case, the kernel can move on by suppressing the irq flood.
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Jisheng Zhang <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Guilherme G. Piccoli" <[email protected]>
Cc: Petr Mladek <[email protected]>
Cc: Marc Zyngier <[email protected]>
Cc: Linus Walleij <[email protected]>
Cc: afzal mohammed <[email protected]>
Cc: Lina Iyer <[email protected]>
Cc: "Gustavo A. R. Silva" <[email protected]>
Cc: Maulik Shah <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Pawan Gupta <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Oliver Neukum <[email protected]>
To: [email protected]
Cc: [email protected]
Cc: [email protected]
Pingfan Liu (3):
kernel/watchdog: show irq percentage if irq floods
kernel/watchdog: suppress max irq when irq floods
Documentation: introduce a param "irqflood_suppress"
Documentation/admin-guide/kernel-parameters.txt | 3 ++
include/linux/irq.h | 2 ++
kernel/irq/spurious.c | 32 +++++++++++++++++
kernel/watchdog.c | 48 +++++++++++++++++++++++++
4 files changed, 85 insertions(+)
--
2.7.5
In the kdump case, the capture kernel has no chance to shut down devices, and
leaves them in an unstable state. An irq flood is one of the consequences.
When irqs flood, the kernel is totally occupied by irqs. Currently, there
may be nothing, or just a soft lockup warning, shown on the console. It is
better to warn users with irq flood info.
The soft lockup watchdog is a good foundation on which to implement the
detector. An irq flood is reported if the following two conditions are met:
-1. irqs occupy too much (98%) of the past interval.
-2. no task has been scheduled in the past interval. This is implemented
via check_hint.
A note: is_softlockup() implies the 2nd condition, but unfortunately, an irq
flood can come from anywhere. If it comes via irq_enter_rcu()->tick_irq_enter(),
then touch_softlockup_watchdog_sched() will reset the watchdog, and the soft
lockup is never detected.
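Before the diff, a standalone sketch of the window arithmetic described
above (plain userspace C; the counter values are invented for illustration,
and the real patch samples kcpustat and running_clock() instead):

#include <stdio.h>

int main(void)
{
	/* Invented cumulative counters (ns) at window start and end: */
	unsigned long long irq_start = 10ULL << 30, irq_end = 34ULL << 30;
	unsigned long long total_start = 12ULL << 30, total_end = 36ULL << 30;

	unsigned long long irqts = irq_end - irq_start;
	unsigned long long totalts = total_end - total_start;
	unsigned long long percent = irqts * 100 / totalts;

	if (percent > 100)
		percent = 100;
	/* Condition 1 from above: irq time >= 98% of the interval. */
	if (percent >= 98)
		printf("Irq flood occupies %llu%% of the past %llu seconds\n",
		       percent, totalts >> 30);
	return 0;
}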
Signed-off-by: Pingfan Liu <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Jisheng Zhang <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Guilherme G. Piccoli" <[email protected]>
Cc: Petr Mladek <[email protected]>
Cc: Marc Zyngier <[email protected]>
Cc: Linus Walleij <[email protected]>
Cc: afzal mohammed <[email protected]>
Cc: Lina Iyer <[email protected]>
Cc: "Gustavo A. R. Silva" <[email protected]>
Cc: Maulik Shah <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Pawan Gupta <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Oliver Neukum <[email protected]>
To: [email protected]
Cc: [email protected]
Cc: [email protected]
---
kernel/watchdog.c | 41 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5abb5b2..230ac38 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/stop_machine.h>
+#include <linux/kernel_stat.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -175,6 +176,13 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static DEFINE_PER_CPU(long, check_hint) = {-1};
+static DEFINE_PER_CPU(unsigned long, last_irq_ts);
+static DEFINE_PER_CPU(unsigned long, last_total_ts);
+#endif
+
static unsigned long soft_lockup_nmi_warn;
static int __init nowatchdog_setup(char *str)
@@ -331,12 +339,43 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
*/
static int softlockup_fn(void *data)
{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ __this_cpu_write(check_hint, -1);
+#endif
__touch_watchdog();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static void check_irq_flood(void)
+{
+ unsigned long irqts, totalts, percent, cnt;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ totalts = running_clock();
+ irqts = cpustat[CPUTIME_IRQ] + cpustat[CPUTIME_SOFTIRQ];
+ cnt = __this_cpu_inc_return(check_hint);
+
+ if (cnt >= 5) {
+ totalts = totalts - __this_cpu_read(last_total_ts);
+ irqts = irqts - __this_cpu_read(last_irq_ts);
+ percent = irqts * 100 / totalts;
+ percent = percent < 100 ? percent : 100;
+ __this_cpu_write(check_hint, -1);
+ if (percent >= 98)
+ pr_info("Irq flood occupies more than %lu%% of the past %lu seconds\n",
+ percent, totalts >> 30);
+ } else if (cnt == 0) {
+ __this_cpu_write(last_total_ts, totalts);
+ __this_cpu_write(last_irq_ts, irqts);
+ }
+}
+#else
+static void check_irq_flood(void){}
+#endif
+
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -348,6 +387,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (!watchdog_enabled)
return HRTIMER_NORESTART;
+ /* When irqs flood, the watchdog may still get touched. Hence this check cannot be done inside the lockup branch. */
+ check_irq_flood();
/* kick the hardlockup detector */
watchdog_interrupt_count();
--
2.7.5
The param "irqflood_suppress" helps the capture kernel survive an irq
flood.
Signed-off-by: Pingfan Liu <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Jisheng Zhang <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Guilherme G. Piccoli" <[email protected]>
Cc: Petr Mladek <[email protected]>
Cc: Marc Zyngier <[email protected]>
Cc: Linus Walleij <[email protected]>
Cc: afzal mohammed <[email protected]>
Cc: Lina Iyer <[email protected]>
Cc: "Gustavo A. R. Silva" <[email protected]>
Cc: Maulik Shah <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Pawan Gupta <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Oliver Neukum <[email protected]>
To: [email protected]
Cc: [email protected]
Cc: [email protected]
---
Documentation/admin-guide/kernel-parameters.txt | 3 +++
1 file changed, 3 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a106874..0a25a05 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2009,6 +2009,9 @@
for it. Also check all handlers each timer
interrupt. Intended to get systems with badly broken
firmware running.
+ irqflood_suppress [HW]
+ When an irq fully occupies a cpu for a long time, suppress
+ it to let the kernel move on. It is useful in the capture kernel.
isapnp= [ISAPNP]
Format: <RDP>,<reset>,<pci_scan>,<verbosity>
--
2.7.5
On Thu, Oct 22 2020 at 13:56, Pingfan Liu wrote:
> I hit an irq flood bug on a powerpc platform, and two years ago, on an x86
> platform. When the bug happens, the kernel is totally occupied by irqs.
> Currently, there may be nothing, or just a soft lockup warning, shown on the
> console. It is better to warn users with irq flood info.
>
> In the kdump case, the kernel can move on by suppressing the irq flood.
You're curing the symptom, not the cause, and the cure is just magic and
can't work reliably.
Where does that irq flood originate from, and why are none of the
mechanisms we have in place to shut it up working?
Thanks,
tglx
The capture kernel should try its best to save the crash info. Normally,
an irq flood is caused by some trivial device, which has no impact on saving
the vmcore.
Introduce a parameter "irqflood_suppress" to enable suppressing an irq flood.
Signed-off-by: Pingfan Liu <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Jisheng Zhang <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Guilherme G. Piccoli" <[email protected]>
Cc: Petr Mladek <[email protected]>
Cc: Marc Zyngier <[email protected]>
Cc: Linus Walleij <[email protected]>
Cc: afzal mohammed <[email protected]>
Cc: Lina Iyer <[email protected]>
Cc: "Gustavo A. R. Silva" <[email protected]>
Cc: Maulik Shah <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Pawan Gupta <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Oliver Neukum <[email protected]>
To: [email protected]
Cc: [email protected]
Cc: [email protected]
---
include/linux/irq.h | 2 ++
kernel/irq/spurious.c | 32 ++++++++++++++++++++++++++++++++
kernel/watchdog.c | 9 ++++++++-
3 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1b7f4df..140cb61 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -684,6 +684,8 @@ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret);
/* Enable/disable irq debugging output: */
extern int noirqdebug_setup(char *str);
+void suppress_max_irq(void);
+
/* Checks whether the interrupt can be requested by request_irq(): */
extern int can_request_irq(unsigned int irq, unsigned long irqflags);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f865e5f..d3d94d6 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -464,3 +464,35 @@ static int __init irqpoll_setup(char *str)
}
__setup("irqpoll", irqpoll_setup);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+static bool irqflood_suppress;
+
+static int __init irqflood_suppress_setup(char *str)
+{
+ irqflood_suppress = true;
+ pr_info("irq flood auto-suppression enabled\n");
+ return 1;
+}
+__setup("irqflood_suppress", irqflood_suppress_setup);
+
+void suppress_max_irq(void)
+{
+ unsigned int tmp, maxirq = 0, max = 0;
+ int irq;
+
+ if (!irqflood_suppress)
+ return;
+ for_each_active_irq(irq) {
+ tmp = kstat_irqs_cpu(irq, smp_processor_id());
+ if (max < tmp) {
+ maxirq = irq;
+ max = tmp;
+ }
+ }
+ pr_warn("Suppress irq:%u, which is triggered %u times\n",
+ maxirq, max);
+ disable_irq_nosync(maxirq);
+}
+#endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 230ac38..28a74e5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
#include <linux/sched/isolation.h>
#include <linux/stop_machine.h>
#include <linux/kernel_stat.h>
+#include <linux/irq.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -364,9 +365,15 @@ static void check_irq_flood(void)
percent = irqts * 100 / totalts;
percent = percent < 100 ? percent : 100;
__this_cpu_write(check_hint, -1);
- if (percent >= 98)
+ if (percent >= 98) {
pr_info("Irq flood occupies more than %lu%% of the past %lu seconds\n",
percent, totalts >> 30);
+ /*
+ * Suppress the top irq when the scheduler has not run for a long
+ * time and irqs occupy too much time.
+ */
+ suppress_max_irq();
+ }
} else if (cnt == 0) {
__this_cpu_write(last_total_ts, totalts);
__this_cpu_write(last_irq_ts, irqts);
--
2.7.5
On Thu, Oct 22, 2020 at 4:37 PM Thomas Gleixner <[email protected]> wrote:
>
> On Thu, Oct 22 2020 at 13:56, Pingfan Liu wrote:
> > I hit an irq flood bug on a powerpc platform, and two years ago, on an x86
> > platform. When the bug happens, the kernel is totally occupied by irqs.
> > Currently, there may be nothing, or just a soft lockup warning, shown on the
> > console. It is better to warn users with irq flood info.
> >
> > In the kdump case, the kernel can move on by suppressing the irq flood.
>
> You're curing the symptom, not the cause, and the cure is just magic and
> can't work reliably.
Yeah, it is magic. But at least it is better to printk something and
alert users about what is happening. With the current code, nothing may
show when the system hangs.
>
> Where does that irq flood originate from, and why are none of the
> mechanisms we have in place to shut it up working?
The bug originates from the driver tpm_i2c_nuvoton, which calls the i2c-bus
driver (i2c-opal.c). After i2c_opal_send_request(), the bug is
triggered.
But things are complicated by an intervening firmware layer: Skiboot.
This software layer hides the details of manipulating the hardware from
Linux.
I guess the software logic cannot enter a sane state when the kernel crashes.
Cc'ing the Skiboot and ppc64 communities to see whether anyone has an idea
about it.
Thanks,
Pingfan
On Sun, Oct 25, 2020 at 10:22 PM Pingfan Liu <[email protected]> wrote:
>
> On Thu, Oct 22, 2020 at 4:37 PM Thomas Gleixner <[email protected]> wrote:
> >
> > On Thu, Oct 22 2020 at 13:56, Pingfan Liu wrote:
> > > I hit an irq flood bug on a powerpc platform, and two years ago, on an x86
> > > platform. When the bug happens, the kernel is totally occupied by irqs.
> > > Currently, there may be nothing, or just a soft lockup warning, shown on the
> > > console. It is better to warn users with irq flood info.
> > >
> > > In the kdump case, the kernel can move on by suppressing the irq flood.
> >
> > You're curing the symptom, not the cause, and the cure is just magic and
> > can't work reliably.
> Yeah, it is magic. But at least it is better to printk something and
> alert users about what is happening. With the current code, nothing may
> show when the system hangs.
> >
> > Where does that irq flood originate from, and why are none of the
> > mechanisms we have in place to shut it up working?
> The bug originates from the driver tpm_i2c_nuvoton, which calls the i2c-bus
> driver (i2c-opal.c). After i2c_opal_send_request(), the bug is
> triggered.
>
> But things are complicated by an intervening firmware layer: Skiboot.
> This software layer hides the details of manipulating the hardware from
> Linux.
>
> I guess the software logic cannot enter a sane state when the kernel crashes.
>
> Cc'ing the Skiboot and ppc64 communities to see whether anyone has an idea
> about it.
What system are you using?
There's an external interrupt pin which is supposed to be wired to the
TPM. I think we bounce that interrupt to FW by default since the
external interrupt is sometimes used for other system-specific
purposes. Odds are FW doesn't know what to do with it so you
effectively have an always-on LSI. I fixed a similar bug a while ago
by having skiboot mask any interrupts it doesn't have a handler for,
but I have no idea when that fix will land in a released FW build. Oh
well.
On Sun, Oct 25, 2020 at 8:21 PM Oliver O'Halloran <[email protected]> wrote:
>
> On Sun, Oct 25, 2020 at 10:22 PM Pingfan Liu <[email protected]> wrote:
> >
> > On Thu, Oct 22, 2020 at 4:37 PM Thomas Gleixner <[email protected]> wrote:
> > >
> > > On Thu, Oct 22 2020 at 13:56, Pingfan Liu wrote:
> > > > I hit an irq flood bug on a powerpc platform, and two years ago, on an x86
> > > > platform. When the bug happens, the kernel is totally occupied by irqs.
> > > > Currently, there may be nothing, or just a soft lockup warning, shown on the
> > > > console. It is better to warn users with irq flood info.
> > > >
> > > > In the kdump case, the kernel can move on by suppressing the irq flood.
> > >
> > > You're curing the symptom, not the cause, and the cure is just magic and
> > > can't work reliably.
> > Yeah, it is magic. But at least it is better to printk something and
> > alert users about what is happening. With the current code, nothing may
> > show when the system hangs.
> > >
> > > Where does that irq flood originate from, and why are none of the
> > > mechanisms we have in place to shut it up working?
> > The bug originates from the driver tpm_i2c_nuvoton, which calls the i2c-bus
> > driver (i2c-opal.c). After i2c_opal_send_request(), the bug is
> > triggered.
> >
> > But things are complicated by an intervening firmware layer: Skiboot.
> > This software layer hides the details of manipulating the hardware from
> > Linux.
> >
> > I guess the software logic cannot enter a sane state when the kernel crashes.
> >
> > Cc'ing the Skiboot and ppc64 communities to see whether anyone has an idea
> > about it.
>
> What system are you using?
Here is the info, if not enough, I will get more.
Product Name : OpenPOWER Firmware
Product Version : open-power-SUPERMICRO-P9DSU-V1.16-20180531-imp
Product Extra : op-build-e4b3eb5
Product Extra : skiboot-v6.0-p1da203b
Product Extra : hostboot-f911e5c-pda8239f
Product Extra : occ-77bb5e6-p623d1cd
Product Extra : linux-4.16.7-openpower2-pbc45895
Product Extra : petitboot-v1.7.1-pf773c0d
Product Extra : machine-xml-218a77a
>
> There's an external interrupt pin which is supposed to be wired to the
> TPM. I think we bounce that interrupt to FW by default since the
> external interrupt is sometimes used for other system-specific
> purposes. Odds are FW doesn't know what to do with it so you
> effectively have an always-on LSI. I fixed a similar bug a while ago
> by having skiboot mask any interrupts it doesn't have a handler for,
This sounds like the root cause. But here Skiboot should have a handler;
otherwise, the first kernel could not run smoothly.
Do you have any idea how an unexpected re-initialization could introduce
such an insane state?
Thanks,
Pingfan
On Mon, Oct 26, 2020 at 12:11 AM Pingfan Liu <[email protected]> wrote:
>
> On Sun, Oct 25, 2020 at 8:21 PM Oliver O'Halloran <[email protected]> wrote:
> >
> > On Sun, Oct 25, 2020 at 10:22 PM Pingfan Liu <[email protected]> wrote:
> > >
> > > On Thu, Oct 22, 2020 at 4:37 PM Thomas Gleixner <[email protected]> wrote:
> > > >
> > > > On Thu, Oct 22 2020 at 13:56, Pingfan Liu wrote:
> > > > > I hit an irq flood bug on a powerpc platform, and two years ago, on an x86
> > > > > platform. When the bug happens, the kernel is totally occupied by irqs.
> > > > > Currently, there may be nothing, or just a soft lockup warning, shown on the
> > > > > console. It is better to warn users with irq flood info.
> > > > >
> > > > > In the kdump case, the kernel can move on by suppressing the irq flood.
> > > >
> > > > You're curing the symptom, not the cause, and the cure is just magic and
> > > > can't work reliably.
> > > Yeah, it is magic. But at least it is better to printk something and
> > > alert users about what is happening. With the current code, nothing may
> > > show when the system hangs.
> > > >
> > > > Where does that irq flood originate from, and why are none of the
> > > > mechanisms we have in place to shut it up working?
> > > The bug originates from the driver tpm_i2c_nuvoton, which calls the i2c-bus
> > > driver (i2c-opal.c). After i2c_opal_send_request(), the bug is
> > > triggered.
> > >
> > > But things are complicated by an intervening firmware layer: Skiboot.
> > > This software layer hides the details of manipulating the hardware from
> > > Linux.
> > >
> > > I guess the software logic cannot enter a sane state when the kernel crashes.
> > >
> > > Cc'ing the Skiboot and ppc64 communities to see whether anyone has an idea
> > > about it.
> >
> > What system are you using?
>
> Here is the info, if not enough, I will get more.
> Product Name : OpenPOWER Firmware
> Product Version : open-power-SUPERMICRO-P9DSU-V1.16-20180531-imp
> Product Extra : op-build-e4b3eb5
> Product Extra : skiboot-v6.0-p1da203b
> Product Extra : hostboot-f911e5c-pda8239f
> Product Extra : occ-77bb5e6-p623d1cd
> Product Extra : linux-4.16.7-openpower2-pbc45895
> Product Extra : petitboot-v1.7.1-pf773c0d
> Product Extra : machine-xml-218a77a
Unfortunately I don't have a schematic for that one.
> > There's an external interrupt pin which is supposed to be wired to the
> > TPM. I think we bounce that interrupt to FW by default since the
> > external interrupt is sometimes used for other system-specific
> > purposes. Odds are FW doesn't know what to do with it so you
> > effectively have an always-on LSI. I fixed a similar bug a while ago
> > by having skiboot mask any interrupts it doesn't have a handler for,
>
> This sounds like the root cause. But here Skiboot should have a handler;
> otherwise, the first kernel could not run smoothly.
I don't know why the TPM interrupt is asserted. If the TPM driver is
polling for a response, it might clear the underlying condition as a
side effect of its normal operation.
> Do you have any idea how an unexpected re-initialization could introduce
> such an insane state?
No idea, but those TPMs have a history of bricking themselves if you
do anything slightly odd to them. It wouldn't surprise me if the
re-probe could cause issues.
> Thanks,
> Pingfan
On Sun, Oct 25, 2020 at 8:12 AM Pingfan Liu <[email protected]> wrote:
>
> On Thu, Oct 22, 2020 at 4:37 PM Thomas Gleixner <[email protected]> wrote:
> >
> > On Thu, Oct 22 2020 at 13:56, Pingfan Liu wrote:
> > > I hit an irq flood bug on a powerpc platform, and two years ago, on an x86
> > > platform. When the bug happens, the kernel is totally occupied by irqs.
> > > Currently, there may be nothing, or just a soft lockup warning, shown on the
> > > console. It is better to warn users with irq flood info.
> > >
> > > In the kdump case, the kernel can move on by suppressing the irq flood.
> >
> > You're curing the symptom, not the cause, and the cure is just magic and
> > can't work reliably.
> Yeah, it is magic. But at least it is better to printk something and
> alert users about what is happening. With the current code, nothing may
> show when the system hangs.
Thanks Pingfan and Thomas for the points - I'd like to have a
mechanism in the kernel to warn users when an IRQ flood is potentially
happening.
Some time ago (2 years) we faced a similar issue on x86-64: a hard-to-debug
problem in kdump that was eventually narrowed down to buggy NIC FW
flooding IRQs in the kdump kernel, with no messages shown (although the
kernel has changed a lot since then; today we might have better IRQ
handling/warning). We tried an early-boot fix, disabling MSIs (as
per the PCI spec) early in x86 boot, but it wasn't accepted - Bjorn asked
pertinent questions that I couldn't answer (I lost the reproducer)
[0].
Cheers,
Guilherme
[0] lore.kernel.org/linux-pci/[email protected]
On Mon, Oct 26 2020 at 17:28, Guilherme Piccoli wrote:
> On Mon, Oct 26, 2020 at 4:59 PM Thomas Gleixner <[email protected]> wrote:
>> It gets flooded right at the point where the crash kernel enables
>> interrupts in start_kernel(). At that point there is no device driver
>> and no interrupt requested. All you can see on the console for this is
>>
>> "common_interrupt: $VECTOR.$CPU No irq handler for vector"
>>
>> And contrary to Liu's patches which try to disable a requested interrupt
>> if too many of them arrive, the kernel cannot do anything because there
>> is nothing to disable in your case. That's why you needed to do the MSI
>> disable magic in the early PCI quirks which run before interrupts get
>> enabled.
>
> Wow, thank you very much for this great explanation (without a
> reproducer) - it's nice to hear from somebody who deeply understands the
> code! And double thanks for CCing Bjorn.
Understanding the code is only half of the picture. You need to
understand how the hardware works or not :)
> So, I don't want to hijack Liu's thread, but do you think it makes
> sense to have my approach as a (debug) parameter to prevent such a
> degenerate case?
At least it makes sense to some extent even if it's incomplete. What
bothers me is that it'd be x86 specific while the issue is pretty much
architecture independent. I don't think that the APIC is special in that
regard. Rogue MSIs should be able to bring down pretty much all
architectures.
> Or could we have something in core IRQ code to prevent irq flooding in
> such scenarios, something "stronger" than disabling MSIs (APIC-level,
> likely)?
For your case? No. The APIC cannot be protected against rogue MSIs. The
only cure is to disable interrupts or disable MSIs on all PCI[E] devices
early on. Disabling interrupts is not so much of an option obviously :)
Thanks,
tglx
On Mon, Oct 26 2020 at 12:06, Guilherme Piccoli wrote:
> On Sun, Oct 25, 2020 at 8:12 AM Pingfan Liu <[email protected]> wrote:
>
> Some time ago (2 years) we faced a similar issue on x86-64: a hard-to-debug
> problem in kdump that was eventually narrowed down to buggy NIC FW
> flooding IRQs in the kdump kernel, with no messages shown (although the
> kernel has changed a lot since then; today we might have better IRQ
> handling/warning). We tried an early-boot fix, disabling MSIs (as
> per the PCI spec) early in x86 boot, but it wasn't accepted - Bjorn asked
> pertinent questions that I couldn't answer (I lost the reproducer)
> [0].
...
> [0] lore.kernel.org/linux-pci/[email protected]
With that broken firmware the NIC continued to send MSI messages to the
vector/CPU which was assigned to it before the crash. But the crash
kernel has no interrupt descriptor for this vector installed. So Liu's
patches won't print anything simply because the interrupt core cannot
detect it.
To answer Bjorn's still-open question about when the point X is:
https://lore.kernel.org/linux-pci/[email protected]/
It gets flooded right at the point where the crash kernel enables
interrupts in start_kernel(). At that point there is no device driver
and no interrupt requested. All you can see on the console for this is
"common_interrupt: $VECTOR.$CPU No irq handler for vector"
And contrary to Liu's patches which try to disable a requested interrupt
if too many of them arrive, the kernel cannot do anything because there
is nothing to disable in your case. That's why you needed to do the MSI
disable magic in the early PCI quirks which run before interrupts get
enabled.
Also Liu's patch only works if:
1) CONFIG_IRQ_TIME_ACCOUNTING is enabled
2) the runaway interrupt has been requested by the relevant driver in
the dump kernel.
Especially #1 is not a sensible restriction.
Thanks,
tglx
On Mon, Oct 26, 2020 at 4:59 PM Thomas Gleixner <[email protected]> wrote:
>
> On Mon, Oct 26 2020 at 12:06, Guilherme Piccoli wrote:
> > On Sun, Oct 25, 2020 at 8:12 AM Pingfan Liu <[email protected]> wrote:
> >
> > Some time ago (2 years) we faced a similar issue on x86-64: a hard-to-debug
> > problem in kdump that was eventually narrowed down to buggy NIC FW
> > flooding IRQs in the kdump kernel, with no messages shown (although the
> > kernel has changed a lot since then; today we might have better IRQ
> > handling/warning). We tried an early-boot fix, disabling MSIs (as
> > per the PCI spec) early in x86 boot, but it wasn't accepted - Bjorn asked
> > pertinent questions that I couldn't answer (I lost the reproducer)
> > [0].
> ...
> > [0] lore.kernel.org/linux-pci/[email protected]
>
> With that broken firmware the NIC continued to send MSI messages to the
> vector/CPU which was assigned to it before the crash. But the crash
> kernel has no interrupt descriptor for this vector installed. So Liu's
> patches won't print anything simply because the interrupt core cannot
> detect it.
>
> To answer Bjorn's still-open question about when the point X is:
>
> https://lore.kernel.org/linux-pci/[email protected]/
>
> It gets flooded right at the point where the crash kernel enables
> interrupts in start_kernel(). At that point there is no device driver
> and no interrupt requested. All you can see on the console for this is
>
> "common_interrupt: $VECTOR.$CPU No irq handler for vector"
>
> And contrary to Liu's patches which try to disable a requested interrupt
> if too many of them arrive, the kernel cannot do anything because there
> is nothing to disable in your case. That's why you needed to do the MSI
> disable magic in the early PCI quirks which run before interrupts get
> enabled.
>
> Also Liu's patch only works if:
>
> 1) CONFIG_IRQ_TIME_ACCOUNTING is enabled
>
> 2) the runaway interrupt has been requested by the relevant driver in
> the dump kernel.
>
> Especially #1 is not a sensible restriction.
>
> Thanks,
>
> tglx
Wow, thank you very much for this great explanation (without a
reproducer) - it's nice to hear from somebody who deeply understands the
code! And double thanks for CCing Bjorn.
So, I don't want to hijack Liu's thread, but do you think it makes
sense to have my approach as a (debug) parameter to prevent such a
degenerate case? Or could we have something in core IRQ code to
prevent irq flooding in such scenarios, something "stronger" than
disabling MSIs (APIC-level, likely)?
Cheers,
Guilherme
On Mon, Oct 26, 2020 at 6:21 PM Thomas Gleixner <[email protected]> wrote:
> [...]
> > So, I don't want to hijack Liu's thread, but do you think it makes
> > sense to have my approach as a (debug) parameter to prevent such a
> > degenerate case?
>
> At least it makes sense to some extent even if it's incomplete. What
> bothers me is that it'd be x86 specific while the issue is pretty much
> architecture independent. I don't think that the APIC is special in that
> regard. Rogue MSIs should be able to bring down pretty much all
> architectures.
>
Thanks Thomas! I partially agree with you; I can speak only for x86
and powerpc. On x86 we know that happens, OK. But on powerpc we had a
special PCI reset - we called it, IIRC, a "fundamental"/PHB reset - a
procedure that would put the PCI devices in good shape; it was something
the kernel could request from FW - see [0] for an example. It was
present in all incarnations of powerpc (bare-metal, powerVM/PHyp - a
virtual thing) except maybe in qemu (although it'd be possible to do
that, since the PCI devices are attached on the host and passed through
via vfio).
Anyway, on powerpc the PCI devices are really reset across
"soft-reboots", be it kexec or what was called a fast reboot (which
skipped some FW initializations), effectively disabling MSIs - x86 has
no such default/vendor-agnostic reset infrastructure; BIOSes usually
do some kind of PCI reset, but with no interface for the kernel to
request that in kexec, for example. That said, the option was to use
the arch code to early-clear the MSI state in all devices, that being
a kind of reset. And it's "supported" by the spec, which says MSIs
should be cleared before devices' initialization =)
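As an illustration of that early-clear idea, here is a minimal sketch in the
style of the x86 early quirks. It assumes the early config-space helpers from
<asm/pci-direct.h> and the register constants from <uapi/linux/pci_regs.h>;
the function itself and whatever bus walk would call it are made up here -
this is not the actual rejected patch:

static void early_pci_clear_msi(u8 bus, u8 slot, u8 func)
{
	u16 status = read_pci_config_16(bus, slot, func, PCI_STATUS);
	u8 pos;

	if (!(status & PCI_STATUS_CAP_LIST))
		return;

	/* Walk the capability list, clearing the MSI/MSI-X enable bits. */
	pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
	while (pos) {
		u8 id = read_pci_config_byte(bus, slot, func, pos + PCI_CAP_LIST_ID);
		u16 ctrl;

		if (id == PCI_CAP_ID_MSI) {
			ctrl = read_pci_config_16(bus, slot, func, pos + PCI_MSI_FLAGS);
			write_pci_config_16(bus, slot, func, pos + PCI_MSI_FLAGS,
					    ctrl & ~PCI_MSI_FLAGS_ENABLE);
		} else if (id == PCI_CAP_ID_MSIX) {
			ctrl = read_pci_config_16(bus, slot, func, pos + PCI_MSIX_FLAGS);
			write_pci_config_16(bus, slot, func, pos + PCI_MSIX_FLAGS,
					    ctrl & ~PCI_MSIX_FLAGS_ENABLE);
		}
		pos = read_pci_config_byte(bus, slot, func, pos + PCI_CAP_LIST_NEXT);
	}
}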
Anyway, I'm glad to discuss more, and I'm even more glad that you
consider the approach useful. We could revive that if Bjorn agrees; I
could respin an updated version. About ARM64/RISC-V or whatever other
architectures I can't say, but I think if they have early-PCI
handlers (and no FW reset, unlike powerpc) it would be possible to
implement that in a more complete way.
> > Or could we have something in core IRQ code to prevent irq flooding in
> > such scenarios, something "stronger" than disabling MSIs (APIC-level,
> > likely)?
>
> For your case? No. The APIC cannot be protected against rogue MSIs. The
> only cure is to disable interrupts or disable MSIs on all PCI[E] devices
> early on. Disabling interrupts is not so much of an option obviously :)
Great to know - we wondered whether it would be possible to have a more
"soft" option, but it seems clearing MSIs is the way to go.
Cheers,
Guilherme
[0] kernel portion:
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/platforms/powernv/pci-ioda.c#n3161
FW portion: github.com/open-power/skiboot/blob/master/core/pci-opal.c#L545
On Wed, Oct 28 2020 at 14:02, Pingfan Liu wrote:
> On Tue, Oct 27, 2020 at 3:59 AM Thomas Gleixner <[email protected]> wrote:
>> Also Liu's patch only works if:
>>
>> 1) CONFIG_IRQ_TIME_ACCOUNTING is enabled
>
> I wonder whether it could be made a default option via the following method:
> DEFINE_STATIC_KEY_FALSE(irqtime_account), enabled according to
> a boot param.
How so?
config IRQ_TIME_ACCOUNTING
depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
> This will have no impact on performance with the disabled branch.
> Meanwhile users can easily turn on the option to detect an irq flood
> without recompiling the kernel.
>
> If it is doable, I will rework only [1/2].
See above :)
>> 2) the runaway interrupt has been requested by the relevant driver in
>> the dump kernel.
>
> Yes, it raises a big challenge to my method. The kdump kernel misses the
> whole picture of the first kernel's irq routing.
Correct. If there is anything stale then you get what Guilherme
observed. But the irq core can do nothing about that.
Something like the completely untested patch below should work independent
of config options.
Thanks,
tglx
---
include/linux/irqdesc.h | 4 ++
kernel/irq/manage.c | 3 +
kernel/irq/spurious.c | 74 +++++++++++++++++++++++++++++++++++-------------
3 files changed, 61 insertions(+), 20 deletions(-)
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -30,6 +30,8 @@ struct pt_regs;
* @tot_count: stats field for non-percpu irqs
* @irq_count: stats field to detect stalled irqs
* @last_unhandled: aging timer for unhandled count
+ * @storm_count: Counter for irq storm detection
+ * @storm_checked: Timestamp for irq storm detection
* @irqs_unhandled: stats field for spurious unhandled interrupts
* @threads_handled: stats field for deferred spurious detection of threaded handlers
* @threads_handled_last: comparator field for deferred spurious detection of threaded handlers
@@ -65,6 +67,8 @@ struct irq_desc {
unsigned int tot_count;
unsigned int irq_count; /* For detecting broken IRQs */
unsigned long last_unhandled; /* Aging timer for unhandled count */
+ unsigned long storm_count;
+ unsigned long storm_checked;
unsigned int irqs_unhandled;
atomic_t threads_handled;
int threads_handled_last;
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1581,6 +1581,9 @@ static int
if (!shared) {
init_waitqueue_head(&desc->wait_for_threads);
+ /* Take a timestamp for interrupt storm detection */
+ desc->storm_checked = jiffies;
+
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc,
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
static int irq_poll_cpu;
static atomic_t irq_poll_active;
+static unsigned long irqstorm_limit __ro_after_init;
/*
* We wait here for a poller to finish.
@@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
*/
-static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret,
+ bool storm)
{
unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
unsigned long flags;
- if (bad_action_ret(action_ret)) {
- printk(KERN_ERR "irq event %d: bogus return value %x\n",
- irq, action_ret);
- } else {
- printk(KERN_ERR "irq %d: nobody cared (try booting with "
+ if (!storm) {
+ if (bad_action_ret(action_ret)) {
+ pr_err("irq event %d: bogus return value %x\n",
+ irq, action_ret);
+ } else {
+ pr_err("irq %d: nobody cared (try booting with "
"the \"irqpoll\" option)\n", irq);
+ }
}
dump_stack();
printk(KERN_ERR "handlers:\n");
@@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de
if (count > 0) {
count--;
- __report_bad_irq(desc, action_ret);
+ __report_bad_irq(desc, action_ret, false);
}
}
@@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru
return action && (action->flags & IRQF_IRQPOLL);
}
+static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t action_ret,
+ const char *reason, bool storm)
+{
+ __report_bad_irq(desc, action_ret, storm);
+ pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc));
+ desc->istate |= IRQS_SPURIOUS_DISABLED;
+ desc->depth++;
+ irq_disable(desc);
+}
+
+/* Interrupt storm detector for runaway interrupts (handled or not). */
+static bool irqstorm_detected(struct irq_desc *desc)
+{
+ unsigned long now = jiffies;
+
+ if (++desc->storm_count < irqstorm_limit) {
+ if (time_after(now, desc->storm_checked + HZ)) {
+ desc->storm_count = 0;
+ desc->storm_checked = now;
+ }
+ return false;
+ }
+
+ disable_stuck_irq(desc, IRQ_NONE, "runaway", true);
+ return true;
+}
+
#define SPURIOUS_DEFERRED 0x80000000
void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
@@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des
desc->irqs_unhandled -= ok;
}
+ if (unlikely(irqstorm_limit && irqstorm_detected(desc)))
+ return;
+
desc->irq_count++;
if (likely(desc->irq_count < 100000))
return;
desc->irq_count = 0;
if (unlikely(desc->irqs_unhandled > 99900)) {
- /*
- * The interrupt is stuck
- */
- __report_bad_irq(desc, action_ret);
- /*
- * Now kill the IRQ
- */
- printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
- desc->istate |= IRQS_SPURIOUS_DISABLED;
- desc->depth++;
- irq_disable(desc);
-
+ disable_stuck_irq(desc, action_ret, "unhandled", false);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
@@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st
"performance\n");
return 1;
}
-
__setup("irqpoll", irqpoll_setup);
+
+static int __init irqstorm_setup(char *arg)
+{
+ int res = kstrtoul(arg, 0, &irqstorm_limit);
+
+ if (!res) {
+ pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
+ irqstorm_limit);
+ }
+ return !!res;
+}
+__setup("irqstorm_limit", irqstorm_setup);
On Wed, Oct 28, 2020 at 12:58:41PM +0100, Thomas Gleixner wrote:
> On Wed, Oct 28 2020 at 14:02, Pingfan Liu wrote:
> > On Tue, Oct 27, 2020 at 3:59 AM Thomas Gleixner <[email protected]> wrote:
> >> Also Liu's patch only works if:
> >>
> >> 1) CONFIG_IRQ_TIME_ACCOUNTING is enabled
> >
> > I wonder whether it could be made a default option via the following method:
> > DEFINE_STATIC_KEY_FALSE(irqtime_account), enabled according to
> > a boot param.
>
> How so?
>
> config IRQ_TIME_ACCOUNTING
> depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
>
Look closely at the two config values:
-1. HAVE_IRQ_TIME_ACCOUNTING is selected by most of the popular arches, and
the requirement can be further relaxed.
It implies that sched_clock() is fast enough for sampling. With the current
code, the variable sched_clock_irqtime=0 can be used to turn off irqtime
accounting on arches with a slow sched_clock(). And it can be done even
better by using DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime) - see the
sketch after this list.
So the prerequisite can be relaxed to "depends on !VIRT_CPU_ACCOUNTING_NATIVE".
In case I cannot express it clearly, could you have a look at the demo patch?
That patch _assumes_ that irqtime accounting costs much and is not turned on
by default. If turned on, it costs one extra jmp compared to the current
implementation.
And I think it is critical to my [1/3] whether this assumption is reasonable.
-2. VIRT_CPU_ACCOUNTING_NATIVE can only be selected by powerpc and ia64.
In fact, I have a separate patch for powerpc with
CONFIG_VIRT_CPU_ACCOUNTING_NATIVE to utilize my [1/3].
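A minimal sketch of that static-key pattern, assuming <linux/jump_label.h>
(the names mirror the demo patch below; this is an illustration, not the
patch itself):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);

/* Called from a boot-param handler; patches the branch sites at runtime. */
void enable_sched_clock_irqtime(void)
{
	static_branch_enable(&sched_clock_irqtime);
}

/* Hot path: a single patched-in nop while the key is false (the default). */
void irqtime_account_irq_sketch(void)
{
	/* Note the negated test: bail out while accounting is disabled. */
	if (!static_branch_unlikely(&sched_clock_irqtime))
		return;
	/* ... accounting work runs only once the boot param enabled it ... */
}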
---
diff --git a/init/Kconfig b/init/Kconfig
index c944691..16d168b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -490,7 +490,7 @@ endchoice
config IRQ_TIME_ACCOUNTING
bool "Fine granularity task level IRQ time accounting"
- depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
+ depends on !VIRT_CPU_ACCOUNTING_NATIVE
help
Select this option to enable fine granularity task irq time
accounting. This is done by reading a timestamp on each
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5a55d23..3ab7e1d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -19,7 +19,7 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static int sched_clock_irqtime;
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
void enable_sched_clock_irqtime(void)
{
@@ -49,13 +49,14 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
*/
void irqtime_account_irq(struct task_struct *curr)
{
- struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ struct irqtime *irqtime;
s64 delta;
int cpu;
- if (!sched_clock_irqtime)
+ if (!static_branch_unlikely(&sched_clock_irqtime))
return;
+ irqtime = this_cpu_ptr(&cpu_irqtime);
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
@@ -84,16 +85,7 @@ static u64 irqtime_tick_accounted(u64 maxtime)
return delta;
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime (0)
-
-static u64 irqtime_tick_accounted(u64 dummy)
-{
- return 0;
-}
-
-#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
static inline void task_group_account_field(struct task_struct *p, int index,
u64 tmp)
@@ -475,7 +467,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;
- if (sched_clock_irqtime) {
+ if (static_branch_unlikely(&sched_clock_irqtime)) {
irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -504,7 +496,7 @@ void account_idle_ticks(unsigned long ticks)
{
u64 cputime, steal;
- if (sched_clock_irqtime) {
+ if (static_branch_unlikely(&sched_clock_irqtime)) {
irqtime_account_idle_ticks(ticks);
return;
}
--
2.7.5
[...]
> +
> +static int __init irqstorm_setup(char *arg)
> +{
> + int res = kstrtoul(arg, 0, &irqstorm_limit);
> +
> + if (!res) {
> + pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
> + irqstorm_limit);
> + }
> + return !!res;
> +}
> +__setup("irqstorm_limit", irqstorm_setup);
This configuration-independent method looks appealing, and I am glad to
give it a try.
But irqstorm_limit may be a hard choice. Maybe by the formula:
instructions-per-cpu-per-second / insn count of the failed irq path? It is
hard to estimate "instructions-per-cpu-per-second", though - a rough
illustration follows.
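A back-of-the-envelope sketch of that formula (standalone C; every figure
is invented for illustration - for comparison, the 70000/s limit reported
later in this thread lands in the same ballpark):

#include <stdio.h>

int main(void)
{
	/* Assumed, purely illustrative figures: */
	double insn_per_sec = 2.0e9;		/* rough per-cpu throughput */
	double insn_per_bad_irq = 2.0e4;	/* cost of one failed-irq round trip */

	/* The rate at which bad irqs alone would saturate the cpu: */
	double saturation = insn_per_sec / insn_per_bad_irq;	/* 100000/s */

	/* Pick a limit below saturation so the detector fires first. */
	printf("suggested irqstorm_limit ~ %.0f/s\n", saturation * 0.7);
	return 0;
}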
Thanks,
Pingfan
On Tue, Oct 27, 2020 at 3:59 AM Thomas Gleixner <[email protected]> wrote:
>
[...]
>
> And contrary to Liu's patches which try to disable a requested interrupt
> if too many of them arrive, the kernel cannot do anything because there
> is nothing to disable in your case. That's why you needed to do the MSI
> disable magic in the early PCI quirks which run before interrupts get
> enabled.
>
> Also Liu's patch only works if:
>
> 1) CONFIG_IRQ_TIME_ACCOUNTING is enabled
I wonder whether it could be made a default option via the following method:
DEFINE_STATIC_KEY_FALSE(irqtime_account), enabled according to
a boot param.
This will have no impact on performance with the disabled branch.
Meanwhile users can easily turn on the option to detect an irq flood
without recompiling the kernel.
If it is doable, I will rework only [1/2].
>
> 2) the runaway interrupt has been requested by the relevant driver in
> the dump kernel.
Yes, it raises a big challenge to my method. The kdump kernel misses the
whole picture of the first kernel's irq routing.
Thanks,
Pingfan
On Wed, Oct 28, 2020 at 7:58 PM Thomas Gleixner <[email protected]> wrote:
>
[...]
> ---
> include/linux/irqdesc.h | 4 ++
> kernel/irq/manage.c | 3 +
> kernel/irq/spurious.c | 74 +++++++++++++++++++++++++++++++++++-------------
> 3 files changed, 61 insertions(+), 20 deletions(-)
>
> --- a/include/linux/irqdesc.h
> +++ b/include/linux/irqdesc.h
> @@ -30,6 +30,8 @@ struct pt_regs;
> * @tot_count: stats field for non-percpu irqs
> * @irq_count: stats field to detect stalled irqs
> * @last_unhandled: aging timer for unhandled count
> + * @storm_count: Counter for irq storm detection
> + * @storm_checked: Timestamp for irq storm detection
> * @irqs_unhandled: stats field for spurious unhandled interrupts
> * @threads_handled: stats field for deferred spurious detection of threaded handlers
> * @threads_handled_last: comparator field for deferred spurious detection of threaded handlers
> @@ -65,6 +67,8 @@ struct irq_desc {
> unsigned int tot_count;
> unsigned int irq_count; /* For detecting broken IRQs */
> unsigned long last_unhandled; /* Aging timer for unhandled count */
> + unsigned long storm_count;
> + unsigned long storm_checked;
> unsigned int irqs_unhandled;
> atomic_t threads_handled;
> int threads_handled_last;
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -1581,6 +1581,9 @@ static int
> if (!shared) {
> init_waitqueue_head(&desc->wait_for_threads);
>
> + /* Take a timestamp for interrupt storm detection */
> + desc->storm_checked = jiffies;
> +
> /* Setup the type (level, edge polarity) if configured: */
> if (new->flags & IRQF_TRIGGER_MASK) {
> ret = __irq_set_trigger(desc,
> --- a/kernel/irq/spurious.c
> +++ b/kernel/irq/spurious.c
> @@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti
> static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
> static int irq_poll_cpu;
> static atomic_t irq_poll_active;
> +static unsigned long irqstorm_limit __ro_after_init;
>
> /*
> * We wait here for a poller to finish.
> @@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu
> * (The other 100-of-100,000 interrupts may have been a correctly
> * functioning device sharing an IRQ with the failing one)
> */
> -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
> +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret,
> + bool storm)
> {
> unsigned int irq = irq_desc_get_irq(desc);
> struct irqaction *action;
> unsigned long flags;
>
> - if (bad_action_ret(action_ret)) {
> - printk(KERN_ERR "irq event %d: bogus return value %x\n",
> - irq, action_ret);
> - } else {
> - printk(KERN_ERR "irq %d: nobody cared (try booting with "
> + if (!storm) {
> + if (bad_action_ret(action_ret)) {
> + pr_err("irq event %d: bogus return value %x\n",
> + irq, action_ret);
> + } else {
> + pr_err("irq %d: nobody cared (try booting with "
> "the \"irqpoll\" option)\n", irq);
> + }
> }
> dump_stack();
> printk(KERN_ERR "handlers:\n");
> @@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de
>
> if (count > 0) {
> count--;
> - __report_bad_irq(desc, action_ret);
> + __report_bad_irq(desc, action_ret, false);
> }
> }
>
> @@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru
> return action && (action->flags & IRQF_IRQPOLL);
> }
>
> +static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t action_ret,
> + const char *reason, bool storm)
> +{
> + __report_bad_irq(desc, action_ret, storm);
> + pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc));
> + desc->istate |= IRQS_SPURIOUS_DISABLED;
> + desc->depth++;
> + irq_disable(desc);
> +}
> +
> +/* Interrupt storm detector for runaway interrupts (handled or not). */
> +static bool irqstorm_detected(struct irq_desc *desc)
> +{
> + unsigned long now = jiffies;
> +
> + if (++desc->storm_count < irqstorm_limit) {
> + if (time_after(now, desc->storm_checked + HZ)) {
> + desc->storm_count = 0;
> + desc->storm_checked = now;
> + }
> + return false;
> + }
> +
> + disable_stuck_irq(desc, IRQ_NONE, "runaway", true);
> + return true;
> +}
> +
> #define SPURIOUS_DEFERRED 0x80000000
>
> void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
> @@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des
> desc->irqs_unhandled -= ok;
> }
>
> + if (unlikely(irqstorm_limit && irqstorm_detected(desc)))
> + return;
> +
> desc->irq_count++;
> if (likely(desc->irq_count < 100000))
> return;
>
> desc->irq_count = 0;
> if (unlikely(desc->irqs_unhandled > 99900)) {
> - /*
> - * The interrupt is stuck
> - */
> - __report_bad_irq(desc, action_ret);
> - /*
> - * Now kill the IRQ
> - */
> - printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
> - desc->istate |= IRQS_SPURIOUS_DISABLED;
> - desc->depth++;
> - irq_disable(desc);
> -
> + disable_stuck_irq(desc, action_ret, "unhandled", false);
> mod_timer(&poll_spurious_irq_timer,
> jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
> }
> @@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st
> "performance\n");
> return 1;
> }
> -
> __setup("irqpoll", irqpoll_setup);
> +
> +static int __init irqstorm_setup(char *arg)
> +{
> + int res = kstrtoul(arg, 0, &irqstorm_limit);
> +
> + if (!res) {
> + pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
> + irqstorm_limit);
> + }
> + return !!res;
> +}
> +__setup("irqstorm_limit", irqstorm_setup);
It should be
__setup("irqstorm_limit=", irqstorm_setup);
And I have tested this patch on the P9 machine, where I set the limit
to 70000. It works for the kdump kernel.
Thanks,
Pingfan
A chipset's pins are often multiplexed, since demand for pins outstrips the
finite pin count. This requires careful configuration in the firmware or OS.
If that goes wrong, it may contribute to most of the irq flood cases, which
appear as a soft lockup issue on Linux.
Strictly speaking, soft lockup and irq flood are different things to overcome,
and it is helpful to make users aware of that situation.
This series shows the irq statistics when a soft lockup happens. The
statistics can be used to evaluate the possibility of an irq flood and as a
rough input to the kernel parameter "irqstorm_limit" in [1].
It is not easy to find a common way to work around an irq flood, which may
be raised by different root causes. For now, it is still an open question.
Thomas and Guilherme suggested patches to suppress the odd irq in different
situations [1][2].
[1]: https://lore.kernel.org/lkml/[email protected]/
[2]: https://lore.kernel.org/linux-pci/[email protected]/
Cc: Thomas Gleixner <[email protected]>
Cc: Jisheng Zhang <[email protected]>
Cc: "Peter Zijlstra (Intel)" <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Guilherme G. Piccoli" <[email protected]>
Cc: Petr Mladek <[email protected]>
Cc: [email protected]
To: [email protected]
Pingfan Liu (3):
x86/irq: account the unused irq
kernel/watchdog: make watchdog_touch_ts more accurate by using
nanosecond
kernel/watchdog: use soft lockup to detect irq flood
arch/x86/kernel/irq.c | 1 +
include/linux/kernel_stat.h | 1 +
kernel/watchdog.c | 37 ++++++++++++++++++++++++++-----------
3 files changed, 28 insertions(+), 11 deletions(-)
--
2.7.5
Account the unused irqs in order to count them during an irq flood.
Signed-off-by: Pingfan Liu <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Jisheng Zhang <[email protected]>
Cc: "Peter Zijlstra (Intel)" <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Guilherme G. Piccoli" <[email protected]>
Cc: Petr Mladek <[email protected]>
Cc: [email protected]
To: [email protected]
---
arch/x86/kernel/irq.c | 1 +
include/linux/kernel_stat.h | 1 +
2 files changed, 2 insertions(+)
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c5dd503..6f583a7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -254,6 +254,7 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
+ __this_cpu_inc(kstat.unused_irqs_sum);
} else {
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
}
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 89f0745..c8d5cb8 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -37,6 +37,7 @@ struct kernel_cpustat {
struct kernel_stat {
unsigned long irqs_sum;
+ unsigned long unused_irqs_sum;
unsigned int softirqs[NR_SOFTIRQS];
};
--
2.7.5
When an irq flood happens, interrupt handlers occupy all of the cpu time.
This results in a situation where a soft lockup can be observed, although
it is different from the design purpose of the soft lockup detector.
In order to distinguish this situation, it is helpful to print out
statistics of the irq frequency when warning about a soft lockup, to
evaluate a potential irq flood.
Thomas and Guilherme suggested patches to suppress the odd irq in different
situations [1][2]. But that seems to remain an open question for the near
future. For now, printing some hints for users is better than nothing.
[1]: https://lore.kernel.org/lkml/[email protected]/
[2]: https://lore.kernel.org/linux-pci/[email protected]/
Signed-off-by: Pingfan Liu <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Jisheng Zhang <[email protected]>
Cc: "Peter Zijlstra (Intel)" <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Guilherme G. Piccoli" <[email protected]>
Cc: Petr Mladek <[email protected]>
Cc: [email protected]
To: [email protected]
---
kernel/watchdog.c | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1cc619a..a0ab2a8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/stop_machine.h>
+#include <linux/kernel_stat.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -175,6 +176,9 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(unsigned long, last_irq_sum);
+static DEFINE_PER_CPU(unsigned long, last_unused_irq_sum);
+
static unsigned long soft_lockup_nmi_warn;
static int __init nowatchdog_setup(char *str)
@@ -353,6 +357,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* kick the softlockup detector */
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
+ __this_cpu_write(last_irq_sum, kstat_this_cpu->irqs_sum);
+ __this_cpu_write(last_unused_irq_sum, kstat_this_cpu->unused_irqs_sum);
reinit_completion(this_cpu_ptr(&softlockup_completion));
stop_one_cpu_nowait(smp_processor_id(),
softlockup_fn, NULL,
@@ -386,6 +392,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
duration = is_softlockup(touch_ts);
if (unlikely(duration)) {
+ unsigned long irq_sum, unused_irq_sum;
+ unsigned int seconds;
+
/*
* If a virtual machine is stopped by the host it can look to
* the watchdog like a soft lockup, check to see if the host
@@ -409,9 +418,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
}
+ irq_sum = kstat_this_cpu->irqs_sum - __this_cpu_read(last_irq_sum);
+ unused_irq_sum = kstat_this_cpu->unused_irqs_sum -
+ __this_cpu_read(last_unused_irq_sum);
+ seconds = (unsigned int)convert_seconds(duration);
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
- smp_processor_id(), (unsigned int)convert_seconds(duration),
+ smp_processor_id(), seconds,
current->comm, task_pid_nr(current));
+ pr_emerg("%lu irqs at rate: %lu / s, %lu unused irq at rate: %lu / s\n",
+ irq_sum, irq_sum/seconds, unused_irq_sum, unused_irq_sum/seconds);
print_modules();
print_irqtrace_events(current);
if (regs)
--
2.7.5
The upcoming statistics of the average irq number will be based on the delta
of watchdog_touch_ts. Improving the resolution of watchdog_touch_ts from
seconds to nanoseconds helps improve the accuracy of the statistics.
Signed-off-by: Pingfan Liu <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Jisheng Zhang <[email protected]>
Cc: "Peter Zijlstra (Intel)" <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "Guilherme G. Piccoli" <[email protected]>
Cc: Petr Mladek <[email protected]>
Cc: [email protected]
To: [email protected]
---
kernel/watchdog.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5abb5b2..1cc619a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -207,7 +207,7 @@ static void __lockup_detector_cleanup(void);
* the thresholds with a factor: we make the soft threshold twice the amount of
* time the hard threshold is.
*/
-static int get_softlockup_thresh(void)
+static unsigned int get_softlockup_thresh(void)
{
return watchdog_thresh * 2;
}
@@ -217,9 +217,9 @@ static int get_softlockup_thresh(void)
* resolution, and we don't need to waste time with a big divide when
* 2^30ns == 1.074s.
*/
-static unsigned long get_timestamp(void)
+static unsigned long convert_seconds(unsigned long ns)
{
- return running_clock() >> 30LL; /* 2^30 ~= 10^9 */
+ return ns >> 30LL; /* 2^30 ~= 10^9 */
}
static void set_sample_period(void)
@@ -238,7 +238,7 @@ static void set_sample_period(void)
/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
- __this_cpu_write(watchdog_touch_ts, get_timestamp());
+ __this_cpu_write(watchdog_touch_ts, running_clock());
}
/**
@@ -289,14 +289,15 @@ void touch_softlockup_watchdog_sync(void)
__this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
-static int is_softlockup(unsigned long touch_ts)
+static unsigned long is_softlockup(unsigned long touch_ts)
{
- unsigned long now = get_timestamp();
+ unsigned long span, now = running_clock();
+ span = now - touch_ts;
if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
/* Warn about unreasonable delays. */
- if (time_after(now, touch_ts + get_softlockup_thresh()))
- return now - touch_ts;
+ if (time_after(convert_seconds(span), (unsigned long)get_softlockup_thresh()))
+ return span;
}
return 0;
}
@@ -340,9 +341,8 @@ static int softlockup_fn(void *data)
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
- unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
+ unsigned long duration, touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
- int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
if (!watchdog_enabled)
@@ -410,7 +410,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
- smp_processor_id(), duration,
+ smp_processor_id(), (unsigned int)convert_seconds(duration),
current->comm, task_pid_nr(current));
print_modules();
print_irqtrace_events(current);
--
2.7.5
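As a standalone illustration of the convert_seconds() trick above: shifting
right by 30 divides by 2^30 (about 1.074 * 10^9), which approximates a divide
by 10^9 without a full 64-bit division, at the cost of roughly 7% error --
harmless for a threshold check. A small sketch with an invented input value:

#include <stdio.h>

static unsigned long convert_seconds(unsigned long long ns)
{
	return (unsigned long)(ns >> 30);	/* 2^30 ~= 10^9 */
}

int main(void)
{
	unsigned long long ns = 45000000000ULL;	/* 45 s expressed in ns */

	printf("exact: %llu s, shifted: %lu s\n",
	       ns / 1000000000ULL, convert_seconds(ns));
	/* prints "exact: 45 s, shifted: 41 s" -- close enough for a watchdog */
	return 0;
}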
Hi Thomas,
> On Wed, Oct 28, 2020 at 12:58:41PM +0100, Thomas Gleixner wrote:
>
<snip>...
> Something like the completely untested below should work independent of
> config options.
>
> Thanks,
>
> tglx
> ---
> include/linux/irqdesc.h | 4 ++
> kernel/irq/manage.c | 3 +
> kernel/irq/spurious.c | 74 +++++++++++++++++++++++++++++++++++-------------
> 3 files changed, 61 insertions(+), 20 deletions(-)
>
> --- a/include/linux/irqdesc.h
> +++ b/include/linux/irqdesc.h
> @@ -30,6 +30,8 @@ struct pt_regs;
> * @tot_count: stats field for non-percpu irqs
> * @irq_count: stats field to detect stalled irqs
> * @last_unhandled: aging timer for unhandled count
> + * @storm_count: Counter for irq storm detection
> + * @storm_checked: Timestamp for irq storm detection
> * @irqs_unhandled: stats field for spurious unhandled interrupts
> * @threads_handled: stats field for deferred spurious detection of threaded handlers
> * @threads_handled_last: comparator field for deferred spurious detection of threaded handlers
> @@ -65,6 +67,8 @@ struct irq_desc {
> unsigned int tot_count;
> unsigned int irq_count; /* For detecting broken IRQs */
> unsigned long last_unhandled; /* Aging timer for unhandled count */
> + unsigned long storm_count;
> + unsigned long storm_checked;
> unsigned int irqs_unhandled;
> atomic_t threads_handled;
> int threads_handled_last;
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -1581,6 +1581,9 @@ static int
> if (!shared) {
> init_waitqueue_head(&desc->wait_for_threads);
>
> + /* Take a timestamp for interrupt storm detection */
> + desc->storm_checked = jiffies;
> +
> /* Setup the type (level, edge polarity) if configured: */
> if (new->flags & IRQF_TRIGGER_MASK) {
> ret = __irq_set_trigger(desc,
> --- a/kernel/irq/spurious.c
> +++ b/kernel/irq/spurious.c
> @@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti
> static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
> static int irq_poll_cpu;
> static atomic_t irq_poll_active;
> +static unsigned long irqstorm_limit __ro_after_init;
>
> /*
> * We wait here for a poller to finish.
> @@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu
> * (The other 100-of-100,000 interrupts may have been a correctly
> * functioning device sharing an IRQ with the failing one)
> */
> -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
> +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret,
> + bool storm)
> {
> unsigned int irq = irq_desc_get_irq(desc);
> struct irqaction *action;
> unsigned long flags;
>
> - if (bad_action_ret(action_ret)) {
> - printk(KERN_ERR "irq event %d: bogus return value %x\n",
> - irq, action_ret);
> - } else {
> - printk(KERN_ERR "irq %d: nobody cared (try booting with "
> + if (!storm) {
> + if (bad_action_ret(action_ret)) {
> + pr_err("irq event %d: bogus return value %x\n",
> + irq, action_ret);
> + } else {
> + pr_err("irq %d: nobody cared (try booting with "
> "the \"irqpoll\" option)\n", irq);
> + }
> }
> dump_stack();
> printk(KERN_ERR "handlers:\n");
> @@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de
>
> if (count > 0) {
> count--;
> - __report_bad_irq(desc, action_ret);
> + __report_bad_irq(desc, action_ret, false);
> }
> }
>
> @@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru
> return action && (action->flags & IRQF_IRQPOLL);
> }
>
> +static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t action_ret,
> + const char *reason, bool storm)
> +{
> + __report_bad_irq(desc, action_ret, storm);
> + pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc));
> + desc->istate |= IRQS_SPURIOUS_DISABLED;
> + desc->depth++;
> + irq_disable(desc);
> +}
> +
> +/* Interrupt storm detector for runaway interrupts (handled or not). */
> +static bool irqstorm_detected(struct irq_desc *desc)
> +{
> + unsigned long now = jiffies;
> +
> + if (++desc->storm_count < irqstorm_limit) {
> + if (time_after(now, desc->storm_checked + HZ)) {
> + desc->storm_count = 0;
> + desc->storm_checked = now;
> + }
> + return false;
> + }
> +
> + disable_stuck_irq(desc, IRQ_NONE, "runaway", true);
> + return true;
> +}
> +
> #define SPURIOUS_DEFERRED 0x80000000
>
> void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
> @@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des
> desc->irqs_unhandled -= ok;
> }
>
> + if (unlikely(irqstorm_limit && irqstorm_detected(desc)))
> + return;
> +
> desc->irq_count++;
> if (likely(desc->irq_count < 100000))
> return;
>
> desc->irq_count = 0;
> if (unlikely(desc->irqs_unhandled > 99900)) {
> - /*
> - * The interrupt is stuck
> - */
> - __report_bad_irq(desc, action_ret);
> - /*
> - * Now kill the IRQ
> - */
> - printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
> - desc->istate |= IRQS_SPURIOUS_DISABLED;
> - desc->depth++;
> - irq_disable(desc);
> -
> + disable_stuck_irq(desc, action_ret, "unhandled", false);
> mod_timer(&poll_spurious_irq_timer,
> jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
> }
> @@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st
> "performance\n");
> return 1;
> }
> -
> __setup("irqpoll", irqpoll_setup);
> +
> +static int __init irqstorm_setup(char *arg)
> +{
> + int res = kstrtoul(arg, 0, &irqstorm_limit);
> +
> + if (!res) {
> + pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
> + irqstorm_limit);
> + }
> + return !!res;
> +}
> +__setup("irqstorm_limit", irqstorm_setup);
This irq storm detection feature is very useful; any chance of getting it
merged? We would be happy to test. People seem to be maintaining their own
out-of-tree copies of such a feature [1].
[1] https://elinux.org/images/d/de/Oct28_InterruptStormDetectionFeature_KentoKobayashi.pdf
Thanks,
Sai
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
Hi Thomas,
On 2021-03-02 13:15, Sai Prakash Ranjan wrote:
> Hi Thomas,
>
>> On Wed, Oct 28, 2020 at 12:58:41PM +0100, Thomas Gleixner wrote:
>>
>
> <snip>...
>
>> Something like the completely untested below should work independent of
>> config options.
>>
>> <snip>...
>
> This irq storm detection feature is very useful; any chance of getting it
> merged? We would be happy to test. People seem to be maintaining their own
> out-of-tree copies of such a feature [1].
>
> [1]
> https://elinux.org/images/d/de/Oct28_InterruptStormDetectionFeature_KentoKobayashi.pdf
>
Any chance of getting this useful debug feature into the upstream kernel?
Thanks,
Sai
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation