2022-10-11 06:04:22

by Zhang Xincheng

[permalink] [raw]
Subject: [PATCH v2] interrupt: debug for discovering frequent interrupts

In some cases, a peripheral's interrupt is triggered so frequently that
the CPU spends all of its time processing it, which eventually causes
RCU to report an rcu_sched self-detected stall on that CPU. This patch
provides a way to discover and report which interrupt is causing the problem.

Signed-off-by: Zhang Xincheng <[email protected]>
---
include/linux/irqdesc.h | 5 ++
kernel/irq/Kconfig | 25 +++++++++
kernel/irq/spurious.c | 121 +++++++++++++++++++++++++++++++++++++---
3 files changed, 144 insertions(+), 7 deletions(-)

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 1cd4e36890fb..f82b138c68d6 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -102,6 +102,11 @@ struct irq_desc {
int parent_irq;
struct module *owner;
const char *name;
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+ bool have_reported;
+ u32 gap_count;
+ u64 gap_time;
+#endif
} ____cacheline_internodealigned_in_smp;

#ifdef CONFIG_SPARSE_IRQ
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..0b666ef51a08 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -137,6 +137,31 @@ config GENERIC_IRQ_DEBUGFS

If you don't know what to do here, say N.

+config FREQUENT_IRQ_DEBUG
+	bool "Detect and report interrupts that fire too frequently"
+	help
+	  Count how often each interrupt fires and report an interrupt
+	  that keeps firing faster than the configured limit. The
+	  thresholds are set with COUNT_PER_SECOND and DURATION_LIMIT and
+	  can be overridden on the kernel command line with
+	  count_per_second= and duration_limit=. disable_frequent_irq=
+	  selects whether a reported interrupt is also disabled.
+
+	  If you don't know what to do here, say N.
+
+config COUNT_PER_SECOND
+	int "Interrupt limit per second"
+	depends on FREQUENT_IRQ_DEBUG
+	range 1 65535
+	default "2000"
+	help
+	  Number of interrupts per second at which an interrupt line is
+	  considered to be firing too frequently. The per-second counter
+	  is 16 bits wide, so the maximum is 65535.
+
+config DURATION_LIMIT
+	int "Duration limit in seconds"
+	depends on FREQUENT_IRQ_DEBUG
+	range 1 65535
+	default "30"
+	help
+	  Number of consecutive seconds for which an interrupt has to
+	  reach the per-second limit before it is reported. The counter
+	  is 16 bits wide, so the maximum is 65535.
+
endmenu

config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..74f3833aedd3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -22,6 +22,16 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
static int irq_poll_cpu;
static atomic_t irq_poll_active;

+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+#define COUNT_PER_SECOND_MASK 0x0000ffff /* low 16 bits of desc->gap_count: interrupt counter */
+#define DURATION_LIMIT_MASK 0xffff0000 /* high 16 bits of desc->gap_count: duration counter */
+#define DURATION_LIMIT_COUNT 0x00010000 /* adding this bumps the duration counter by one */
+#define DURATION_LIMIT_OFFSET 16 /* shift that extracts the duration counter */
+static unsigned int count_per_second = CONFIG_COUNT_PER_SECOND; /* threshold for the interrupt counter */
+static unsigned int duration_limit = CONFIG_DURATION_LIMIT; /* threshold for the duration counter */
+static bool disable_frequent_irq; /* when true, a reported interrupt is also disabled */
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
+
/*
* We wait here for a poller to finish.
*
@@ -189,18 +199,16 @@ static inline int bad_action_ret(irqreturn_t action_ret)
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
*/
-static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret, const char *msg)
{
unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
unsigned long flags;

if (bad_action_ret(action_ret)) {
- printk(KERN_ERR "irq event %d: bogus return value %x\n",
- irq, action_ret);
+ printk(msg, irq, action_ret);
} else {
- printk(KERN_ERR "irq %d: nobody cared (try booting with "
- "the \"irqpoll\" option)\n", irq);
+ printk(msg, irq);
}
dump_stack();
printk(KERN_ERR "handlers:\n");
@@ -228,7 +236,7 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)

if (count > 0) {
count--;
- __report_bad_irq(desc, action_ret);
+ __report_bad_irq(desc, action_ret, KERN_ERR "irq event %d: bogus return value %x\n");
}
}

@@ -269,6 +277,46 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,

#define SPURIOUS_DEFERRED 0x80000000

+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+/*
+ * Some bad hardware triggers interrupts so frequently that the CPU ends up
+ * processing hardware interrupts all the time. Detect an interrupt that
+ * reaches count_per_second interrupts in each of duration_limit consecutive
+ * one-second windows, report it once and, if disable_frequent_irq is set,
+ * disable it.
+ *
+ * desc->gap_time is the jiffies value at which the current window started.
+ * desc->gap_count packs two 16-bit counters: the low half counts interrupts
+ * seen in the current window, the high half counts consecutive windows that
+ * reached count_per_second.
+ *
+ * @desc:	descriptor of the interrupt that just fired
+ * @action_ret:	handler result, only passed on to __report_bad_irq()
+ */
+static void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{
+	u32 hits, streak;
+	u64 now, elapsed;
+
+	if (desc->have_reported)
+		return;
+
+	now = get_jiffies_64();
+	elapsed = now - desc->gap_time;
+	hits = desc->gap_count & COUNT_PER_SECOND_MASK;
+	streak = desc->gap_count >> DURATION_LIMIT_OFFSET;
+
+	if (elapsed >= HZ) {
+		/*
+		 * The window is over. The streak only continues if that
+		 * window was busy enough and this interrupt arrives within
+		 * the directly following second; a longer gap means at
+		 * least one second passed without any interrupt.
+		 */
+		if (hits >= count_per_second && elapsed < 2 * HZ) {
+			/* Saturate instead of overflowing the 16-bit field. */
+			if (streak < (DURATION_LIMIT_MASK >> DURATION_LIMIT_OFFSET))
+				streak++;
+		} else {
+			streak = 0;
+		}
+		hits = 0;
+		desc->gap_time = now;
+	}
+
+	/* Saturate so the interrupt counter never carries into the streak. */
+	if (hits < COUNT_PER_SECOND_MASK)
+		hits++;
+
+	desc->gap_count = (streak << DURATION_LIMIT_OFFSET) | hits;
+
+	/* Require at least one complete busy window, even for a limit of 0. */
+	if (!streak || streak < duration_limit)
+		return;
+
+	__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too frequently\n");
+	desc->have_reported = true;
+	if (disable_frequent_irq)
+		irq_disable(desc);
+}
+
+#else
+/* Frequent interrupt detection is compiled out: nothing to account. */
+static inline void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{
+}
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
+
void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
{
unsigned int irq;
@@ -282,6 +330,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
return;
}

+ report_frequent_irq(desc, action_ret);
+
/*
* We cannot call note_interrupt from the threaded handler
* because we need to look at the compound of all handlers
@@ -416,7 +466,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
/*
* The interrupt is stuck
*/
- __report_bad_irq(desc, action_ret);
+ __report_bad_irq(desc, action_ret, KERN_ERR "irq %d: nobody cared (try "
+ "bootingwith the \"irqpoll\" option)\n");
/*
* Now kill the IRQ
*/
@@ -476,3 +527,59 @@ static int __init irqpoll_setup(char *str)
}

__setup("irqpoll", irqpoll_setup);
+
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+/*
+ * Parse the "count_per_second=" boot parameter.
+ *
+ * The value is only accepted if it is a decimal number that fits the
+ * 16-bit interrupt counter; anything else leaves count_per_second
+ * untouched. Returns 1 if the parameter was consumed, 0 otherwise.
+ */
+static int __init count_per_second_setup(char *str)
+{
+	unsigned int limit;
+
+	if (kstrtouint(str, 10, &limit) || limit > COUNT_PER_SECOND_MASK)
+		return 0;
+
+	count_per_second = limit;
+	printk(KERN_INFO "Interrupt limit per second: %u\n", count_per_second);
+
+	return 1;
+}
+
+__setup("count_per_second=", count_per_second_setup);
+module_param(count_per_second, uint, 0644);
+MODULE_PARM_DESC(count_per_second, "Interrupt limit per second. (Max 65535)");
+
+/*
+ * Parse the "duration_limit=" boot parameter.
+ *
+ * The value is only accepted if it is a decimal number that fits the
+ * 16-bit duration counter; anything else leaves duration_limit
+ * untouched. Returns 1 if the parameter was consumed, 0 otherwise.
+ */
+static int __init duration_limit_setup(char *str)
+{
+	unsigned int limit;
+
+	if (kstrtouint(str, 10, &limit) ||
+	    limit > (DURATION_LIMIT_MASK >> DURATION_LIMIT_OFFSET))
+		return 0;
+
+	duration_limit = limit;
+	printk(KERN_INFO "Duration limit: %u\n", duration_limit);
+
+	return 1;
+}
+
+__setup("duration_limit=", duration_limit_setup);
+module_param(duration_limit, uint, 0644);
+MODULE_PARM_DESC(duration_limit, "Number of consecutive seconds the per-second interrupt limit has to be reached before reporting. (Max 65535)");
+
+/*
+ * Parse the "disable_frequent_irq=" boot parameter.
+ *
+ * Accepts whatever kstrtobool() accepts and logs the resulting setting.
+ * Returns 1 if the parameter was consumed, 0 if it could not be parsed.
+ */
+static int __init disable_frequent_irq_setup(char *str)
+{
+	if (kstrtobool(str, &disable_frequent_irq))
+		return 0;
+
+	if (disable_frequent_irq)
+		printk(KERN_INFO "Disable frequent irq\n");
+	else
+		printk(KERN_INFO "Don't disable frequent irq\n");
+
+	return 1;
+}
+
+__setup("disable_frequent_irq=", disable_frequent_irq_setup);
+module_param(disable_frequent_irq, bool, 0644);
+MODULE_PARM_DESC(disable_frequent_irq, "Disable frequent irq when true");
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
--
2.20.1



2022-10-11 08:27:37

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v2] interrupt: debug for discovering frequent interrupts

Hi Zhang,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on tip/irq/core]
[also build test WARNING on linus/master v6.0 next-20221011]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Zhang-Xincheng/interrupt-debug-for-discovering-frequent-interrupts/20221011-135800
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 36de4f94197318e45ba77badb5b07274f5bc72a9
config: um-i386_defconfig
compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/70d1ccdeb3cc035e44adfd98df109d31ed9af116
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Zhang-Xincheng/interrupt-debug-for-discovering-frequent-interrupts/20221011-135800
git checkout 70d1ccdeb3cc035e44adfd98df109d31ed9af116
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash kernel/irq/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

>> kernel/irq/spurious.c:315:6: warning: no previous prototype for 'report_frequent_irq' [-Wmissing-prototypes]
315 | void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
| ^~~~~~~~~~~~~~~~~~~


vim +/report_frequent_irq +315 kernel/irq/spurious.c

313
314 #else
> 315 void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
316 {
317 }
318 #endif /* CONFIG_FREQUENT_IRQ_DEBUG */
319

--
0-DAY CI Kernel Test Service
https://01.org/lkp


Attachments:
(No filename) (1.92 kB)
config (42.05 kB)
Download all attachments

2022-10-11 10:45:47

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v2] interrupt: debug for discovering frequent interrupts

Hi Zhang,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on tip/irq/core]
[also build test WARNING on linus/master v6.0 next-20221011]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Zhang-Xincheng/interrupt-debug-for-discovering-frequent-interrupts/20221011-135800
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 36de4f94197318e45ba77badb5b07274f5bc72a9
config: m68k-allyesconfig
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/70d1ccdeb3cc035e44adfd98df109d31ed9af116
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Zhang-Xincheng/interrupt-debug-for-discovering-frequent-interrupts/20221011-135800
git checkout 70d1ccdeb3cc035e44adfd98df109d31ed9af116
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash kernel/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

kernel/irq/spurious.c:286:6: warning: no previous prototype for 'report_frequent_irq' [-Wmissing-prototypes]
286 | void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
| ^~~~~~~~~~~~~~~~~~~
>> kernel/irq/spurious.c:532:12: warning: no previous prototype for 'count_per_second_setup' [-Wmissing-prototypes]
532 | int __init count_per_second_setup(char *str)
| ^~~~~~~~~~~~~~~~~~~~~~
>> kernel/irq/spurious.c:549:12: warning: no previous prototype for 'duration_limit_setup' [-Wmissing-prototypes]
549 | int __init duration_limit_setup(char *str)
| ^~~~~~~~~~~~~~~~~~~~
>> kernel/irq/spurious.c:566:12: warning: no previous prototype for 'disable_frequent_irq_setup' [-Wmissing-prototypes]
566 | int __init disable_frequent_irq_setup(char *str)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~


vim +/count_per_second_setup +532 kernel/irq/spurious.c

530
531 #ifdef CONFIG_FREQUENT_IRQ_DEBUG
> 532 int __init count_per_second_setup(char *str)
533 {
534 int ret;
535
536 ret = kstrtouint(str, 10, &count_per_second);
537 if (ret)
538 return 0;
539
540 printk(KERN_INFO "Interrupt limit per second: %u\n", count_per_second);
541
542 return 1;
543 }
544
545 __setup("count_per_second=", count_per_second_setup);
546 module_param(count_per_second, uint, 0644);
547 MODULE_PARM_DESC(count_per_second, "Interrupt limit per second. (Max 0x65535)");
548
> 549 int __init duration_limit_setup(char *str)
550 {
551 int ret;
552
553 ret = kstrtouint(str, 10, &duration_limit);
554 if (ret)
555 return 0;
556
557 printk(KERN_INFO "Duration limit: %u\n", duration_limit);
558
559 return 1;
560 }
561
562 __setup("duration_limit=", duration_limit_setup);
563 module_param(duration_limit, uint, 0644);
564 MODULE_PARM_DESC(duration_limit, "The number of interruptions per second exceeds the duration limit of the limit. (Max 65535)");
565
> 566 int __init disable_frequent_irq_setup(char *str)
567 {
568 int ret;
569
570 ret = kstrtobool(str, &disable_frequent_irq);
571 if (ret)
572 return 0;
573
574 if (disable_frequent_irq)
575 printk(KERN_INFO "Disable frequent irq'\n");
576 else
577 printk(KERN_INFO "Don't disable frequent irq'\n");
578
579 return 1;
580 }
581

--
0-DAY CI Kernel Test Service
https://01.org/lkp


Attachments:
(No filename) (4.08 kB)
config (285.21 kB)
Download all attachments

2022-10-11 11:07:16

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v2] interrupt: debug for discovering frequent interrupts

Hi Zhang,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on tip/irq/core]
[also build test WARNING on linus/master v6.0 next-20221011]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Zhang-Xincheng/interrupt-debug-for-discovering-frequent-interrupts/20221011-135800
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 36de4f94197318e45ba77badb5b07274f5bc72a9
config: hexagon-randconfig-r045-20221010
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 791a7ae1ba3efd6bca96338e10ffde557ba83920)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/70d1ccdeb3cc035e44adfd98df109d31ed9af116
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Zhang-Xincheng/interrupt-debug-for-discovering-frequent-interrupts/20221011-135800
git checkout 70d1ccdeb3cc035e44adfd98df109d31ed9af116
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash kernel/irq/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

>> kernel/irq/spurious.c:315:6: warning: no previous prototype for function 'report_frequent_irq' [-Wmissing-prototypes]
void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
^
kernel/irq/spurious.c:315:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
^
static
1 warning generated.


vim +/report_frequent_irq +315 kernel/irq/spurious.c

313
314 #else
> 315 void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
316 {
317 }
318 #endif /* CONFIG_FREQUENT_IRQ_DEBUG */
319

--
0-DAY CI Kernel Test Service
https://01.org/lkp


Attachments:
(No filename) (2.40 kB)
config (120.37 kB)
Download all attachments