2022-12-20 06:01:24

by Baoquan He

[permalink] [raw]
Subject: [PATCH] Revert "x86/apic/x2apic: Implement IPI shorthands support"

This reverts commit 43931d350f30c6cd8c2f498d54ef7d65750abc92.

On a kvm guest with 4 cpus deployed, when 'nr_cpus=2' is added to the
normal kernel's cmdline and a crash is triggered to jump to the kdump
kernel, the kdump kernel hangs reliably. Reverting commit 43931d350f30
("x86/apic/x2apic: Implement IPI shorthands support") fixes it.

The problem disappears if 'nr_cpus=2' is removed from the normal kernel's
cmdline.

I also tried several bare metal systems, where this issue can't be
reproduced reliably. On several systems, the kdump kernel always succeeds.
On some systems, the kdump kernel randomly fails; I am not sure if it's the
same issue as the one on the kvm guest. On some systems, there's another
random hang with a tick_periodic() call trace, which I will report in
another mail.

Signed-off-by: Baoquan He <[email protected]>
---
This is reproduced stably on kvm guest with Fedora. Attached the kernel
config for reference just in case.

arch/x86/kernel/apic/local.h | 1 -
arch/x86/kernel/apic/x2apic_cluster.c | 4 ++--
arch/x86/kernel/apic/x2apic_phys.c | 13 ++-----------
3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index a997d849509a..59b91f7708d8 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -24,7 +24,6 @@ unsigned int x2apic_get_apic_id(unsigned long id);
u32 x2apic_set_apic_id(unsigned int id);
int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
void x2apic_send_IPI_self(int vector);
-void __x2apic_send_IPI_shorthand(int vector, u32 which);

/* IPI */

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e696e22d0531..d95b49fac01a 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -90,12 +90,12 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)

static void x2apic_send_IPI_allbutself(int vector)
{
- __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
}

static void x2apic_send_IPI_all(int vector)
{
- __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}

static u32 x2apic_calc_apicid(unsigned int cpu)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6bde05a86b4e..18c5201d1cb1 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -83,12 +83,12 @@ static void

static void x2apic_send_IPI_allbutself(int vector)
{
- __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
}

static void x2apic_send_IPI_all(int vector)
{
- __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}

static void init_x2apic_ldr(void)
@@ -123,15 +123,6 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
native_x2apic_icr_write(cfg, apicid);
}

-void __x2apic_send_IPI_shorthand(int vector, u32 which)
-{
- unsigned long cfg = __prepare_ICR(which, vector, 0);
-
- /* x2apic MSRs are special and need a special fence: */
- weak_wrmsr_fence();
- native_x2apic_icr_write(cfg, 0);
-}
-
unsigned int x2apic_get_apic_id(unsigned long id)
{
return id;
--
2.34.1


Attachments:
(No filename) (3.42 kB)
config-kdump-failure-kvm-nr_cpus (171.88 kB)
Download all attachments

2022-12-20 06:19:26

by Baoquan He

[permalink] [raw]
Subject: kdump kernel randomly hang with tick_periodic call trace on bare metal system

On one Intel bare metal system, I can randomly reproduce the kdump hang
shown below, with a tick_periodic call trace. The kernel config is attached
for reference.

==========Boot log of kdump kernel hang===================
+ kdumpctl restart
kdump: kexec: unloaded kdump kernel
kdump: Stopping kdump: [OK]
kdump: kexec: loaded kdump kerne[ 3167.436227] sysrq: Trigger a crash
l
kdump: Starti[ 3167.440786] Kernel panic - not syncing: sysrq triggered crash
[ 3167.447870] CPU: 0 PID: 2161 Comm: sysrq.sh Kdump: loaded Tainted: G I 6.0.11-300.fc37.x86_64 #1
[ 3167.458024] Hardware name: Dell Inc. PowerEdge R410/0N051F, BIOS 1.11.0 07/20/2012
[ 3167.465576] Call Trace:
[ 3167.468016] <TASK>
[ 3167.470109] dump_stack_lvl+0x44/0x5c
[ 3167.473768] panic+0x10a/0x2c0
[ 3167.476819] sysrq_handle_crash+0x16/0x20
[ 3167.480821] __handle_sysrq.cold+0x44/0x11c
[ 3167.484997] write_sysrq_trigger+0x24/0x40
[ 3167.489083] proc_reg_write+0x56/0xa0
[ 3167.492738] ? preempt_count_add+0x47/0xa0
[ 3167.496826] vfs_write+0xb9/0x3e0
[ 3167.500136] ksys_write+0x5b/0xd0
[ 3167.503442] do_syscall_64+0x5b/0x80
[ 3167.507009] ? do_user_addr_fault+0x1ef/0x690
[ 3167.511360] ? exc_page_fault+0x70/0x170
[ 3167.515276] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 3167.520318] RIP: 0033:0x7f4aa81840c4
[ 3167.523893] Code: 15 71 7d 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 80 3d 3d 05 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 48 83 ec 28 48 89 54 24 18 48
[ 3167.542626] RSP: 002b:00007fff15bbf448 EFLAGS: 00000202 ORIG_RAX: 0000000000000001
[ 3167.550180] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f4aa81840c4
[ 3167.557300] RDX: 0000000000000002 RSI: 000055c0145f6ca0 RDI: 0000000000000001
[ 3167.564417] RBP: 000055c0145f6ca0 R08: 0000000000000001 R09: 0000000000000073
[ 3167.571535] R10: 0000000000001000 R11: 0000000000000202 R12: 0000000000000002
[ 3167.578654] R13: 00007f4aa825d780 R14: 0000000000000002 R15: 00007f4aa8258a00
[ 3167.585777] </TASK>
[ 0.000000] Linux version 6.0.11-300.fc37.x86_64 ([email protected]) (gcc (GCC) 12.2.1 20221121 (Red Hat 12.2.1-4), GNU ld version 2.38-25.fc37) #1 SMP PREEMPT_DYNAMIC Fri Dec 2 20:47:45 UTC 2022
[ 0.000000] Command line: elfcorehdr=0xbf000000 BOOT_IMAGE=(hd0,gpt2)/vmlinuz-6.0.11-300.fc37.x86_64 ro console=ttyS1,115200n81 nr_cpus=2 irqpoll nr_cpus=1 reset_devices cgroup_disable=memory mce=off numa=off udev.children-max=2 panic=10 acpi_no_memhotplug transparent_hugepage=never nokaslr hest_disable novmcoredd cma=0 hugetlb_cma=0 disable_cpu_apicid=16 iTCO_wdt.pretimeout=0
[ 0.000000] x86/fpu: x87 FPU will use FXSAVE
[ 0.000000] signal: max sigframe size: 1440
[ 0.000000] BIOS-provided physical RAM map:
[ 0.000000] BIOS-e820: [mem 0x0000000000000000-0x0000000000000fff] reserved
[ 0.000000] BIOS-e820: [mem 0x0000000000001000-0x000000000009dfff] usable
[ 0.000000] BIOS-e820: [mem 0x00000000bf001000-0x00000000ceffffff] usable
[ 0.000000] BIOS-e820: [mem 0x00000000cf369000-0x00000000cf37efff] reserved
[ 0.000000] BIOS-e820: [mem 0x00000000cf37f000-0x00000000cf3bdfff] ACPI data
[ 0.000000] BIOS-e820: [mem 0x00000000cf3be000-0x00000000cfffffff] reserved
[ 0.000000] BIOS-e820: [mem 0x00000000e0000000-0x00000000efffffff] reserved
[ 0.000000] BIOS-e820: [mem 0x00000000fe000000-0x00000000ffffffff] reserved
[ 0.000000] random: crng init done
[ 0.000000] NX (Execute Disable) protection: active
[ 0.000000] extended physical RAM map:
[ 0.000000] reserve setup_data: [mem 0x0000000000000000-0x0000000000000fff] reserved
[ 0.000000] reserve setup_data: [mem 0x0000000000001000-0x000000000009dfff] usable
[ 0.000000] reserve setup_data: [mem 0x00000000bf001000-0x00000000ceff81df] usable
[ 0.000000] reserve setup_data: [mem 0x00000000ceff81e0-0x00000000ceff820f] usable
[ 0.000000] reserve setup_data: [mem 0x00000000ceff8210-0x00000000ceffffff] usable
[ 0.000000] reserve setup_data: [mem 0x00000000cf369000-0x00000000cf37efff] reserved
[ 0.000000] reserve setup_data: [mem 0x00000000cf37f000-0x00000000cf3bdfff] ACPI data
[ 0.000000] reserve setup_data: [mem 0x00000000cf3be000-0x00000000cfffffff] reserved
[ 0.000000] reserve setup_data: [mem 0x00000000e0000000-0x00000000efffffff] reserved
[ 0.000000] reserve setup_data: [mem 0x00000000fe000000-0x00000000ffffffff] reserved
[ 0.000000] SMBIOS 2.6 present.
[ 0.000000] DMI: Dell Inc. PowerEdge R410/0N051F, BIOS 1.11.0 07/20/2012
[ 0.000000] tsc: Fast TSC calibration using PIT
[ 0.000000] tsc: Detected 2792.956 MHz processor
[ 0.002401] last_pfn = 0xcf000 max_arch_pfn = 0x400000000
[ 0.003426] x86/PAT: Configuration [0-7]: WB WC UC- UC WB WP UC- WT
[ 0.014635] found SMP MP-table at [mem 0x000fe710-0x000fe71f]
[ 0.015076] RAMDISK: [mem 0xc9073000-0xcaffffff]
[ 0.015116] ACPI: Early table checksum verification disabled
[ 0.015121] ACPI: RSDP 0x00000000000F1070 000024 (v02 DELL )
[ 0.015126] ACPI: XSDT 0x00000000000F1174 00009C (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015133] ACPI: FACP 0x00000000CF3A3F9C 0000F4 (v03 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015140] ACPI: DSDT 0x00000000CF37F000 003BAF (v01 DELL PE_SC3 00000001 INTL 20050624)
[ 0.015145] ACPI: FACS 0x00000000CF3A6000 000040
[ 0.015149] ACPI: FACS 0x00000000CF3A6000 000040
[ 0.015153] ACPI: APIC 0x00000000CF3A3478 00015E (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015157] ACPI: SPCR 0x00000000CF3A35D8 000050 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015162] ACPI: HPET 0x00000000CF3A362C 000038 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015166] ACPI: DMAR 0x00000000CF3A3668 0001B0 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015171] ACPI: MCFG 0x00000000CF3A38C4 00003C (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015175] ACPI: WD__ 0x00000000CF3A3904 000134 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015179] ACPI: SLIC 0x00000000CF3A3A3C 000024 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015184] ACPI: ERST 0x00000000CF382D50 000270 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015188] ACPI: HEST 0x00000000CF382FC0 0003A8 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015192] ACPI: BERT 0x00000000CF382BB0 000030 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015197] ACPI: EINJ 0x00000000CF382BE0 000170 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015201] ACPI: SRAT 0x00000000CF3A3BC0 000370 (v01 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015205] ACPI: TCPA 0x00000000CF3A3F34 000064 (v02 DELL PE_SC3 00000001 DELL 00000001)
[ 0.015210] ACPI: SSDT 0x00000000CF3A7000 004A24 (v01 INTEL PPM RCM 80000001 INTL 20061109)
[ 0.015213] ACPI: Reserving FACP table memory at [mem 0xcf3a3f9c-0xcf3a408f]
[ 0.015215] ACPI: Reserving DSDT table memory at [mem 0xcf37f000-0xcf382bae]
[ 0.015216] ACPI: Reserving FACS table memory at [mem 0xcf3a6000-0xcf3a603f]
[ 0.015217] ACPI: Reserving FACS table memory at [mem 0xcf3a6000-0xcf3a603f]
[ 0.015219] ACPI: Reserving APIC table memory at [mem 0xcf3a3478-0xcf3a35d5]
[ 0.015220] ACPI: Reserving SPCR table memory at [mem 0xcf3a35d8-0xcf3a3627]
[ 0.015221] ACPI: Reserving HPET table memory at [mem 0xcf3a362c-0xcf3a3663]
[ 0.015222] ACPI: Reserving DMAR table memory at [mem 0xcf3a3668-0xcf3a3817]
[ 0.015223] ACPI: Reserving MCFG table memory at [mem 0xcf3a38c4-0xcf3a38ff]
[ 0.015225] ACPI: Reserving WD__ table memory at [mem 0xcf3a3904-0xcf3a3a37]
[ 0.015226] ACPI: Reserving SLIC table memory at [mem 0xcf3a3a3c-0xcf3a3a5f]
[ 0.015227] ACPI: Reserving ERST table memory at [mem 0xcf382d50-0xcf382fbf]
[ 0.015228] ACPI: Reserving HEST table memory at [mem 0xcf382fc0-0xcf383367]
[ 0.015230] ACPI: Reserving BERT table memory at [mem 0xcf382bb0-0xcf382bdf]
[ 0.015231] ACPI: Reserving EINJ table memory at [mem 0xcf382be0-0xcf382d4f]
[ 0.015232] ACPI: Reserving SRAT table memory at [mem 0xcf3a3bc0-0xcf3a3f2f]
[ 0.015233] ACPI: Reserving TCPA table memory at [mem 0xcf3a3f34-0xcf3a3f97]
[ 0.015235] ACPI: Reserving SSDT table memory at [mem 0xcf3a7000-0xcf3aba23]
[ 0.015266] NUMA turned off
[ 0.015267] Faking a node at [mem 0x0000000000000000-0x00000000ceffffff]
[ 0.015278] NODE_DATA(0) allocated [mem 0xcefcd1c0-0xceff81bf]
[ 0.016560] Zone ranges:
[ 0.016565] DMA [mem 0x0000000000001000-0x0000000000ffffff]
[ 0.016569] DMA32 [mem 0x0000000001000000-0x00000000ceffffff]
[ 0.016572] Normal empty
[ 0.016573] Device empty
[ 0.016575] Movable zone start for each node
[ 0.016578] Early memory node ranges
[ 0.016578] node 0: [mem 0x0000000000001000-0x000000000009dfff]
[ 0.016580] node 0: [mem 0x00000000bf001000-0x00000000ceffffff]
[ 0.016583] Initmem setup node 0 [mem 0x0000000000001000-0x00000000ceffffff]
[ 0.016591] On node 0, zone DMA: 1 pages in unavailable ranges
[ 0.018109] On node 0, zone DMA32: 61283 pages in unavailable ranges
[ 0.018187] On node 0, zone DMA32: 4096 pages in unavailable ranges
[ 0.018319] ACPI: PM-Timer IO Port: 0x808
[ 0.018329] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 1/0x0 ignored.
[ 0.018331] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 2/0x12 ignored.
[ 0.018332] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 3/0x2 ignored.
[ 0.018334] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 4/0x14 ignored.
[ 0.018335] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 5/0x4 ignored.
[ 0.018336] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 6/0x16 ignored.
[ 0.018338] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 7/0x6 ignored.
[ 0.018339] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 8/0x11 ignored.
[ 0.018341] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 9/0x1 ignored.
[ 0.018342] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 10/0x13 ignored.
[ 0.018343] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 11/0x3 ignored.
[ 0.018345] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 12/0x15 ignored.
[ 0.018346] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 13/0x5 ignored.
[ 0.018347] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 14/0x17 ignored.
[ 0.018348] APIC: NR_CPUS/possible_cpus limit of 1 reached. Processor 15/0x7 ignored.
[ 0.018352] ACPI: LAPIC_NMI (acpi_id[0xff] high edge lint[0x1])
[ 0.018364] IOAPIC[0]: apic_id 0, version 32, address 0xfec00000, GSI 0-23
[ 0.018371] IOAPIC[1]: apic_id 1, version 32, address 0xfec80000, GSI 32-55
[ 0.018374] ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
[ 0.018376] ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
[ 0.018382] ACPI: Using ACPI (MADT) for SMP configuration information
[ 0.018383] ACPI: HPET id: 0x8086a301 base: 0xfed00000
[ 0.018387] ACPI: SPCR: SPCR table version 1
[ 0.018389] ACPI: SPCR: console: uart,io,0x3f8,115200
[ 0.018392] smpboot: 16 Processors exceeds NR_CPUS limit of 1
[ 0.018393] smpboot: Allowing 1 CPUs, 0 hotplug CPUs
[ 0.018411] PM: hibernation: Registered nosave memory: [mem 0x00000000-0x00000fff]
[ 0.018413] PM: hibernation: Registered nosave memory: [mem 0x0009e000-0xbf000fff]
[ 0.018415] PM: hibernation: Registered nosave memory: [mem 0xceff8000-0xceff8fff]
[ 0.018417] PM: hibernation: Registered nosave memory: [mem 0xceff8000-0xceff8fff]
[ 0.018420] [mem 0x0009e000-0xbf000fff] available for PCI devices
[ 0.018422] Booting paravirtualized kernel on bare hardware
[ 0.018425] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1910969940391419 ns
[ 0.024691] setup_percpu: NR_CPUS:8192 nr_cpumask_bits:1 nr_cpu_ids:1 nr_node_ids:1
[ 0.024997] percpu: Embedded 61 pages/cpu s212992 r8192 d28672 u2097152
[ 0.025044] Fallback order for Node 0: 0
[ 0.025048] Built 1 zonelists, mobility grouping on. Total pages: 64665
[ 0.025050] Policy zone: DMA32
[ 0.025052] Kernel command line: elfcorehdr=0xbf000000 BOOT_IMAGE=(hd0,gpt2)/vmlinuz-6.0.11-300.fc37.x86_64 ro console=ttyS1,115200n81 nr_cpus=2 irqpoll nr_cpus=1 reset_devices cgroup_disable=memory mce=off numa=off udev.children-max=2 panic=10 acpi_no_memhotplug transparent_hugepage=never nokaslr hest_disable novmcoredd cma=0 hugetlb_cma=0 disable_cpu_apicid=16 iTCO_wdt.pretimeout=0
[ 0.025183] Misrouted IRQ fixup and polling support enabled
[ 0.025184] This may significantly impact system performance
[ 0.025316] cgroup: Disabling memory control group subsystem
[ 0.025583] Unknown kernel command line parameters "nokaslr BOOT_IMAGE=(hd0,gpt2)/vmlinuz-6.0.11-300.fc37.x86_64", will be passed to user space.
[ 0.025618] Dentry cache hash table entries: 32768 (order: 6, 262144 bytes, linear)
[ 0.025638] Inode-cache hash table entries: 16384 (order: 5, 131072 bytes, linear)
[ 0.025916] mem auto-init: stack:all(zero), heap alloc:off, heap free:off
[ 0.027119] Memory: 169244K/262768K available (16393K kernel code, 3227K rwdata, 12824K rodata, 3024K init, 4680K bss, 93268K reserved, 0K cma-reserved)
[ 0.027289] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
[ 0.027310] Kernel/User page tables isolation: enabled
[ 0.027339] ftrace: allocating 50892 entries in 199 pages
[ 0.037814] ftrace: allocated 199 pages with 5 groups
[ 0.038704] Dynamic Preempt: voluntary
[ 0.038737] rcu: Preemptible hierarchical RCU implementation.
[ 0.038739] rcu: RCU restricting CPUs from NR_CPUS=8192 to nr_cpu_ids=1.
[ 0.038741] Trampoline variant of Tasks RCU enabled.
[ 0.038741] Rude variant of Tasks RCU enabled.
[ 0.038742] Tracing variant of Tasks RCU enabled.
[ 0.038743] rcu: RCU calculated value of scheduler-enlistment delay is 100 jiffies.
[ 0.038744] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=1
[ 0.045620] NR_IRQS: 524544, nr_irqs: 256, preallocated irqs: 16
[ 0.045817] rcu: srcu_init: Setting srcu_struct sizes based on contention.
[ 0.045951] kfence: initialized - using 2097152 bytes for 255 objects at 0x0000000036880424-0x000000000bf5c52b
[ 0.045980] Spurious LAPIC timer interrupt on cpu 0
[ 0.048416] Console: colour VGA+ 80x25
[ 1.135333] printk: console [ttyS1] enabled
[ 1.139520] ACPI: Core revision 20220331
[ 1.143537] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
[ 1.152690] BUG: kernel NULL pointer dereference, address: 0000000000000088
[ 1.159634] #PF: supervisor read access in kernel mode
[ 1.164757] #PF: error_code(0x0000) - not-present page
[ 1.169882] PGD 0 P4D 0
[ 1.172407] Oops: 0000 [#1] PREEMPT SMP PTI
[ 1.176578] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.0.11-300.fc37.x86_64 #1
[ 1.183870] Hardware name: Dell Inc. PowerEdge R410/0N051F, BIOS 1.11.0 07/20/2012
[ 1.191420] RIP: 0010:tick_periodic+0x23/0x80
[ 1.195765] Code: 5b e9 71 17 e5 00 90 0f 1f 44 00 00 39 3d 59 94 16 02 74 30 48 c7 c7 19 34 85 82 e8 07 c8 c0 00 31 ff 65 48 8b 05 b5 de e7 7e <f6> 80 88 00 00 00 03 40 0f 95 c7 e8 ed f3 fe ff bf 01 00 00 00 e9
[ 1.214495] RSP: 0000:ffffc90000003eb8 EFLAGS: 00010046
[ 1.219706] RAX: 0000000000000000 RBX: ffff8880bf052800 RCX: 0000000000000000
[ 1.226822] RDX: 0000000000000103 RSI: ffffffff82853419 RDI: 0000000000000000
[ 1.233939] RBP: 7fffffffffffffff R08: ffff8880bf400dd8 R09: ffffffff83146740
[ 1.241056] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[ 1.248172] R13: 0000000000000000 R14: ffff8880bf187e00 R15: ffffffff830060f0
[ 1.255290] FS: 0000000000000000(0000) GS:ffff8880ce400000(0000) knlGS:0000000000000000
[ 1.263360] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1.269090] CR2: 0000000000000088 CR3: 00000000cd010000 CR4: 00000000000006b0
[ 1.276207] Call Trace:
[ 1.278645] <IRQ>
[ 1.280648] tick_handle_periodic+0x1f/0x70
[ 1.284821] timer_interrupt+0x14/0x20
[ 1.288561] __handle_irq_event_percpu+0x46/0x190
[ 1.293253] handle_irq_event+0x34/0x70
[ 1.297080] handle_level_irq+0xa8/0x180
[ 1.300993] resend_irqs+0x5d/0x70
[ 1.304386] tasklet_action_common.constprop.0+0xab/0xe0
[ 1.309686] __do_softirq+0xfb/0x319
[ 1.313254] __irq_exit_rcu+0xd7/0x140
[ 1.316993] common_interrupt+0xb9/0xd0
[ 1.320820] </IRQ>
[ 1.322910] <TASK>
[ 1.325000] asm_common_interrupt+0x22/0x40
[ 1.329175] RIP: 0010:__x86_return_thunk+0x0/0x8
[ 1.333783] Code: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc f6 <c3> cc 0f ae e8 eb f9 cc 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 1.352513] RSP: 0000:ffffffff83003e68 EFLAGS: 00000206
[ 1.357724] RAX: 0000000000000001 RBX: ffff8880bf187e00 RCX: 0000000000015a00
[ 1.364842] RDX: 0000000000000001 RSI: 0000000000000246 RDI: ffff8880bf187ea4
[ 1.371960] RBP: ffff8880bf180680 R08: ffffffff83003e78 R09: 0000000000000080
[ 1.379076] R10: 0000000000000246 R11: 0000000000000000 R12: 0000000000000000
[ 1.386194] R13: ffff8880bf187f60 R14: ffff8880bf187ea4 R15: 0000000000000000
[ 1.393315] _raw_spin_unlock_irqrestore+0x19/0x40
[ 1.398093] __setup_irq+0x443/0x6d0
[ 1.401659] request_threaded_irq+0x109/0x170
[ 1.406005] hpet_time_init+0x2d/0x4b
[ 1.409661] x86_late_time_init+0x17/0x34
[ 1.413658] start_kernel+0x8cf/0x97f
[ 1.417312] secondary_startup_64_no_verify+0xe5/0xeb
[ 1.422354] </TASK>
[ 1.424530] Modules linked in:
[ 1.427574] CR2: 0000000000000088
[ 1.430878] ---[ end trace 0000000000000000 ]---
[ 1.435482] RIP: 0010:tick_periodic+0x23/0x80
[ 1.439826] Code: 5b e9 71 17 e5 00 90 0f 1f 44 00 00 39 3d 59 94 16 02 74 30 48 c7 c7 19 34 85 82 e8 07 c8 c0 00 31 ff 65 48 8b 05 b5 de e7 7e <f6> 80 88 00 00 00 03 40 0f 95 c7 e8 ed f3 fe ff bf 01 00 00 00 e9
[ 1.458556] RSP: 0000:ffffc90000003eb8 EFLAGS: 00010046
[ 1.463767] RAX: 0000000000000000 RBX: ffff8880bf052800 RCX: 0000000000000000
[ 1.470884] RDX: 0000000000000103 RSI: ffffffff82853419 RDI: 0000000000000000
[ 1.478001] RBP: 7fffffffffffffff R08: ffff8880bf400dd8 R09: ffffffff83146740
[ 1.485120] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[ 1.492237] R13: 0000000000000000 R14: ffff8880bf187e00 R15: ffffffff830060f0
[ 1.499354] FS: 0000000000000000(0000) GS:ffff8880ce400000(0000) knlGS:0000000000000000
[ 1.507425] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1.513156] CR2: 0000000000000088 CR3: 00000000cd010000 CR4: 00000000000006b0
[ 1.520274] Kernel panic - not syncing: Fatal exception in interrupt
[ 1.526614] Rebooting in 10 seconds..


Attachments:
(No filename) (18.69 kB)
config-6.0.11-300.fc37.x86_64 (255.97 kB)
Download all attachments

2022-12-20 12:09:26

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] Revert "x86/apic/x2apic: Implement IPI shorthands support"

On Tue, Dec 20, 2022 at 01:34:58PM +0800, Baoquan He wrote:
> This reverts commit 43931d350f30c6cd8c2f498d54ef7d65750abc92.
>
> On kvm guest with 4 cpus deployed, when adding 'nr_cpus=2' to normal
> kernel's cmdline, and triggering crash to jump to kdump kernel, kdump
> kernel will stably hang. Reverting commit 43931d350f30 ("x86/apic/x2apic:
> Implement IPI shorthands support") can fix it.
>
> The problem will disappear if removing 'nr_cpus=2' from normal kerne's
> cmdline.

And the root cause for this is... ? Does the kvm x2apic emulation
somehow get upset when we shorthand CPUs that haven't been initialized?

2022-12-20 13:14:10

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH] Revert "x86/apic/x2apic: Implement IPI shorthands support"

On 12/20/22 at 12:38pm, Peter Zijlstra wrote:
> On Tue, Dec 20, 2022 at 01:34:58PM +0800, Baoquan He wrote:
> > This reverts commit 43931d350f30c6cd8c2f498d54ef7d65750abc92.
> >
> > On kvm guest with 4 cpus deployed, when adding 'nr_cpus=2' to normal
> > kernel's cmdline, and triggering crash to jump to kdump kernel, kdump
> > kernel will stably hang. Reverting commit 43931d350f30 ("x86/apic/x2apic:
> > Implement IPI shorthands support") can fix it.
> >
> > The problem will disappear if removing 'nr_cpus=2' from normal kerne's
> > cmdline.
>
> And the root cause for this is... ? Does the kvm x2apic emulation
> somehow get upset when we shorthand CPUs that haven't been initialized?

Thanks for checking.

I haven't figured out the root cause. I haven't read the apic code for a
long time, and I am not familiar with the kvm code, so I am raising the
issue upstream.

I can do testing if there are any suggestions.

Adding our virt dev Dr. David Alan Gilbert to CC.

2023-01-04 15:48:08

by Dr. David Alan Gilbert

[permalink] [raw]
Subject: Re: [PATCH] Revert "x86/apic/x2apic: Implement IPI shorthands support"

* Baoquan He ([email protected]) wrote:
> On 12/20/22 at 12:38pm, Peter Zijlstra wrote:
> > On Tue, Dec 20, 2022 at 01:34:58PM +0800, Baoquan He wrote:
> > > This reverts commit 43931d350f30c6cd8c2f498d54ef7d65750abc92.
> > >
> > > On kvm guest with 4 cpus deployed, when adding 'nr_cpus=2' to normal
> > > kernel's cmdline, and triggering crash to jump to kdump kernel, kdump
> > > kernel will stably hang. Reverting commit 43931d350f30 ("x86/apic/x2apic:
> > > Implement IPI shorthands support") can fix it.
> > >
> > > The problem will disappear if removing 'nr_cpus=2' from normal kerne's
> > > cmdline.
> >
> > And the root cause for this is... ? Does the kvm x2apic emulation
> > somehow get upset when we shorthand CPUs that haven't been initialized?
>
> Thanks for checking.
>
> I haven't figure out the root cause. I haven't read the apic code for
> long time, and not familiar with the kvm code. So raise the issue to
> upstream.
>
> I can do testing if any suggestion.
>
> Add our virt dev Dr. David Alan Gilbert to CC.

Hmm I don't know that code well enough; cc'ing Paolo and Maxim.

Dave
--
Dr. David Alan Gilbert / [email protected] / Manchester, UK

2023-01-09 22:22:38

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH] Revert "x86/apic/x2apic: Implement IPI shorthands support"

On Tue, Dec 20 2022 at 13:34, Baoquan He wrote:
> This reverts commit 43931d350f30c6cd8c2f498d54ef7d65750abc92.
>
> On kvm guest with 4 cpus deployed, when adding 'nr_cpus=2' to normal
> kernel's cmdline, and triggering crash to jump to kdump kernel, kdump
> kernel will stably hang. Reverting commit 43931d350f30 ("x86/apic/x2apic:
> Implement IPI shorthands support") can fix it.

Is there any output on the early console, or does it hang silently?

If the latter, can you attach GDB to the guest and figure out where it
is stuck?

Thanks,

tglx

2023-01-09 22:35:53

by Thomas Gleixner

[permalink] [raw]
Subject: Re: kdump kernel randomly hang with tick_periodic call trace on bare metal system

On Tue, Dec 20 2022 at 13:41, Baoquan He wrote:
> On one intel bare metal system, I can randomly reproduce the kdump hang
> as below with tick_periodic call trace. Attach the kernel config for
> reference.

This has absolutely nothing to do with x2apic IPI shorthands.

> [ 0.045980] Spurious LAPIC timer interrupt on cpu 0

So here the CPU receives a spurious Local APIC timer interrupt, but
that's a red herring.

> [ 1.152690] BUG: kernel NULL pointer dereference, address: 0000000000000088
> [ 1.159634] #PF: supervisor read access in kernel mode
> [ 1.164757] #PF: error_code(0x0000) - not-present page
> [ 1.169882] PGD 0 P4D 0
> [ 1.172407] Oops: 0000 [#1] PREEMPT SMP PTI
> [ 1.176578] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.0.11-300.fc37.x86_64 #1
> [ 1.183870] Hardware name: Dell Inc. PowerEdge R410/0N051F, BIOS 1.11.0 07/20/2012
> [ 1.191420] RIP: 0010:tick_periodic+0x23/0x80

I'm willing to bet that this is caused by the following line in tick_periodic():

update_process_times(user_mode(get_irq_regs()));

user_mode() is invoked with a NULL pointer. user_mode() accesses
regs->cs. CS is at offset 0x88....

The reason for this is here:

> [ 1.280648] tick_handle_periodic+0x1f/0x70
> [ 1.284821] timer_interrupt+0x14/0x20
> [ 1.288561] __handle_irq_event_percpu+0x46/0x190
> [ 1.293253] handle_irq_event+0x34/0x70
> [ 1.297080] handle_level_irq+0xa8/0x180
> [ 1.300993] resend_irqs+0x5d/0x70
> [ 1.304386] tasklet_action_common.constprop.0+0xab/0xe0
> [ 1.309686] __do_softirq+0xfb/0x319
> [ 1.313254] __irq_exit_rcu+0xd7/0x140
> [ 1.316993] common_interrupt+0xb9/0xd0

For some reason the timer interrupt is resent in software. I assume it is
the HPET interrupt because that's what just got initialized.

> [ 1.143537] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns

and the callchain below just confirms that:

> [ 1.393315] _raw_spin_unlock_irqrestore+0x19/0x40
> [ 1.398093] __setup_irq+0x443/0x6d0
> [ 1.401659] request_threaded_irq+0x109/0x170
> [ 1.406005] hpet_time_init+0x2d/0x4b
> [ 1.409661] x86_late_time_init+0x17/0x34
> [ 1.413658] start_kernel+0x8cf/0x97f

The software resend code does not go through the regular interrupt entry
path which explains why get_irq_regs() returns NULL.

That software resend is bogus especially since the timer interrupt is
a level interrupt. As dmesg does not say anything about the APIC
delivery mode I assume this goes through i8259, which fails to set the
IRQ_LEVEL flag on all interrupt lines.

The below should fix this.

Thanks,

tglx
---
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ irq_set_status_flags(irq, IRQ_LEVEL);
enable_irq(irq);
lapic_assign_legacy_vector(irq, true);
}
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -65,8 +65,10 @@ void __init init_ISA_irqs(void)

legacy_pic->init(0);

- for (i = 0; i < nr_legacy_irqs(); i++)
+ for (i = 0; i < nr_legacy_irqs(); i++) {
irq_set_chip_and_handler(i, chip, handle_level_irq);
+ irq_set_status_flags(i, IRQ_LEVEL);
+ }
}

void __init init_IRQ(void)

2023-01-10 02:34:56

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH] Revert "x86/apic/x2apic: Implement IPI shorthands support"

On 01/09/23 at 10:59pm, Thomas Gleixner wrote:
> On Tue, Dec 20 2022 at 13:34, Baoquan He wrote:
> > This reverts commit 43931d350f30c6cd8c2f498d54ef7d65750abc92.
> >
> > On kvm guest with 4 cpus deployed, when adding 'nr_cpus=2' to normal
> > kernel's cmdline, and triggering crash to jump to kdump kernel, kdump
> > kernel will stably hang. Reverting commit 43931d350f30 ("x86/apic/x2apic:
> > Implement IPI shorthands support") can fix it.
>
> Is there any output on the early console or does it hang silently?
>
> If the latter, can you attach GDB to the guest and figure out where it
> is stuck?

There is no output at all on the early console. I will try gdb debugging. Thanks.

2023-01-14 03:09:48

by Baoquan He

[permalink] [raw]
Subject: Re: kdump kernel randomly hang with tick_periodic call trace on bare metal system

On 01/09/23 at 10:57pm, Thomas Gleixner wrote:
> On Tue, Dec 20 2022 at 13:41, Baoquan He wrote:
> > On one intel bare metal system, I can randomly reproduce the kdump hang
> > as below with tick_periodic call trace. Attach the kernel config for
> > reference.
>
> This has absolutely nothing to do with x2apic IPI shorthands
>
> > [ 0.045980] Spurious LAPIC timer interrupt on cpu 0
>
> So here the CPU receives a spurious Local APIC timer interrupt, but
> that's a red herring.
>
> > [ 1.152690] BUG: kernel NULL pointer dereference, address: 0000000000000088
> > [ 1.159634] #PF: supervisor read access in kernel mode
> > [ 1.164757] #PF: error_code(0x0000) - not-present page
> > [ 1.169882] PGD 0 P4D 0
> > [ 1.172407] Oops: 0000 [#1] PREEMPT SMP PTI
> > [ 1.176578] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.0.11-300.fc37.x86_64 #1
> > [ 1.183870] Hardware name: Dell Inc. PowerEdge R410/0N051F, BIOS 1.11.0 07/20/2012
> > [ 1.191420] RIP: 0010:tick_periodic+0x23/0x80
>
> I'm willing to bet that this is caused by the following line in tick_periodic():
>
> update_process_times(user_mode(get_irq_regs()));
>
> user_mode() is invoked with a NULL pointer. user_mode() accesses
> regs->cs. CS is at offset 0x88....
>
> The reason for this is here:
>
> > [ 1.280648] tick_handle_periodic+0x1f/0x70
> > [ 1.284821] timer_interrupt+0x14/0x20
> > [ 1.288561] __handle_irq_event_percpu+0x46/0x190
> > [ 1.293253] handle_irq_event+0x34/0x70
> > [ 1.297080] handle_level_irq+0xa8/0x180
> > [ 1.300993] resend_irqs+0x5d/0x70
> > [ 1.304386] tasklet_action_common.constprop.0+0xab/0xe0
> > [ 1.309686] __do_softirq+0xfb/0x319
> > [ 1.313254] __irq_exit_rcu+0xd7/0x140
> > [ 1.316993] common_interrupt+0xb9/0xd0
>
> For some reason the timer interrupt is resent in software. I assume it is
> the HPET interrupt because that's what just got initialized.
>
> > [ 1.143537] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
>
> and the callchain below just confirms that:
>
> > [ 1.393315] _raw_spin_unlock_irqrestore+0x19/0x40
> > [ 1.398093] __setup_irq+0x443/0x6d0
> > [ 1.401659] request_threaded_irq+0x109/0x170
> > [ 1.406005] hpet_time_init+0x2d/0x4b
> > [ 1.409661] x86_late_time_init+0x17/0x34
> > [ 1.413658] start_kernel+0x8cf/0x97f
>
> The software resend code does not go through the regular interrupt entry
> path which explains why get_irq_regs() returns NULL.
>
> That software resend is bogus especially since the timer interrupt is
> a level interrupt. As dmesg does not say anything about the APIC
> delivery mode I assume this goes through i8259, which fails to set the
> IRQ_LEVEL flag on all interrupt lines.
>
> The below should fix this.

Sorry for the late reply, I just noticed this mail.

Thanks a lot for checking this and providing a fix. I will reserve the
lab machine and give it a shot, and will report back once finished.

> ---
> --- a/arch/x86/kernel/i8259.c
> +++ b/arch/x86/kernel/i8259.c
> @@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int
> disable_irq_nosync(irq);
> io_apic_irqs &= ~(1<<irq);
> irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
> + irq_set_status_flags(irq, IRQ_LEVEL);
> enable_irq(irq);
> lapic_assign_legacy_vector(irq, true);
> }
> --- a/arch/x86/kernel/irqinit.c
> +++ b/arch/x86/kernel/irqinit.c
> @@ -65,8 +65,10 @@ void __init init_ISA_irqs(void)
>
> legacy_pic->init(0);
>
> - for (i = 0; i < nr_legacy_irqs(); i++)
> + for (i = 0; i < nr_legacy_irqs(); i++) {
> irq_set_chip_and_handler(i, chip, handle_level_irq);
> + irq_set_status_flags(i, IRQ_LEVEL);
> + }
> }
>
> void __init init_IRQ(void)
>

2023-01-16 09:29:58

by Baoquan He

[permalink] [raw]
Subject: Re: kdump kernel randomly hang with tick_periodic call trace on bare metal system

Hi Thomas,

On 01/09/23 at 10:57pm, Thomas Gleixner wrote:
> On Tue, Dec 20 2022 at 13:41, Baoquan He wrote:
> > On one intel bare metal system, I can randomly reproduce the kdump hang
> > as below with tick_periodic call trace. Attach the kernel config for
> > reference.
>
> This has absolutely nothing to do with x2apic IPI shorthands
>
> > [ 0.045980] Spurious LAPIC timer interrupt on cpu 0
>
> So here the CPU receives a spurious Local APIC timer interrupt, but
> that's a red herring.
>
> > [ 1.152690] BUG: kernel NULL pointer dereference, address: 0000000000000088
> > [ 1.159634] #PF: supervisor read access in kernel mode
> > [ 1.164757] #PF: error_code(0x0000) - not-present page
> > [ 1.169882] PGD 0 P4D 0
> > [ 1.172407] Oops: 0000 [#1] PREEMPT SMP PTI
> > [ 1.176578] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.0.11-300.fc37.x86_64 #1
> > [ 1.183870] Hardware name: Dell Inc. PowerEdge R410/0N051F, BIOS 1.11.0 07/20/2012
> > [ 1.191420] RIP: 0010:tick_periodic+0x23/0x80
>
> I'm willing to bet that this is caused by the following line in tick_periodic():
>
> update_process_times(user_mode(get_irq_regs()));
>
> user_mode() is invoked with a NULL pointer. user_mode() accesses
> regs->cs. CS is at offset 0x88....
>
> The reason for this is here:
>
> > [ 1.280648] tick_handle_periodic+0x1f/0x70
> > [ 1.284821] timer_interrupt+0x14/0x20
> > [ 1.288561] __handle_irq_event_percpu+0x46/0x190
> > [ 1.293253] handle_irq_event+0x34/0x70
> > [ 1.297080] handle_level_irq+0xa8/0x180
> > [ 1.300993] resend_irqs+0x5d/0x70
> > [ 1.304386] tasklet_action_common.constprop.0+0xab/0xe0
> > [ 1.309686] __do_softirq+0xfb/0x319
> > [ 1.313254] __irq_exit_rcu+0xd7/0x140
> > [ 1.316993] common_interrupt+0xb9/0xd0
>
> For some reason the timer interrupt is resent in software. I assume it is
> the HPET interrupt because that's what just got initialized.
>
> > [ 1.143537] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
>
> and the callchain below just confirms that:
>
> > [ 1.393315] _raw_spin_unlock_irqrestore+0x19/0x40
> > [ 1.398093] __setup_irq+0x443/0x6d0
> > [ 1.401659] request_threaded_irq+0x109/0x170
> > [ 1.406005] hpet_time_init+0x2d/0x4b
> > [ 1.409661] x86_late_time_init+0x17/0x34
> > [ 1.413658] start_kernel+0x8cf/0x97f
>
> The software resend code does not go through the regular interrupt entry
> path which explains why get_irq_regs() returns NULL.
>
> That software resend is bogus especially since the timer interrupt is
> a level interrupt. As dmesg does not say anything about the APIC
> delivery mode I assume this goes through i8259, which fails to set the
> IRQ_LEVEL flag on all interrupt lines.
>
> The below should fix this.

I got the machine where this random hang often happens, and built the
latest upstream kernel to test. Without this fix, the kdump kernel hangs
in tick_periodic() in 6 out of 10 attempts. With this patch applied, I
tried about 30 times; the kdump kernel booted up every time and vmcore
dumping succeeded.

So this patch fixes the issue I hit, thanks a lot. And please feel free to
add:

Tested-by: Baoquan He <[email protected]>


>
> ---
> --- a/arch/x86/kernel/i8259.c
> +++ b/arch/x86/kernel/i8259.c
> @@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int
> disable_irq_nosync(irq);
> io_apic_irqs &= ~(1<<irq);
> irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
> + irq_set_status_flags(irq, IRQ_LEVEL);
> enable_irq(irq);
> lapic_assign_legacy_vector(irq, true);
> }
> --- a/arch/x86/kernel/irqinit.c
> +++ b/arch/x86/kernel/irqinit.c
> @@ -65,8 +65,10 @@ void __init init_ISA_irqs(void)
>
> legacy_pic->init(0);
>
> - for (i = 0; i < nr_legacy_irqs(); i++)
> + for (i = 0; i < nr_legacy_irqs(); i++) {
> irq_set_chip_and_handler(i, chip, handle_level_irq);
> + irq_set_status_flags(i, IRQ_LEVEL);
> + }
> }
>
> void __init init_IRQ(void)
>

Subject: [tip: x86/urgent] x86/i8259: Mark legacy PIC interrupts with IRQ_LEVEL

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 5fa55950729d0762a787451dc52862c3f850f859
Gitweb: https://git.kernel.org/tip/5fa55950729d0762a787451dc52862c3f850f859
Author: Thomas Gleixner <[email protected]>
AuthorDate: Mon, 09 Jan 2023 22:57:13 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Mon, 16 Jan 2023 17:24:56 +01:00

x86/i8259: Mark legacy PIC interrupts with IRQ_LEVEL

Baoquan reported that after triggering a crash the subsequent crash-kernel
fails to boot about half of the time. It triggers a NULL pointer
dereference in the periodic tick code.

This happens because the legacy timer interrupt (IRQ0) is resent in
software which happens in soft interrupt (tasklet) context. In this context
get_irq_regs() returns NULL which leads to the NULL pointer dereference.

The reason for the resend is a spurious APIC interrupt on the IRQ0 vector
which is captured and leads to a resend when the legacy timer interrupt is
enabled. This is wrong because the legacy PIC interrupts are level
triggered and therefore should never be resent in software, but nothing
ever sets the IRQ_LEVEL flag on those interrupts, so the core code does not
know about their trigger type.

Ensure that IRQ_LEVEL is set when the legacy PIC interrupts are set up.

Fixes: a4633adcdbc1 ("[PATCH] genirq: add genirq sw IRQ-retrigger")
Reported-by: Baoquan He <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Baoquan He <[email protected]>
Link: https://lore.kernel.org/r/87mt6rjrra.ffs@tglx
---
arch/x86/kernel/i8259.c | 1 +
arch/x86/kernel/irqinit.c | 4 +++-
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 3aa5304..4d8aff0 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int irq)
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ irq_set_status_flags(irq, IRQ_LEVEL);
enable_irq(irq);
lapic_assign_legacy_vector(irq, true);
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index beb1bad..c683666 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -65,8 +65,10 @@ void __init init_ISA_irqs(void)

legacy_pic->init(0);

- for (i = 0; i < nr_legacy_irqs(); i++)
+ for (i = 0; i < nr_legacy_irqs(); i++) {
irq_set_chip_and_handler(i, chip, handle_level_irq);
+ irq_set_status_flags(i, IRQ_LEVEL);
+ }
}

void __init init_IRQ(void)

2023-01-17 08:51:04

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH] Revert "x86/apic/x2apic: Implement IPI shorthands support"

On 01/10/23 at 10:24am, Baoquan He wrote:
> On 01/09/23 at 10:59pm, Thomas Gleixner wrote:
> > On Tue, Dec 20 2022 at 13:34, Baoquan He wrote:
> > > This reverts commit 43931d350f30c6cd8c2f498d54ef7d65750abc92.
> > >
> > > On kvm guest with 4 cpus deployed, when adding 'nr_cpus=2' to normal
> > > kernel's cmdline, and triggering crash to jump to kdump kernel, kdump
> > > kernel will stably hang. Reverting commit 43931d350f30 ("x86/apic/x2apic:
> > > Implement IPI shorthands support") can fix it.
> >
> > Is there any output on the early console or does it hang silently?
> >
> > If the latter, can you attach GDB to the guest and figure out where it
> > is stuck?
>
> There is no output at all on the early console. I will try gdb debugging. Thanks.

I rebased to the latest upstream kernel and tested; the reproducible kdump
kernel hang has disappeared. It seems I need to do another round of bisecting
to see which commit fixed it. I will update if there is any progress.