Tested on the following branch:
https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git branch dev, arch X86_64
runqemu kvm slirp nographic qemuparams="-m 1024 -smp 4" bootparams="nokaslr console=ttyS0 rcutorture.torture_type=rcu rcutorture.shuffle_interval=0 isolcpus=0,1 rcu_nocbs=0,1 nohz_full=0,1 rcutree.dump_tree=1 rcutorture.onoff_holdoff=30 rcutorture.onoff_interval=10" -d
[ 39.392925] BUG: unable to handle page fault for address: ffffffff84d05000
[ 39.393244] #PF: supervisor read access in kernel mode
[ 39.393480] #PF: error_code(0x0000) - not-present page
[ 39.393715] PGD 3e19067 P4D 3e19067 PUD 3e1a063 PMD 800ffffffb3ff062
[ 39.394014] Oops: 0000 [#1] PREEMPT SMP KASAN PTI
[ 39.394231] CPU: 0 PID: 16 Comm: rcu_preempt Not tainted 6.2.0-rc1-yocto-standard+ #635
[ 39.394590] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.o4
[ 39.395085] RIP: 0010:do_raw_spin_trylock+0x70/0x120
[ 39.395320] Code: 81 c7 00 f1 f1 f1 f1 c7 40 04 04 f3 f3 f3 65 48 8b 04 25 28 00 00 00 48 89 45 e0 31 c0 e8 c8 0
[ 39.396143] RSP: 0018:ffff8880072d7b30 EFLAGS: 00010046
[ 39.396381] RAX: 0000000000000000 RBX: ffffffff84d05000 RCX: dffffc0000000000
[ 39.396703] RDX: 0000000000000003 RSI: 0000000000000004 RDI: ffffffff84d05000
[ 39.397027] RBP: ffff8880072d7ba8 R08: ffffffff811d74a0 R09: fffffbfff09a0a01
[ 39.397346] R10: ffffffff84d05003 R11: fffffbfff09a0a00 R12: 1ffff11000e5af66
[ 39.397669] R13: ffffffff84d05018 R14: ffffffff84d05000 R15: ffff8880072d7cd8
[ 39.397990] FS: 0000000000000000(0000) GS:ffff888035400000(0000) knlGS:0000000000000000
[ 39.398353] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 39.398615] CR2: ffffffff84d05000 CR3: 000000000a22c000 CR4: 00000000001506f0
[ 39.398936] Call Trace:
[ 39.399053] <TASK>
[ 39.399156] ? __pfx_do_raw_spin_trylock+0x10/0x10
[ 39.399379] ? trace_preempt_off+0x2a/0x110
[ 39.399576] _raw_spin_lock+0x41/0x80
[ 39.399751] ? schedule_timeout+0x242/0x580
[ 39.399945] schedule_timeout+0x242/0x580
[ 39.400133] ? __pfx_schedule_timeout+0x10/0x10
[ 39.400346] ? __pfx_do_raw_spin_trylock+0x10/0x10
[ 39.400567] ? __pfx_process_timeout+0x10/0x10
[ 39.400776] ? _raw_spin_unlock_irqrestore+0x46/0x80
[ 39.401006] ? prepare_to_swait_event+0xb8/0x210
[ 39.401221] rcu_gp_fqs_loop+0x60b/0xd50
[ 39.401405] ? rcu_gp_init+0x89c/0x1250
[ 39.401587] ? __pfx_rcu_gp_fqs_loop+0x10/0x10
[ 39.401793] ? _raw_spin_unlock_irqrestore+0x46/0x80
[ 39.402022] rcu_gp_kthread+0x2b7/0x620
[ 39.402201] ? __pfx_do_raw_spin_trylock+0x10/0x10
[ 39.402421] ? __pfx_rcu_gp_kthread+0x10/0x10
[ 39.402625] ? __kasan_check_read+0x11/0x20
[ 39.402818] ? __kthread_parkme+0xe8/0x110
[ 39.403010] ? __pfx_rcu_gp_kthread+0x10/0x10
[ 39.403213] kthread+0x192/0x1d0
[ 39.403366] ? __pfx_kthread+0x10/0x10
[ 39.403541] ret_from_fork+0x2c/0x50
[ 39.403712] </TASK>
[ 39.403818] Modules linked in:
[ 39.403972] CR2: ffffffff84d05000
[ 39.404128] ---[ end trace 0000000000000000 ]---
[ 39.404340] RIP: 0010:do_raw_spin_trylock+0x70/0x120
[ 39.404569] Code: 81 c7 00 f1 f1 f1 f1 c7 40 04 04 f3 f3 f3 65 48 8b 04 25 28 00 00 00 48 89 45 e0 31 c0 e8 c8 0
[ 39.405400] RSP: 0018:ffff8880072d7b30 EFLAGS: 00010046
[ 39.405639] RAX: 0000000000000000 RBX: ffffffff84d05000 RCX: dffffc0000000000
[ 39.405959] RDX: 0000000000000003 RSI: 0000000000000004 RDI: ffffffff84d05000
[ 39.406281] RBP: ffff8880072d7ba8 R08: ffffffff811d74a0 R09: fffffbfff09a0a01
[ 39.406602] R10: ffffffff84d05003 R11: fffffbfff09a0a00 R12: 1ffff11000e5af66
[ 39.406922] R13: ffffffff84d05018 R14: ffffffff84d05000 R15: ffff8880072d7cd8
[ 39.407245] FS: 0000000000000000(0000) GS:ffff888035400000(0000) knlGS:0000000000000000
[ 39.407607] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 39.407871] CR2: ffffffff84d05000 CR3: 000000000a22c000 CR4: 00000000001506f0
[ 39.408195] Kernel panic - not syncing: Fatal exception
[ 39.408450] Kernel Offset: disabled
[ 39.408615] ---[ end Kernel panic - not syncing: Fatal exception ]---
After removing isolcpus=0,1 and nohz_full=0,1, the Oops no longer occurs.
Thanks
Zqiang
On Thu, Feb 09, 2023 at 02:57:42AM +0000, Zhang, Qiang1 wrote:
>
> Test based on the following branches:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git branch dev, arch X86_64
>
>
> runqemu kvm slirp nographic qemuparams="-m 1024 -smp 4" bootparams="nokaslr console=ttyS0 rcutorture.torture_type=rcu rcutorture.shuffle_interval=0 isolcpus=0,1 rcu_nocbs=0,1 nohz_full=0,1 rcutree.dump_tree=1 rcutorture.onoff_holdoff=30 rcutorture.onoff_interval=10" -d
>
> [ 39.392925] BUG: unable to handle page fault for address: ffffffff84d05000
> [ 39.393244] #PF: supervisor read access in kernel mode
> [ 39.393480] #PF: error_code(0x0000) - not-present page
> [ 39.393715] PGD 3e19067 P4D 3e19067 PUD 3e1a063 PMD 800ffffffb3ff062
> [ 39.394014] Oops: 0000 [#1] PREEMPT SMP KASAN PTI
> [ 39.394231] CPU: 0 PID: 16 Comm: rcu_preempt Not tainted 6.2.0-rc1-yocto-standard+ #635
> [ 39.394590] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.o4
> [ 39.395085] RIP: 0010:do_raw_spin_trylock+0x70/0x120
> [ 39.395320] Code: 81 c7 00 f1 f1 f1 f1 c7 40 04 04 f3 f3 f3 65 48 8b 04 25 28 00 00 00 48 89 45 e0 31 c0 e8 c8 0
> [ 39.396143] RSP: 0018:ffff8880072d7b30 EFLAGS: 00010046
> [ 39.396381] RAX: 0000000000000000 RBX: ffffffff84d05000 RCX: dffffc0000000000
> [ 39.396703] RDX: 0000000000000003 RSI: 0000000000000004 RDI: ffffffff84d05000
> [ 39.397027] RBP: ffff8880072d7ba8 R08: ffffffff811d74a0 R09: fffffbfff09a0a01
> [ 39.397346] R10: ffffffff84d05003 R11: fffffbfff09a0a00 R12: 1ffff11000e5af66
> [ 39.397669] R13: ffffffff84d05018 R14: ffffffff84d05000 R15: ffff8880072d7cd8
> [ 39.397990] FS: 0000000000000000(0000) GS:ffff888035400000(0000) knlGS:0000000000000000
> [ 39.398353] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 39.398615] CR2: ffffffff84d05000 CR3: 000000000a22c000 CR4: 00000000001506f0
> [ 39.398936] Call Trace:
> [ 39.399053] <TASK>
> [ 39.399156] ? __pfx_do_raw_spin_trylock+0x10/0x10
> [ 39.399379] ? trace_preempt_off+0x2a/0x110
> [ 39.399576] _raw_spin_lock+0x41/0x80
> [ 39.399751] ? schedule_timeout+0x242/0x580
> [ 39.399945] schedule_timeout+0x242/0x580
> [ 39.400133] ? __pfx_schedule_timeout+0x10/0x10
> [ 39.400346] ? __pfx_do_raw_spin_trylock+0x10/0x10
> [ 39.400567] ? __pfx_process_timeout+0x10/0x10
> [ 39.400776] ? _raw_spin_unlock_irqrestore+0x46/0x80
> [ 39.401006] ? prepare_to_swait_event+0xb8/0x210
> [ 39.401221] rcu_gp_fqs_loop+0x60b/0xd50
> [ 39.401405] ? rcu_gp_init+0x89c/0x1250
> [ 39.401587] ? __pfx_rcu_gp_fqs_loop+0x10/0x10
> [ 39.401793] ? _raw_spin_unlock_irqrestore+0x46/0x80
> [ 39.402022] rcu_gp_kthread+0x2b7/0x620
> [ 39.402201] ? __pfx_do_raw_spin_trylock+0x10/0x10
> [ 39.402421] ? __pfx_rcu_gp_kthread+0x10/0x10
> [ 39.402625] ? __kasan_check_read+0x11/0x20
> [ 39.402818] ? __kthread_parkme+0xe8/0x110
> [ 39.403010] ? __pfx_rcu_gp_kthread+0x10/0x10
> [ 39.403213] kthread+0x192/0x1d0
> [ 39.403366] ? __pfx_kthread+0x10/0x10
> [ 39.403541] ret_from_fork+0x2c/0x50
> [ 39.403712] </TASK>
> [ 39.403818] Modules linked in:
> [ 39.403972] CR2: ffffffff84d05000
> [ 39.404128] ---[ end trace 0000000000000000 ]---
> [ 39.404340] RIP: 0010:do_raw_spin_trylock+0x70/0x120
> [ 39.404569] Code: 81 c7 00 f1 f1 f1 f1 c7 40 04 04 f3 f3 f3 65 48 8b 04 25 28 00 00 00 48 89 45 e0 31 c0 e8 c8 0
> [ 39.405400] RSP: 0018:ffff8880072d7b30 EFLAGS: 00010046
> [ 39.405639] RAX: 0000000000000000 RBX: ffffffff84d05000 RCX: dffffc0000000000
> [ 39.405959] RDX: 0000000000000003 RSI: 0000000000000004 RDI: ffffffff84d05000
> [ 39.406281] RBP: ffff8880072d7ba8 R08: ffffffff811d74a0 R09: fffffbfff09a0a01
> [ 39.406602] R10: ffffffff84d05003 R11: fffffbfff09a0a00 R12: 1ffff11000e5af66
> [ 39.406922] R13: ffffffff84d05018 R14: ffffffff84d05000 R15: ffff8880072d7cd8
> [ 39.407245] FS: 0000000000000000(0000) GS:ffff888035400000(0000) knlGS:0000000000000000
> [ 39.407607] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 39.407871] CR2: ffffffff84d05000 CR3: 000000000a22c000 CR4: 00000000001506f0
> [ 39.408195] Kernel panic - not syncing: Fatal exception
> [ 39.408450] Kernel Offset: disabled
> [ 39.408615] ---[ end Kernel panic - not syncing: Fatal exception ]---
>
> After remove isolcpus=0,1 and nohz_full=0,1, there is no Oops.
That certainly isn't what we want the kernel to be doing! ;-)
Could you please try bisecting?
Thanx, Paul
> Test based on the following branches:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git branch dev, arch X86_64
>
>
> runqemu kvm slirp nographic qemuparams="-m 1024 -smp 4" bootparams="nokaslr console=ttyS0 rcutorture.torture_type=rcu rcutorture.shuffle_interval=0 isolcpus=0,1 rcu_nocbs=0,1 nohz_full=0,1 rcutree.dump_tree=1 rcutorture.onoff_holdoff=30 rcutorture.onoff_interval=10" -d
>
> [ 39.392925] BUG: unable to handle page fault for address: ffffffff84d05000
> [ 39.393244] #PF: supervisor read access in kernel mode
> [ 39.393480] #PF: error_code(0x0000) - not-present page
> [ 39.393715] PGD 3e19067 P4D 3e19067 PUD 3e1a063 PMD 800ffffffb3ff062
> [ 39.394014] Oops: 0000 [#1] PREEMPT SMP KASAN PTI
> [ 39.394231] CPU: 0 PID: 16 Comm: rcu_preempt Not tainted 6.2.0-rc1-yocto-standard+ #635
> [ 39.394590] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.o4
> [ 39.395085] RIP: 0010:do_raw_spin_trylock+0x70/0x120
> [ 39.395320] Code: 81 c7 00 f1 f1 f1 f1 c7 40 04 04 f3 f3 f3 65 48 8b 04 25 28 00 00 00 48 89 45 e0 31 c0 e8 c8 0
> [ 39.396143] RSP: 0018:ffff8880072d7b30 EFLAGS: 00010046
> [ 39.396381] RAX: 0000000000000000 RBX: ffffffff84d05000 RCX: dffffc0000000000
> [ 39.396703] RDX: 0000000000000003 RSI: 0000000000000004 RDI: ffffffff84d05000
> [ 39.397027] RBP: ffff8880072d7ba8 R08: ffffffff811d74a0 R09: fffffbfff09a0a01
> [ 39.397346] R10: ffffffff84d05003 R11: fffffbfff09a0a00 R12: 1ffff11000e5af66
> [ 39.397669] R13: ffffffff84d05018 R14: ffffffff84d05000 R15: ffff8880072d7cd8
> [ 39.397990] FS: 0000000000000000(0000) GS:ffff888035400000(0000) knlGS:0000000000000000
> [ 39.398353] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 39.398615] CR2: ffffffff84d05000 CR3: 000000000a22c000 CR4: 00000000001506f0
> [ 39.398936] Call Trace:
> [ 39.399053] <TASK>
> [ 39.399156] ? __pfx_do_raw_spin_trylock+0x10/0x10
> [ 39.399379] ? trace_preempt_off+0x2a/0x110
> [ 39.399576] _raw_spin_lock+0x41/0x80
> [ 39.399751] ? schedule_timeout+0x242/0x580
> [ 39.399945] schedule_timeout+0x242/0x580
> [ 39.400133] ? __pfx_schedule_timeout+0x10/0x10
> [ 39.400346] ? __pfx_do_raw_spin_trylock+0x10/0x10
> [ 39.400567] ? __pfx_process_timeout+0x10/0x10
> [ 39.400776] ? _raw_spin_unlock_irqrestore+0x46/0x80
> [ 39.401006] ? prepare_to_swait_event+0xb8/0x210
> [ 39.401221] rcu_gp_fqs_loop+0x60b/0xd50
> [ 39.401405] ? rcu_gp_init+0x89c/0x1250
> [ 39.401587] ? __pfx_rcu_gp_fqs_loop+0x10/0x10
> [ 39.401793] ? _raw_spin_unlock_irqrestore+0x46/0x80
> [ 39.402022] rcu_gp_kthread+0x2b7/0x620
> [ 39.402201] ? __pfx_do_raw_spin_trylock+0x10/0x10
> [ 39.402421] ? __pfx_rcu_gp_kthread+0x10/0x10
> [ 39.402625] ? __kasan_check_read+0x11/0x20
> [ 39.402818] ? __kthread_parkme+0xe8/0x110
> [ 39.403010] ? __pfx_rcu_gp_kthread+0x10/0x10
> [ 39.403213] kthread+0x192/0x1d0
> [ 39.403366] ? __pfx_kthread+0x10/0x10
> [ 39.403541] ret_from_fork+0x2c/0x50
> [ 39.403712] </TASK>
> [ 39.403818] Modules linked in:
> [ 39.403972] CR2: ffffffff84d05000
> [ 39.404128] ---[ end trace 0000000000000000 ]---
> [ 39.404340] RIP: 0010:do_raw_spin_trylock+0x70/0x120
> [ 39.404569] Code: 81 c7 00 f1 f1 f1 f1 c7 40 04 04 f3 f3 f3 65 48 8b 04 25 28 00 00 00 48 89 45 e0 31 c0 e8 c8 0
> [ 39.405400] RSP: 0018:ffff8880072d7b30 EFLAGS: 00010046
> [ 39.405639] RAX: 0000000000000000 RBX: ffffffff84d05000 RCX: dffffc0000000000
> [ 39.405959] RDX: 0000000000000003 RSI: 0000000000000004 RDI: ffffffff84d05000
> [ 39.406281] RBP: ffff8880072d7ba8 R08: ffffffff811d74a0 R09: fffffbfff09a0a01
> [ 39.406602] R10: ffffffff84d05003 R11: fffffbfff09a0a00 R12: 1ffff11000e5af66
> [ 39.406922] R13: ffffffff84d05018 R14: ffffffff84d05000 R15: ffff8880072d7cd8
> [ 39.407245] FS: 0000000000000000(0000) GS:ffff888035400000(0000) knlGS:0000000000000000
> [ 39.407607] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 39.407871] CR2: ffffffff84d05000 CR3: 000000000a22c000 CR4: 00000000001506f0
> [ 39.408195] Kernel panic - not syncing: Fatal exception
> [ 39.408450] Kernel Offset: disabled
> [ 39.408615] ---[ end Kernel panic - not syncing: Fatal exception ]---
>
> After remove isolcpus=0,1 and nohz_full=0,1, there is no Oops.
>
The Oops can also be reproduced with only nohz_full=0,1 (and rcu_nocbs=0,1) in the bootparams, by offlining CPUs 2 and 3:
runqemu kvm slirp nographic qemuparams="-m 1024 -smp 4" bootparams="console=ttyS0 nohz_full=0,1 rcu_nocbs=0,1 sched_verbose" -d
root@qemux86-64:~# echo 0 > /sys/devices/system/cpu/cpu2/online
root@qemux86-64:~# echo 0 > /sys/devices/system/cpu/cpu3/online
schedule_timeout()
->__mod_timer()
->get_target_base(base, timer->flags)
->get_timer_cpu_base(tflags, get_nohz_timer_target());
->cpu = get_nohz_timer_target()
->housekeeping_any_cpu(HK_TYPE_TIMER)
/*housekeeping.cpumasks[type] is 2-3*/
/*cpu_online_mask is 0-1*/
->cpu = cpumask_any_and(housekeeping.cpumasks[type],
cpu_online_mask);
/*the two masks do not intersect, so cpu is 4, i.e. >= nr_cpu_ids*/
->new_base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
/*new_base is an invalid address, because cpu is not a valid CPU number*/
->if (base != new_base)
->raw_spin_lock(&new_base->lock); ==> trigger Oops
The following change fixes the Oops:
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 373d42c707bc..e255eb83f14f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -46,7 +46,9 @@ int housekeeping_any_cpu(enum hk_type type)
if (cpu < nr_cpu_ids)
return cpu;
- return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+ cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ return smp_processor_id();
}
}
return smp_processor_id();
I have sent a patch.
Thanks
Zqiang
>That certainly isn't what we want the kernel to be doing! ;-)
>
>Could you please try bisecting?
>
> Thanx, Paul