2024-01-30 07:24:32

by Oliver Sang

[permalink] [raw]
Subject: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address



Hello,

(besides a previous performance report),
kernel test robot noticed "BUG:kernel_NULL_pointer_dereference,address" on:

commit: 2227a957e1d5b1941be4e4207879ec74f4bb37f8 ("sched/eevdf: Sort the rbtree by virtual deadline")
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master

[test failed on linus/master 3a5879d495b226d0404098e3564462d5f1daa33b]
[test failed on linux-next/master 01af33cc9894b4489fb68fa35c40e9fe85df63dc]

in testcase: trinity
version: trinity-i386-abe9de86-1_20230429
with following parameters:

runtime: 300s
group: group-03
nr_groups: 5

test-description: Trinity is a linux system call fuzz tester.
test-url: http://codemonkey.org.uk/projects/trinity/


compiler: clang-17
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)



we found this issue happens in very random way (23 out of 999 runs).
but keeps clean on parent.

84db47ca7146d7bd 2227a957e1d5b1941be4e420787
---------------- ---------------------------
fail:runs %reproduction fail:runs
| | |
:999 2% 23:999 dmesg.BUG:kernel_NULL_pointer_dereference,address
:999 2% 23:999 dmesg.Kernel_panic-not_syncing:Fatal_exception
:999 2% 23:999 dmesg.Oops:#[##]




If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-lkp/[email protected]


sorry for below parse failure which caused no real line numbers.
we will follow further. the orgial dmesg could be fetch from below link.


[ 512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
[ 512.080897][ T8305] #PF: supervisor read access in kernel mode
[ 512.081636][ T8305] #PF: error_code(0x0000) - not-present page
[ 512.082337][ T8305] *pde = 00000000
[ 512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
[ 512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G W N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
[ 512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
[ 512.086881][ T8305] Code: 89 d6 89 cf 85 d2 74 22 39 77 58 75 1d 89 f0 90 90 90 90 90 8b 48 70 39 41 58 75 0e c7 41 58 00 00 00 00 8b 40 6c 85 c0 75 ea <83> 7e 2c 00 74 28 3e 8d 74 26 00 89 f9 89 f2 e8 36 59 00 00 89 f9
All code
========
0: 89 d6 mov %edx,%esi
2: 89 cf mov %ecx,%edi
4: 85 d2 test %edx,%edx
6: 74 22 je 0x2a
8: 39 77 58 cmp %esi,0x58(%rdi)
b: 75 1d jne 0x2a
d: 89 f0 mov %esi,%eax
f: 90 nop
10: 90 nop
11: 90 nop
12: 90 nop
13: 90 nop
14: 8b 48 70 mov 0x70(%rax),%ecx
17: 39 41 58 cmp %eax,0x58(%rcx)
1a: 75 0e jne 0x2a
1c: c7 41 58 00 00 00 00 movl $0x0,0x58(%rcx)
23: 8b 40 6c mov 0x6c(%rax),%eax
26: 85 c0 test %eax,%eax
28: 75 ea jne 0x14
2a:* 83 7e 2c 00 cmpl $0x0,0x2c(%rsi) <-- trapping instruction
2e: 74 28 je 0x58
30: 3e 8d 74 26 00 lea %ds:0x0(%rsi,%riz,1),%esi
35: 89 f9 mov %edi,%ecx
37: 89 f2 mov %esi,%edx
39: e8 36 59 00 00 callq 0x5974
3e: 89 f9 mov %edi,%ecx

Code starting with the faulting instruction
===========================================
0: 83 7e 2c 00 cmpl $0x0,0x2c(%rsi)
4: 74 28 je 0x2e
6: 3e 8d 74 26 00 lea %ds:0x0(%rsi,%riz,1),%esi
b: 89 f9 mov %edi,%ecx
d: 89 f2 mov %esi,%edx
f: e8 36 59 00 00 callq 0x594a
14: 89 f9 mov %edi,%ecx
[ 512.089261][ T8305] EAX: 00000000 EBX: e75ff100 ECX: e75ff100 EDX: 00000000
[ 512.090180][ T8305] ESI: 00000000 EDI: e75ff100 EBP: ed229dd0 ESP: ed229dc0
[ 512.091097][ T8305] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 EFLAGS: 00010046
[ 512.092138][ T8305] CR0: 80050033 CR2: 0000002c CR3: 0a508000 CR4: 000406d0
[ 512.093076][ T8305] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[ 512.094004][ T8305] DR6: fffe0ff0 DR7: 00000400
[ 512.094643][ T8305] Call Trace:
[ 512.095145][ T8305] ? __die_body (dumpstack.c:?)
[ 512.095755][ T8305] ? __die (??:?)
[ 512.096309][ T8305] ? page_fault_oops (fault.c:?)
[ 512.096994][ T8305] ? __lock_acquire (lockdep.c:?)
[ 512.097677][ T8305] ? kmemleak_alloc (??:?)
[ 512.098343][ T8305] ? kernelmode_fixup_or_oops (fault.c:?)
[ 512.099085][ T8305] ? __bad_area_nosemaphore (fault.c:?)
[ 512.099839][ T8305] ? bad_area_nosemaphore (fault.c:?)
[ 512.100579][ T8305] ? do_user_addr_fault (fault.c:?)
[ 512.101325][ T8305] ? exc_page_fault (??:?)
[ 512.101992][ T8305] ? pvclock_clocksource_read_nowd (??:?)
[ 512.102799][ T8305] ? handle_exception (init_task.c:?)
[ 512.103492][ T8305] ? yield_to_task_fair (fair.c:?)
[ 512.104166][ T8305] ? pvclock_clocksource_read_nowd (??:?)
[ 512.104975][ T8305] ? set_next_entity (fair.c:?)
[ 512.105655][ T8305] ? pvclock_clocksource_read_nowd (??:?)
[ 512.106447][ T8305] ? set_next_entity (fair.c:?)
[ 512.107106][ T8305] pick_next_task_fair (??:?)
[ 512.107788][ T8305] ? put_prev_task_rt (build_policy.c:?)
[ 512.108460][ T8305] __pick_next_task_fair (fair.c:?)
[ 512.109162][ T8305] __schedule (core.c:?)
[ 512.109753][ T8305] schedule (??:?)
[ 512.110284][ T8305] do_nanosleep (hrtimer.c:?)
[ 512.110900][ T8305] hrtimer_nanosleep (??:?)
[ 512.111581][ T8305] ? __remove_hrtimer (hrtimer.c:?)
[ 512.112264][ T8305] common_nsleep (posix-timers.c:?)
[ 512.112891][ T8305] __ia32_sys_clock_nanosleep (??:?)
[ 512.113657][ T8305] ? syscall_enter_from_user_mode_work (??:?)
[ 512.114486][ T8305] __do_fast_syscall_32 (common.c:?)
[ 512.115180][ T8305] ? irqentry_exit_to_user_mode (??:?)
[ 512.115936][ T8305] ? irqentry_exit_to_user_mode (??:?)
[ 512.116690][ T8305] do_fast_syscall_32 (??:?)
[ 512.117386][ T8305] do_SYSENTER_32 (??:?)
[ 512.118022][ T8305] entry_SYSENTER_32 (??:?)
[ 512.118645][ T8305] EIP: 0xb7edf539
[ 512.119172][ T8305] Code: 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 0f 1f 00 58 b8 77 00 00 00 cd 80 90 0f 1f
All code
========
0: 03 74 b4 01 add 0x1(%rsp,%rsi,4),%esi
4: 10 07 adc %al,(%rdi)
6: 03 74 b0 01 add 0x1(%rax,%rsi,4),%esi
a: 10 08 adc %cl,(%rax)
c: 03 74 d8 01 add 0x1(%rax,%rbx,8),%esi
...
20: 00 51 52 add %dl,0x52(%rcx)
23: 55 push %rbp
24:* 89 e5 mov %esp,%ebp <-- trapping instruction
26: 0f 34 sysenter
28: cd 80 int $0x80
2a: 5d pop %rbp
2b: 5a pop %rdx
2c: 59 pop %rcx
2d: c3 retq
2e: 90 nop
2f: 90 nop
30: 90 nop
31: 90 nop
32: 0f 1f 00 nopl (%rax)
35: 58 pop %rax
36: b8 77 00 00 00 mov $0x77,%eax
3b: cd 80 int $0x80
3d: 90 nop
3e: 0f .byte 0xf
3f: 1f (bad)

Code starting with the faulting instruction
===========================================
0: 5d pop %rbp
1: 5a pop %rdx
2: 59 pop %rcx
3: c3 retq
4: 90 nop
5: 90 nop
6: 90 nop
7: 90 nop
8: 0f 1f 00 nopl (%rax)
b: 58 pop %rax
c: b8 77 00 00 00 mov $0x77,%eax
11: cd 80 int $0x80
13: 90 nop
14: 0f .byte 0xf
15: 1f (bad)


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240130/[email protected]



--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki



2024-01-30 10:38:32

by Abel Wu

[permalink] [raw]
Subject: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

On 1/30/24 3:24 PM, kernel test robot Wrote:
>
>
> Hello,
>
> (besides a previous performance report),
> kernel test robot noticed "BUG:kernel_NULL_pointer_dereference,address" on:
>
> commit: 2227a957e1d5b1941be4e4207879ec74f4bb37f8 ("sched/eevdf: Sort the rbtree by virtual deadline")
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
>
> [test failed on linus/master 3a5879d495b226d0404098e3564462d5f1daa33b]
> [test failed on linux-next/master 01af33cc9894b4489fb68fa35c40e9fe85df63dc]
>
> in testcase: trinity
> version: trinity-i386-abe9de86-1_20230429
> with following parameters:
>
> runtime: 300s
> group: group-03
> nr_groups: 5
>
> test-description: Trinity is a linux system call fuzz tester.
> test-url: http://codemonkey.org.uk/projects/trinity/
>
>
> compiler: clang-17
> test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
>
> (please refer to attached dmesg/kmsg for entire log/backtrace)
>
>
>
> we found this issue happens in very random way (23 out of 999 runs).
> but keeps clean on parent.

Thanks for reporting, I will try to reproduce the issue. Does the 'parent'
mean the same code branch without this commit?

>
> 84db47ca7146d7bd 2227a957e1d5b1941be4e420787
> ---------------- ---------------------------
> fail:runs %reproduction fail:runs
> | | |
> :999 2% 23:999 dmesg.BUG:kernel_NULL_pointer_dereference,address
> :999 2% 23:999 dmesg.Kernel_panic-not_syncing:Fatal_exception
> :999 2% 23:999 dmesg.Oops:#[##]
>
>
>
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <[email protected]>
> | Closes: https://lore.kernel.org/oe-lkp/[email protected]
>
>
> sorry for below parse failure which caused no real line numbers.
> we will follow further. the orgial dmesg could be fetch from below link.
>
>
> [ 512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
> [ 512.080897][ T8305] #PF: supervisor read access in kernel mode
> [ 512.081636][ T8305] #PF: error_code(0x0000) - not-present page
> [ 512.082337][ T8305] *pde = 00000000
> [ 512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
> [ 512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G W N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
> [ 512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)

There was actually a NULL-test in pick_eevdf() before this commit,
but I removed it by intent as I found it impossible to be NULL after
examining 'all' the cases.

Also cc Tiwei who once proposed to add this check back.
https://lore.kernel.org/all/[email protected]/

Thanks,
Abel

> [ 512.086881][ T8305] Code: 89 d6 89 cf 85 d2 74 22 39 77 58 75 1d 89 f0 90 90 90 90 90 8b 48 70 39 41 58 75 0e c7 41 58 00 00 00 00 8b 40 6c 85 c0 75 ea <83> 7e 2c 00 74 28 3e 8d 74 26 00 89 f9 89 f2 e8 36 59 00 00 89 f9
> All code
> ========
> 0: 89 d6 mov %edx,%esi
> 2: 89 cf mov %ecx,%edi
> 4: 85 d2 test %edx,%edx
> 6: 74 22 je 0x2a
> 8: 39 77 58 cmp %esi,0x58(%rdi)
> b: 75 1d jne 0x2a
> d: 89 f0 mov %esi,%eax
> f: 90 nop
> 10: 90 nop
> 11: 90 nop
> 12: 90 nop
> 13: 90 nop
> 14: 8b 48 70 mov 0x70(%rax),%ecx
> 17: 39 41 58 cmp %eax,0x58(%rcx)
> 1a: 75 0e jne 0x2a
> 1c: c7 41 58 00 00 00 00 movl $0x0,0x58(%rcx)
> 23: 8b 40 6c mov 0x6c(%rax),%eax
> 26: 85 c0 test %eax,%eax
> 28: 75 ea jne 0x14
> 2a:* 83 7e 2c 00 cmpl $0x0,0x2c(%rsi) <-- trapping instruction
> 2e: 74 28 je 0x58
> 30: 3e 8d 74 26 00 lea %ds:0x0(%rsi,%riz,1),%esi
> 35: 89 f9 mov %edi,%ecx
> 37: 89 f2 mov %esi,%edx
> 39: e8 36 59 00 00 callq 0x5974
> 3e: 89 f9 mov %edi,%ecx
>
> Code starting with the faulting instruction
> ===========================================
> 0: 83 7e 2c 00 cmpl $0x0,0x2c(%rsi)
> 4: 74 28 je 0x2e
> 6: 3e 8d 74 26 00 lea %ds:0x0(%rsi,%riz,1),%esi
> b: 89 f9 mov %edi,%ecx
> d: 89 f2 mov %esi,%edx
> f: e8 36 59 00 00 callq 0x594a
> 14: 89 f9 mov %edi,%ecx
> [ 512.089261][ T8305] EAX: 00000000 EBX: e75ff100 ECX: e75ff100 EDX: 00000000
> [ 512.090180][ T8305] ESI: 00000000 EDI: e75ff100 EBP: ed229dd0 ESP: ed229dc0
> [ 512.091097][ T8305] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 EFLAGS: 00010046
> [ 512.092138][ T8305] CR0: 80050033 CR2: 0000002c CR3: 0a508000 CR4: 000406d0
> [ 512.093076][ T8305] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
> [ 512.094004][ T8305] DR6: fffe0ff0 DR7: 00000400
> [ 512.094643][ T8305] Call Trace:
> [ 512.095145][ T8305] ? __die_body (dumpstack.c:?)
> [ 512.095755][ T8305] ? __die (??:?)
> [ 512.096309][ T8305] ? page_fault_oops (fault.c:?)
> [ 512.096994][ T8305] ? __lock_acquire (lockdep.c:?)
> [ 512.097677][ T8305] ? kmemleak_alloc (??:?)
> [ 512.098343][ T8305] ? kernelmode_fixup_or_oops (fault.c:?)
> [ 512.099085][ T8305] ? __bad_area_nosemaphore (fault.c:?)
> [ 512.099839][ T8305] ? bad_area_nosemaphore (fault.c:?)
> [ 512.100579][ T8305] ? do_user_addr_fault (fault.c:?)
> [ 512.101325][ T8305] ? exc_page_fault (??:?)
> [ 512.101992][ T8305] ? pvclock_clocksource_read_nowd (??:?)
> [ 512.102799][ T8305] ? handle_exception (init_task.c:?)
> [ 512.103492][ T8305] ? yield_to_task_fair (fair.c:?)
> [ 512.104166][ T8305] ? pvclock_clocksource_read_nowd (??:?)
> [ 512.104975][ T8305] ? set_next_entity (fair.c:?)
> [ 512.105655][ T8305] ? pvclock_clocksource_read_nowd (??:?)
> [ 512.106447][ T8305] ? set_next_entity (fair.c:?)
> [ 512.107106][ T8305] pick_next_task_fair (??:?)
> [ 512.107788][ T8305] ? put_prev_task_rt (build_policy.c:?)
> [ 512.108460][ T8305] __pick_next_task_fair (fair.c:?)
> [ 512.109162][ T8305] __schedule (core.c:?)
> [ 512.109753][ T8305] schedule (??:?)
> [ 512.110284][ T8305] do_nanosleep (hrtimer.c:?)
> [ 512.110900][ T8305] hrtimer_nanosleep (??:?)
> [ 512.111581][ T8305] ? __remove_hrtimer (hrtimer.c:?)
> [ 512.112264][ T8305] common_nsleep (posix-timers.c:?)
> [ 512.112891][ T8305] __ia32_sys_clock_nanosleep (??:?)
> [ 512.113657][ T8305] ? syscall_enter_from_user_mode_work (??:?)
> [ 512.114486][ T8305] __do_fast_syscall_32 (common.c:?)
> [ 512.115180][ T8305] ? irqentry_exit_to_user_mode (??:?)
> [ 512.115936][ T8305] ? irqentry_exit_to_user_mode (??:?)
> [ 512.116690][ T8305] do_fast_syscall_32 (??:?)
> [ 512.117386][ T8305] do_SYSENTER_32 (??:?)
> [ 512.118022][ T8305] entry_SYSENTER_32 (??:?)
> [ 512.118645][ T8305] EIP: 0xb7edf539
> [ 512.119172][ T8305] Code: 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 0f 1f 00 58 b8 77 00 00 00 cd 80 90 0f 1f
> All code
> ========
> 0: 03 74 b4 01 add 0x1(%rsp,%rsi,4),%esi
> 4: 10 07 adc %al,(%rdi)
> 6: 03 74 b0 01 add 0x1(%rax,%rsi,4),%esi
> a: 10 08 adc %cl,(%rax)
> c: 03 74 d8 01 add 0x1(%rax,%rbx,8),%esi
> ...
> 20: 00 51 52 add %dl,0x52(%rcx)
> 23: 55 push %rbp
> 24:* 89 e5 mov %esp,%ebp <-- trapping instruction
> 26: 0f 34 sysenter
> 28: cd 80 int $0x80
> 2a: 5d pop %rbp
> 2b: 5a pop %rdx
> 2c: 59 pop %rcx
> 2d: c3 retq
> 2e: 90 nop
> 2f: 90 nop
> 30: 90 nop
> 31: 90 nop
> 32: 0f 1f 00 nopl (%rax)
> 35: 58 pop %rax
> 36: b8 77 00 00 00 mov $0x77,%eax
> 3b: cd 80 int $0x80
> 3d: 90 nop
> 3e: 0f .byte 0xf
> 3f: 1f (bad)
>
> Code starting with the faulting instruction
> ===========================================
> 0: 5d pop %rbp
> 1: 5a pop %rdx
> 2: 59 pop %rcx
> 3: c3 retq
> 4: 90 nop
> 5: 90 nop
> 6: 90 nop
> 7: 90 nop
> 8: 0f 1f 00 nopl (%rax)
> b: 58 pop %rax
> c: b8 77 00 00 00 mov $0x77,%eax
> 11: cd 80 int $0x80
> 13: 90 nop
> 14: 0f .byte 0xf
> 15: 1f (bad)
>
>
> The kernel config and materials to reproduce are available at:
> https://download.01.org/0day-ci/archive/20240130/[email protected]
>
>
>

2024-01-30 14:10:41

by Oliver Sang

[permalink] [raw]
Subject: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

hi, Abel,

On Tue, Jan 30, 2024 at 06:13:32PM +0800, Abel Wu wrote:
> On 1/30/24 3:24 PM, kernel test robot Wrote:
> >
> >
> > Hello,
> >
> > (besides a previous performance report),
> > kernel test robot noticed "BUG:kernel_NULL_pointer_dereference,address" on:
> >
> > commit: 2227a957e1d5b1941be4e4207879ec74f4bb37f8 ("sched/eevdf: Sort the rbtree by virtual deadline")
> > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
> >
> > [test failed on linus/master 3a5879d495b226d0404098e3564462d5f1daa33b]
> > [test failed on linux-next/master 01af33cc9894b4489fb68fa35c40e9fe85df63dc]
> >
> > in testcase: trinity
> > version: trinity-i386-abe9de86-1_20230429
> > with following parameters:
> >
> > runtime: 300s
> > group: group-03
> > nr_groups: 5
> >
> > test-description: Trinity is a linux system call fuzz tester.
> > test-url: http://codemonkey.org.uk/projects/trinity/
> >
> >
> > compiler: clang-17
> > test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
> >
> > (please refer to attached dmesg/kmsg for entire log/backtrace)
> >
> >
> >
> > we found this issue happens in very random way (23 out of 999 runs).
> > but keeps clean on parent.
>
> Thanks for reporting, I will try to reproduce the issue. Does the 'parent'
> mean the same code branch without this commit?


it just means the parent in git log.

xsang@inn:/c/repo/linux$ git log --oneline --graph 2227a957e1
* 2227a957e1d5b sched/eevdf: Sort the rbtree by virtual deadline
* 84db47ca7146d sched/numa: Fix mm numa_scan_seq based unconditional scan <-- parent


>
> >
> > 84db47ca7146d7bd 2227a957e1d5b1941be4e420787
> > ---------------- ---------------------------
> > fail:runs %reproduction fail:runs
> > | | |
> > :999 2% 23:999 dmesg.BUG:kernel_NULL_pointer_dereference,address
> > :999 2% 23:999 dmesg.Kernel_panic-not_syncing:Fatal_exception
> > :999 2% 23:999 dmesg.Oops:#[##]
> >
> >
> >
> >
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <[email protected]>
> > | Closes: https://lore.kernel.org/oe-lkp/[email protected]
> >
> >
> > sorry for below parse failure which caused no real line numbers.
> > we will follow further. the orgial dmesg could be fetch from below link.
> >
> >
> > [ 512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
> > [ 512.080897][ T8305] #PF: supervisor read access in kernel mode
> > [ 512.081636][ T8305] #PF: error_code(0x0000) - not-present page
> > [ 512.082337][ T8305] *pde = 00000000
> > [ 512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
> > [ 512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G W N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
> > [ 512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> > [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
>
> There was actually a NULL-test in pick_eevdf() before this commit,
> but I removed it by intent as I found it impossible to be NULL after
> examining 'all' the cases.
>
> Also cc Tiwei who once proposed to add this check back.
> https://lore.kernel.org/all/[email protected]/
>
> Thanks,
> Abel
>
> > [ 512.086881][ T8305] Code: 89 d6 89 cf 85 d2 74 22 39 77 58 75 1d 89 f0 90 90 90 90 90 8b 48 70 39 41 58 75 0e c7 41 58 00 00 00 00 8b 40 6c 85 c0 75 ea <83> 7e 2c 00 74 28 3e 8d 74 26 00 89 f9 89 f2 e8 36 59 00 00 89 f9
> > All code
> > ========
> > 0: 89 d6 mov %edx,%esi
> > 2: 89 cf mov %ecx,%edi
> > 4: 85 d2 test %edx,%edx
> > 6: 74 22 je 0x2a
> > 8: 39 77 58 cmp %esi,0x58(%rdi)
> > b: 75 1d jne 0x2a
> > d: 89 f0 mov %esi,%eax
> > f: 90 nop
> > 10: 90 nop
> > 11: 90 nop
> > 12: 90 nop
> > 13: 90 nop
> > 14: 8b 48 70 mov 0x70(%rax),%ecx
> > 17: 39 41 58 cmp %eax,0x58(%rcx)
> > 1a: 75 0e jne 0x2a
> > 1c: c7 41 58 00 00 00 00 movl $0x0,0x58(%rcx)
> > 23: 8b 40 6c mov 0x6c(%rax),%eax
> > 26: 85 c0 test %eax,%eax
> > 28: 75 ea jne 0x14
> > 2a:* 83 7e 2c 00 cmpl $0x0,0x2c(%rsi) <-- trapping instruction
> > 2e: 74 28 je 0x58
> > 30: 3e 8d 74 26 00 lea %ds:0x0(%rsi,%riz,1),%esi
> > 35: 89 f9 mov %edi,%ecx
> > 37: 89 f2 mov %esi,%edx
> > 39: e8 36 59 00 00 callq 0x5974
> > 3e: 89 f9 mov %edi,%ecx
> >
> > Code starting with the faulting instruction
> > ===========================================
> > 0: 83 7e 2c 00 cmpl $0x0,0x2c(%rsi)
> > 4: 74 28 je 0x2e
> > 6: 3e 8d 74 26 00 lea %ds:0x0(%rsi,%riz,1),%esi
> > b: 89 f9 mov %edi,%ecx
> > d: 89 f2 mov %esi,%edx
> > f: e8 36 59 00 00 callq 0x594a
> > 14: 89 f9 mov %edi,%ecx
> > [ 512.089261][ T8305] EAX: 00000000 EBX: e75ff100 ECX: e75ff100 EDX: 00000000
> > [ 512.090180][ T8305] ESI: 00000000 EDI: e75ff100 EBP: ed229dd0 ESP: ed229dc0
> > [ 512.091097][ T8305] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 EFLAGS: 00010046
> > [ 512.092138][ T8305] CR0: 80050033 CR2: 0000002c CR3: 0a508000 CR4: 000406d0
> > [ 512.093076][ T8305] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
> > [ 512.094004][ T8305] DR6: fffe0ff0 DR7: 00000400
> > [ 512.094643][ T8305] Call Trace:
> > [ 512.095145][ T8305] ? __die_body (dumpstack.c:?)
> > [ 512.095755][ T8305] ? __die (??:?)
> > [ 512.096309][ T8305] ? page_fault_oops (fault.c:?)
> > [ 512.096994][ T8305] ? __lock_acquire (lockdep.c:?)
> > [ 512.097677][ T8305] ? kmemleak_alloc (??:?)
> > [ 512.098343][ T8305] ? kernelmode_fixup_or_oops (fault.c:?)
> > [ 512.099085][ T8305] ? __bad_area_nosemaphore (fault.c:?)
> > [ 512.099839][ T8305] ? bad_area_nosemaphore (fault.c:?)
> > [ 512.100579][ T8305] ? do_user_addr_fault (fault.c:?)
> > [ 512.101325][ T8305] ? exc_page_fault (??:?)
> > [ 512.101992][ T8305] ? pvclock_clocksource_read_nowd (??:?)
> > [ 512.102799][ T8305] ? handle_exception (init_task.c:?)
> > [ 512.103492][ T8305] ? yield_to_task_fair (fair.c:?)
> > [ 512.104166][ T8305] ? pvclock_clocksource_read_nowd (??:?)
> > [ 512.104975][ T8305] ? set_next_entity (fair.c:?)
> > [ 512.105655][ T8305] ? pvclock_clocksource_read_nowd (??:?)
> > [ 512.106447][ T8305] ? set_next_entity (fair.c:?)
> > [ 512.107106][ T8305] pick_next_task_fair (??:?)
> > [ 512.107788][ T8305] ? put_prev_task_rt (build_policy.c:?)
> > [ 512.108460][ T8305] __pick_next_task_fair (fair.c:?)
> > [ 512.109162][ T8305] __schedule (core.c:?)
> > [ 512.109753][ T8305] schedule (??:?)
> > [ 512.110284][ T8305] do_nanosleep (hrtimer.c:?)
> > [ 512.110900][ T8305] hrtimer_nanosleep (??:?)
> > [ 512.111581][ T8305] ? __remove_hrtimer (hrtimer.c:?)
> > [ 512.112264][ T8305] common_nsleep (posix-timers.c:?)
> > [ 512.112891][ T8305] __ia32_sys_clock_nanosleep (??:?)
> > [ 512.113657][ T8305] ? syscall_enter_from_user_mode_work (??:?)
> > [ 512.114486][ T8305] __do_fast_syscall_32 (common.c:?)
> > [ 512.115180][ T8305] ? irqentry_exit_to_user_mode (??:?)
> > [ 512.115936][ T8305] ? irqentry_exit_to_user_mode (??:?)
> > [ 512.116690][ T8305] do_fast_syscall_32 (??:?)
> > [ 512.117386][ T8305] do_SYSENTER_32 (??:?)
> > [ 512.118022][ T8305] entry_SYSENTER_32 (??:?)
> > [ 512.118645][ T8305] EIP: 0xb7edf539
> > [ 512.119172][ T8305] Code: 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 0f 1f 00 58 b8 77 00 00 00 cd 80 90 0f 1f
> > All code
> > ========
> > 0: 03 74 b4 01 add 0x1(%rsp,%rsi,4),%esi
> > 4: 10 07 adc %al,(%rdi)
> > 6: 03 74 b0 01 add 0x1(%rax,%rsi,4),%esi
> > a: 10 08 adc %cl,(%rax)
> > c: 03 74 d8 01 add 0x1(%rax,%rbx,8),%esi
> > ...
> > 20: 00 51 52 add %dl,0x52(%rcx)
> > 23: 55 push %rbp
> > 24:* 89 e5 mov %esp,%ebp <-- trapping instruction
> > 26: 0f 34 sysenter
> > 28: cd 80 int $0x80
> > 2a: 5d pop %rbp
> > 2b: 5a pop %rdx
> > 2c: 59 pop %rcx
> > 2d: c3 retq
> > 2e: 90 nop
> > 2f: 90 nop
> > 30: 90 nop
> > 31: 90 nop
> > 32: 0f 1f 00 nopl (%rax)
> > 35: 58 pop %rax
> > 36: b8 77 00 00 00 mov $0x77,%eax
> > 3b: cd 80 int $0x80
> > 3d: 90 nop
> > 3e: 0f .byte 0xf
> > 3f: 1f (bad)
> >
> > Code starting with the faulting instruction
> > ===========================================
> > 0: 5d pop %rbp
> > 1: 5a pop %rdx
> > 2: 59 pop %rcx
> > 3: c3 retq
> > 4: 90 nop
> > 5: 90 nop
> > 6: 90 nop
> > 7: 90 nop
> > 8: 0f 1f 00 nopl (%rax)
> > b: 58 pop %rax
> > c: b8 77 00 00 00 mov $0x77,%eax
> > 11: cd 80 int $0x80
> > 13: 90 nop
> > 14: 0f .byte 0xf
> > 15: 1f (bad)
> >
> >
> > The kernel config and materials to reproduce are available at:
> > https://download.01.org/0day-ci/archive/20240130/[email protected]
> >
> >
> >

2024-01-31 12:11:09

by Tiwei Bie

[permalink] [raw]
Subject: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

On 1/30/24 6:13 PM, Abel Wu wrote:
> On 1/30/24 3:24 PM, kernel test robot Wrote:
>>
>>
>> Hello,
>>
>> (besides a previous performance report),
>> kernel test robot noticed "BUG:kernel_NULL_pointer_dereference,address" on:
>>
>> commit: 2227a957e1d5b1941be4e4207879ec74f4bb37f8 ("sched/eevdf: Sort the rbtree by virtual deadline")
>> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
>>
>> [test failed on linus/master 3a5879d495b226d0404098e3564462d5f1daa33b]
>> [test failed on linux-next/master 01af33cc9894b4489fb68fa35c40e9fe85df63dc]
>>
>> in testcase: trinity
>> version: trinity-i386-abe9de86-1_20230429
>> with following parameters:
>>
>>     runtime: 300s
>>     group: group-03
>>     nr_groups: 5
>>
>> test-description: Trinity is a linux system call fuzz tester.
>> test-url: http://codemonkey.org.uk/projects/trinity/
>>
>>
>> compiler: clang-17
>> test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
>>
>> (please refer to attached dmesg/kmsg for entire log/backtrace)
>>
>>
>>
>> we found this issue happens in very random way (23 out of 999 runs).
>> but keeps clean on parent.
>
> Thanks for reporting, I will try to reproduce the issue. Does the 'parent'
> mean the same code branch without this commit?
>
>>
>> 84db47ca7146d7bd 2227a957e1d5b1941be4e420787
>> ---------------- ---------------------------
>>         fail:runs  %reproduction    fail:runs
>>             |             |             |
>>             :999          2%          23:999   dmesg.BUG:kernel_NULL_pointer_dereference,address
>>             :999          2%          23:999   dmesg.Kernel_panic-not_syncing:Fatal_exception
>>             :999          2%          23:999   dmesg.Oops:#[##]
>>
>>
>>
>>
>> If you fix the issue in a separate patch/commit (i.e. not just a new version of
>> the same patch/commit), kindly add following tags
>> | Reported-by: kernel test robot <[email protected]>
>> | Closes: https://lore.kernel.org/oe-lkp/[email protected]
>>
>>
>> sorry for below parse failure which caused no real line numbers.
>> we will follow further. the orgial dmesg could be fetch from below link.
>>
>>
>> [  512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
>> [  512.080897][ T8305] #PF: supervisor read access in kernel mode
>> [  512.081636][ T8305] #PF: error_code(0x0000) - not-present page
>> [  512.082337][ T8305] *pde = 00000000
>> [  512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
>> [  512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G        W        N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
>> [  512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
>> [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
>
> There was actually a NULL-test in pick_eevdf() before this commit,
> but I removed it by intent as I found it impossible to be NULL after
> examining 'all' the cases.
>
> Also cc Tiwei who once proposed to add this check back.
> https://lore.kernel.org/all/[email protected]/

Thanks for cc'ing me. That's the case I worried about and why I thought
it might be worthwhile to add the sanity check back. I just sent out a
new version of the above patch with updated commit log and error message.

https://lore.kernel.org/lkml/[email protected]/

Regards,
Tiwei


>
> Thanks,
>     Abel
>
>> [ 512.086881][ T8305] Code: 89 d6 89 cf 85 d2 74 22 39 77 58 75 1d 89 f0 90 90 90 90 90 8b 48 70 39 41 58 75 0e c7 41 58 00 00 00 00 8b 40 6c 85 c0 75 ea <83> 7e 2c 00 74 28 3e 8d 74 26 00 89 f9 89 f2 e8 36 59 00 00 89 f9
>> All code
>> ========
>>     0:    89 d6                    mov    %edx,%esi
>>     2:    89 cf                    mov    %ecx,%edi
>>     4:    85 d2                    test   %edx,%edx
>>     6:    74 22                    je     0x2a
>>     8:    39 77 58                 cmp    %esi,0x58(%rdi)
>>     b:    75 1d                    jne    0x2a
>>     d:    89 f0                    mov    %esi,%eax
>>     f:    90                       nop
>>    10:    90                       nop
>>    11:    90                       nop
>>    12:    90                       nop
>>    13:    90                       nop
>>    14:    8b 48 70                 mov    0x70(%rax),%ecx
>>    17:    39 41 58                 cmp    %eax,0x58(%rcx)
>>    1a:    75 0e                    jne    0x2a
>>    1c:    c7 41 58 00 00 00 00     movl   $0x0,0x58(%rcx)
>>    23:    8b 40 6c                 mov    0x6c(%rax),%eax
>>    26:    85 c0                    test   %eax,%eax
>>    28:    75 ea                    jne    0x14
>>    2a:*    83 7e 2c 00              cmpl   $0x0,0x2c(%rsi)        <-- trapping instruction
>>    2e:    74 28                    je     0x58
>>    30:    3e 8d 74 26 00           lea    %ds:0x0(%rsi,%riz,1),%esi
>>    35:    89 f9                    mov    %edi,%ecx
>>    37:    89 f2                    mov    %esi,%edx
>>    39:    e8 36 59 00 00           callq  0x5974
>>    3e:    89 f9                    mov    %edi,%ecx
>>
>> Code starting with the faulting instruction
>> ===========================================
>>     0:    83 7e 2c 00              cmpl   $0x0,0x2c(%rsi)
>>     4:    74 28                    je     0x2e
>>     6:    3e 8d 74 26 00           lea    %ds:0x0(%rsi,%riz,1),%esi
>>     b:    89 f9                    mov    %edi,%ecx
>>     d:    89 f2                    mov    %esi,%edx
>>     f:    e8 36 59 00 00           callq  0x594a
>>    14:    89 f9                    mov    %edi,%ecx
>> [  512.089261][ T8305] EAX: 00000000 EBX: e75ff100 ECX: e75ff100 EDX: 00000000
>> [  512.090180][ T8305] ESI: 00000000 EDI: e75ff100 EBP: ed229dd0 ESP: ed229dc0
>> [  512.091097][ T8305] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 EFLAGS: 00010046
>> [  512.092138][ T8305] CR0: 80050033 CR2: 0000002c CR3: 0a508000 CR4: 000406d0
>> [  512.093076][ T8305] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
>> [  512.094004][ T8305] DR6: fffe0ff0 DR7: 00000400
>> [  512.094643][ T8305] Call Trace:
>> [ 512.095145][ T8305] ? __die_body (dumpstack.c:?)
>> [ 512.095755][ T8305] ? __die (??:?)
>> [ 512.096309][ T8305] ? page_fault_oops (fault.c:?)
>> [ 512.096994][ T8305] ? __lock_acquire (lockdep.c:?)
>> [ 512.097677][ T8305] ? kmemleak_alloc (??:?)
>> [ 512.098343][ T8305] ? kernelmode_fixup_or_oops (fault.c:?)
>> [ 512.099085][ T8305] ? __bad_area_nosemaphore (fault.c:?)
>> [ 512.099839][ T8305] ? bad_area_nosemaphore (fault.c:?)
>> [ 512.100579][ T8305] ? do_user_addr_fault (fault.c:?)
>> [ 512.101325][ T8305] ? exc_page_fault (??:?)
>> [ 512.101992][ T8305] ? pvclock_clocksource_read_nowd (??:?)
>> [ 512.102799][ T8305] ? handle_exception (init_task.c:?)
>> [ 512.103492][ T8305] ? yield_to_task_fair (fair.c:?)
>> [ 512.104166][ T8305] ? pvclock_clocksource_read_nowd (??:?)
>> [ 512.104975][ T8305] ? set_next_entity (fair.c:?)
>> [ 512.105655][ T8305] ? pvclock_clocksource_read_nowd (??:?)
>> [ 512.106447][ T8305] ? set_next_entity (fair.c:?)
>> [ 512.107106][ T8305] pick_next_task_fair (??:?)
>> [ 512.107788][ T8305] ? put_prev_task_rt (build_policy.c:?)
>> [ 512.108460][ T8305] __pick_next_task_fair (fair.c:?)
>> [ 512.109162][ T8305] __schedule (core.c:?)
>> [ 512.109753][ T8305] schedule (??:?)
>> [ 512.110284][ T8305] do_nanosleep (hrtimer.c:?)
>> [ 512.110900][ T8305] hrtimer_nanosleep (??:?)
>> [ 512.111581][ T8305] ? __remove_hrtimer (hrtimer.c:?)
>> [ 512.112264][ T8305] common_nsleep (posix-timers.c:?)
>> [ 512.112891][ T8305] __ia32_sys_clock_nanosleep (??:?)
>> [ 512.113657][ T8305] ? syscall_enter_from_user_mode_work (??:?)
>> [ 512.114486][ T8305] __do_fast_syscall_32 (common.c:?)
>> [ 512.115180][ T8305] ? irqentry_exit_to_user_mode (??:?)
>> [ 512.115936][ T8305] ? irqentry_exit_to_user_mode (??:?)
>> [ 512.116690][ T8305] do_fast_syscall_32 (??:?)
>> [ 512.117386][ T8305] do_SYSENTER_32 (??:?)
>> [ 512.118022][ T8305] entry_SYSENTER_32 (??:?)
>> [  512.118645][ T8305] EIP: 0xb7edf539
>> [ 512.119172][ T8305] Code: 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 0f 1f 00 58 b8 77 00 00 00 cd 80 90 0f 1f
>> All code
>> ========
>>     0:    03 74 b4 01              add    0x1(%rsp,%rsi,4),%esi
>>     4:    10 07                    adc    %al,(%rdi)
>>     6:    03 74 b0 01              add    0x1(%rax,%rsi,4),%esi
>>     a:    10 08                    adc    %cl,(%rax)
>>     c:    03 74 d8 01              add    0x1(%rax,%rbx,8),%esi
>>     ...
>>    20:    00 51 52                 add    %dl,0x52(%rcx)
>>    23:    55                       push   %rbp
>>    24:*    89 e5                    mov    %esp,%ebp        <-- trapping instruction
>>    26:    0f 34                    sysenter
>>    28:    cd 80                    int    $0x80
>>    2a:    5d                       pop    %rbp
>>    2b:    5a                       pop    %rdx
>>    2c:    59                       pop    %rcx
>>    2d:    c3                       retq
>>    2e:    90                       nop
>>    2f:    90                       nop
>>    30:    90                       nop
>>    31:    90                       nop
>>    32:    0f 1f 00                 nopl   (%rax)
>>    35:    58                       pop    %rax
>>    36:    b8 77 00 00 00           mov    $0x77,%eax
>>    3b:    cd 80                    int    $0x80
>>    3d:    90                       nop
>>    3e:    0f                       .byte 0xf
>>    3f:    1f                       (bad)
>>
>> Code starting with the faulting instruction
>> ===========================================
>>     0:    5d                       pop    %rbp
>>     1:    5a                       pop    %rdx
>>     2:    59                       pop    %rcx
>>     3:    c3                       retq
>>     4:    90                       nop
>>     5:    90                       nop
>>     6:    90                       nop
>>     7:    90                       nop
>>     8:    0f 1f 00                 nopl   (%rax)
>>     b:    58                       pop    %rax
>>     c:    b8 77 00 00 00           mov    $0x77,%eax
>>    11:    cd 80                    int    $0x80
>>    13:    90                       nop
>>    14:    0f                       .byte 0xf
>>    15:    1f                       (bad)
>>
>>
>> The kernel config and materials to reproduce are available at:
>> https://download.01.org/0day-ci/archive/20240130/[email protected]
>>
>>
>>


2024-01-31 12:34:29

by Abel Wu

[permalink] [raw]
Subject: Re: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

On 1/30/24 10:09 PM, Oliver Sang Wrote:
> hi, Abel,
>
> On Tue, Jan 30, 2024 at 06:13:32PM +0800, Abel Wu wrote:
>> On 1/30/24 3:24 PM, kernel test robot Wrote:
>>>
>>> we found this issue happens in very random way (23 out of 999 runs).
>>> but keeps clean on parent.
>>
>> Thanks for reporting, I will try to reproduce the issue. Does the 'parent'
>> mean the same code branch without this commit?
>
>
> it just means the parent in git log.
>
> xsang@inn:/c/repo/linux$ git log --oneline --graph 2227a957e1
> * 2227a957e1d5b sched/eevdf: Sort the rbtree by virtual deadline
> * 84db47ca7146d sched/numa: Fix mm numa_scan_seq based unconditional scan <-- parent

Thanks for clarifying, Oliver. And can you please check the parent's
dmesg for the log:

pr_err("EEVDF scheduling fail, picking leftmost\n");

which potentially turns a kernel panic to a message.

2024-01-31 13:05:14

by Abel Wu

[permalink] [raw]
Subject: Re: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

On 1/31/24 8:10 PM, Tiwei Bie Wrote:
> On 1/30/24 6:13 PM, Abel Wu wrote:
>> On 1/30/24 3:24 PM, kernel test robot Wrote:
>>>
>>> [  512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
>>> [  512.080897][ T8305] #PF: supervisor read access in kernel mode
>>> [  512.081636][ T8305] #PF: error_code(0x0000) - not-present page
>>> [  512.082337][ T8305] *pde = 00000000
>>> [  512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
>>> [  512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G        W        N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
>>> [  512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
>>> [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
>>
>> There was actually a NULL-test in pick_eevdf() before this commit,
>> but I removed it by intent as I found it impossible to be NULL after
>> examining 'all' the cases.
>>
>> Also cc Tiwei who once proposed to add this check back.
>> https://lore.kernel.org/all/[email protected]/
>
> Thanks for cc'ing me. That's the case I worried about and why I thought
> it might be worthwhile to add the sanity check back. I just sent out a
> new version of the above patch with updated commit log and error message.

I assuming the real problem is why it *can* be NULL at first place.
IMHO the NULL check with a fallback selection doesn't solve this, but
it indeed avoids kernel panic which is absolutely important.

2024-02-19 12:36:02

by Chen Yu

[permalink] [raw]
Subject: Re: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

On 2024-01-31 at 20:28:19 +0800, Abel Wu wrote:
> On 1/31/24 8:10 PM, Tiwei Bie Wrote:
> > On 1/30/24 6:13 PM, Abel Wu wrote:
> > > On 1/30/24 3:24 PM, kernel test robot Wrote:
> > > >
> > > > [? 512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
> > > > [? 512.080897][ T8305] #PF: supervisor read access in kernel mode
> > > > [? 512.081636][ T8305] #PF: error_code(0x0000) - not-present page
> > > > [? 512.082337][ T8305] *pde = 00000000
> > > > [? 512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
> > > > [? 512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G??????? W??????? N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
> > > > [? 512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> > > > [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
> > >
> > > There was actually a NULL-test in pick_eevdf() before this commit,
> > > but I removed it by intent as I found it impossible to be NULL after
> > > examining 'all' the cases.
> > >
> > > Also cc Tiwei who once proposed to add this check back.
> > > https://lore.kernel.org/all/[email protected]/
> >
> > Thanks for cc'ing me. That's the case I worried about and why I thought
> > it might be worthwhile to add the sanity check back. I just sent out a
> > new version of the above patch with updated commit log and error message.
>
> I assuming the real problem is why it *can* be NULL at first place.
> IMHO the NULL check with a fallback selection doesn't solve this, but
> it indeed avoids kernel panic which is absolutely important.

I tried to reproduce this issue locally but with no luck. It might
be related to lkp's environment so I'm thinking of adding the following
change into lkp's test repo to help narrow down: when the pick_eevdf() finds
an NULL candidate, print the whole rb-tree, including each entity's vruntime,
min_vruntime, deadline, etc. So we can figure out why we failed to find a proper
entity, thoughts?

Example output:

[ 325.425017] cfs_rq avg_vruntime:-798130176 avg_load:8192 min_vruntime:29296269335
[ 325.425247] current se 0xff11000105858080 is on_rq, deadline:29299081075 min_vruntime:29296537185 vruntime:29297356572, load:1024
[ 325.425514] Traverse rb-tree from left to right
[ 325.425635] se 0xff11000101a28080 deadline:29293835648 min_vruntime:29290835648 vruntime:29290835648 eligible
[ 325.425876] se 0xff11000113113000 deadline:29298566205 min_vruntime:29290835648 vruntime:29297726206 non-eligible
[ 325.426121] se 0xff110001036cb000 deadline:29298613981 min_vruntime:29290835648 vruntime:29297094477 non-eligible
[ 325.426358] se 0xff11000118148080 deadline:29298616409 min_vruntime:29297255015 vruntime:29297255015 non-eligible
[ 325.426597] se 0xff11000111523000 deadline:29299066492 min_vruntime:29290835648 vruntime:29296949653 non-eligible
[ 325.426836] se 0xff11000106173000 deadline:29299152426 min_vruntime:29296537185 vruntime:29296537185 non-eligible
[ 325.427085] se 0xff11000105f03000 deadline:29299269335 min_vruntime:29296269335 vruntime:29296269335 eligible
[ 325.427322] se 0xff110001043c0080 deadline:29299707737 min_vruntime:29296707737 vruntime:29296707737 non-eligible
[ 325.427558] Traverse rb-tree from topdown
[ 325.427648] left se 0xff110001036cb000 deadline:29298613981 min_vruntime:29290835648 vruntime:29297094477 left half eligible
[ 325.427917] left se 0xff11000113113000 deadline:29298566205 min_vruntime:29290835648 vruntime:29297726206 left half eligible
[ 325.428171] left se 0xff11000101a28080 deadline:29293835648 min_vruntime:29290835648 vruntime:29290835648 left half eligible
[ 325.428440] middle se 0xff11000101a28080 deadline:29293835648 min_vruntime:29290835648 vruntime:29290835648 eligible
[ 325.428679] Found best:0xff11000101a28080


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 533547e3c90a..ddacb3dc91b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -856,6 +856,69 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
return __node_2_se(left);
}

+static void debug_print_eevdf(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se, *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct rb_node *n;
+
+ printk("cfs_rq avg_vruntime:%lld avg_load:%lld min_vruntime:%lld\n",
+ cfs_rq->avg_vruntime, cfs_rq->avg_load, cfs_rq->min_vruntime);
+
+ if (curr && curr->on_rq)
+ printk("current on_rq se 0x%lx, deadline:%lld min_vruntime:%lld vruntime:%lld, load:%ld\n",
+ (unsigned long)curr, curr->deadline, curr->min_vruntime,
+ curr->vruntime, scale_load_down(curr->load.weight));
+
+ printk("Traverse rb-tree from left to right\n");
+ for (n = rb_first(&cfs_rq->tasks_timeline.rb_root); n != NULL; n = rb_next(n)) {
+ bool eli;
+
+ se = __node_2_se(n);
+ eli = entity_eligible(cfs_rq, se);
+ printk(" se 0x%lx deadline:%lld min_vruntime:%lld vruntime:%lld %s\n",
+ (unsigned long)se, se->deadline, se->min_vruntime, se->vruntime,
+ eli ? "eligible" : "non-eligible");
+ }
+ printk("Traverse rb-tree from topdown\n");
+ while (node) {
+ struct rb_node *left = node->rb_left;
+ struct sched_entity *se_left = __node_2_se(left);
+ bool eli = false;
+
+ if (left) {
+ eli = vruntime_eligible(cfs_rq,
+ __node_2_se(left)->min_vruntime);
+
+ printk(" left se 0x%lx deadline:%lld min_vruntime:%lld vruntime:%lld %s\n",
+ (unsigned long)se_left, se_left->deadline, se_left->min_vruntime,
+ se_left->vruntime,
+ eli ? "left half eligible" : "left half non-eligible");
+ }
+
+ if (left && eli) {
+ node = left;
+ continue;
+ }
+
+ se = __node_2_se(node);
+
+ eli = entity_eligible(cfs_rq, se);
+ printk(" middle se 0x%lx deadline:%lld min_vruntime:%lld vruntime:%lld %s\n",
+ (unsigned long)se, se->deadline, se->min_vruntime, se->vruntime,
+ eli ? "eligible" : "non-eligible");
+
+ if (eli) {
+ best = se;
+ break;
+ }
+
+ node = node->rb_right;
+ }
+ printk("Found best:0x%lx\n", (unsigned long)best);
+}
+
/*
* Earliest Eligible Virtual Deadline First
*
@@ -937,6 +1000,9 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (!best || (curr && entity_before(curr, best)))
best = curr;

+ if (unlikely(!best))
+ debug_print_eevdf(cfs_rq);
+
return best;
}

--
2.34.1

thanks,
Chenyu

2024-02-19 12:50:33

by Chen Yu

[permalink] [raw]
Subject: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

On 2024-01-30 at 18:13:32 +0800, Abel Wu wrote:
> On 1/30/24 3:24 PM, kernel test robot Wrote:
> >
> >
> > Hello,
> >
> > (besides a previous performance report),
> > kernel test robot noticed "BUG:kernel_NULL_pointer_dereference,address" on:
> >
> > commit: 2227a957e1d5b1941be4e4207879ec74f4bb37f8 ("sched/eevdf: Sort the rbtree by virtual deadline")
> > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
> >
> > [test failed on linus/master 3a5879d495b226d0404098e3564462d5f1daa33b]
> > [test failed on linux-next/master 01af33cc9894b4489fb68fa35c40e9fe85df63dc]
> >
> > in testcase: trinity
> > version: trinity-i386-abe9de86-1_20230429
> > with following parameters:
> >
> > runtime: 300s
> > group: group-03
> > nr_groups: 5
> >
> > test-description: Trinity is a linux system call fuzz tester.
> > test-url: http://codemonkey.org.uk/projects/trinity/
> >
> >
> > compiler: clang-17
> > test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
> >
> > (please refer to attached dmesg/kmsg for entire log/backtrace)
> >
> >
> >
> > we found this issue happens in very random way (23 out of 999 runs).
> > but keeps clean on parent.
>
> Thanks for reporting, I will try to reproduce the issue. Does the 'parent'
> mean the same code branch without this commit?
>
> >
> > 84db47ca7146d7bd 2227a957e1d5b1941be4e420787
> > ---------------- ---------------------------
> > fail:runs %reproduction fail:runs
> > | | |
> > :999 2% 23:999 dmesg.BUG:kernel_NULL_pointer_dereference,address
> > :999 2% 23:999 dmesg.Kernel_panic-not_syncing:Fatal_exception
> > :999 2% 23:999 dmesg.Oops:#[##]
> >
> >
> >
> >
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <[email protected]>
> > | Closes: https://lore.kernel.org/oe-lkp/[email protected]
> >
> >
> > sorry for below parse failure which caused no real line numbers.
> > we will follow further. the orgial dmesg could be fetch from below link.
> >
> >
> > [ 512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
> > [ 512.080897][ T8305] #PF: supervisor read access in kernel mode
> > [ 512.081636][ T8305] #PF: error_code(0x0000) - not-present page
> > [ 512.082337][ T8305] *pde = 00000000
> > [ 512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
> > [ 512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G W N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
> > [ 512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> > [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
>
> There was actually a NULL-test in pick_eevdf() before this commit,
> but I removed it by intent as I found it impossible to be NULL after
> examining 'all' the cases.
>
> Also cc Tiwei who once proposed to add this check back.
> https://lore.kernel.org/all/[email protected]/
>
> Thanks,
> Abel
>

While looking at pick_eevdf(), I have a thought.
Currently the sched entity is sorted by their deadline. During task
pickup, the pick_eevdf() scans for an candidate sched entity with the
smallest deadline. Meanwhile this candidate sched entity must also be
eligible.

The scan is O(lgn) on average, and O(1) at best case. How about making the
average scan even faster by sorting the sched entity not only by deadline,
but also the eligibility? The idea is that, the eligible sched entity with
smaller deadline is sorted at the front the tree. Otherwise, if the entity
is not eligible, even if it has a smaller deadline, it should be sorted
at the end of the tree.

After the change, pick_eevdf() get the leftmost sched entity at O(1) on
average. Besides, it is guaranteed to return non-NULL sched entity in
pick_eevdf(), which prevents suspicious NULL pointer exception in pick_eevdf().

For example, suppose there are two sched entities to be queued, se_a and se_b.
Consider their eligibility and deadline, there are 6 combination:

1. se_a is eligible, se_b is eligible, se_a.deadline < se_b.deadline
2. se_a is eligible, se_b is eligible, se_a.deadline >= se_b.deadline
3. se_a is eligible, se_b is not eligible
4. se_a is not eligible, se_b is eligible
5. se_a is not eligible, se_b is not eligible, se_a.deadline < se_b.deadline
6. se_a is not eligible, se_b is not eligible, se_a.deadline >= se_b.deadline

In scenario 1, 3, 5, sched_entity se_a should be sorted before se_b,
so pick_eevdf() would pick se_a first.

When enqueuing a new sched entity, it is regarded as eligible if its
vlag is positive. In theory later in pick_eevdf(), the eligibility
of this sched entity should be re-checked via entity_eligible(). But
consider if the sched entity is eliglble when enqueued, it is very
likely the same sched entity remains eligible when pick_eevdf(), because
the V keeps moving forward but the vruntime of this sched entity remain
unchanged - the vlag could get larger.

Something like this untested:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 533547e3c90a..831043cc1432 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -551,11 +551,19 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
static inline bool entity_before(const struct sched_entity *a,
const struct sched_entity *b)
{
- /*
- * Tiebreak on vruntime seems unnecessary since it can
- * hardly happen.
- */
- return (s64)(a->deadline - b->deadline) < 0;
+ bool eli_a, eli_b;
+
+ eli_a = (a->vlag >= 0) ? true : false;
+ eli_b = (b->vlag >= 0) ? true : false;
+
+ if ((eli_a && eli_b) || (!eli_a && !eli_b))
+ /*
+ * Tiebreak on vruntime seems unnecessary since it can
+ * hardly happen.
+ */
+ return (s64)(a->deadline - b->deadline) < 0;
+
+ return eli_a ? 1 : 0;
}

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -877,10 +885,8 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
*/
static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
- struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
- struct sched_entity *best = NULL;

/*
* We can safely skip eligibility check if there is only one entity
@@ -899,45 +905,8 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
return curr;

- /* Pick the leftmost entity if it's eligible */
- if (se && entity_eligible(cfs_rq, se)) {
- best = se;
- goto found;
- }
-
- /* Heap search for the EEVD entity */
- while (node) {
- struct rb_node *left = node->rb_left;
-
- /*
- * Eligible entities in left subtree are always better
- * choices, since they have earlier deadlines.
- */
- if (left && vruntime_eligible(cfs_rq,
- __node_2_se(left)->min_vruntime)) {
- node = left;
- continue;
- }
-
- se = __node_2_se(node);
-
- /*
- * The left subtree either is empty or has no eligible
- * entity, so check the current node since it is the one
- * with earliest deadline that might be eligible.
- */
- if (entity_eligible(cfs_rq, se)) {
- best = se;
- break;
- }
-
- node = node->rb_right;
- }
-found:
- if (!best || (curr && entity_before(curr, best)))
- best = curr;
-
- return best;
+ /* Pick the leftmost entity */
+ return se;
}

#ifdef CONFIG_SCHED_DEBUG
--
2.25.1


2024-02-20 03:40:08

by Abel Wu

[permalink] [raw]
Subject: Re: Re: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

Hi Chen,

On 2/19/24 8:35 PM, Chen Yu Wrote:
> On 2024-01-31 at 20:28:19 +0800, Abel Wu wrote:
>> On 1/31/24 8:10 PM, Tiwei Bie Wrote:
>>> On 1/30/24 6:13 PM, Abel Wu wrote:
>>>> On 1/30/24 3:24 PM, kernel test robot Wrote:
>>>>>
>>>>> [  512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
>>>>> [  512.080897][ T8305] #PF: supervisor read access in kernel mode
>>>>> [  512.081636][ T8305] #PF: error_code(0x0000) - not-present page
>>>>> [  512.082337][ T8305] *pde = 00000000
>>>>> [  512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
>>>>> [  512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G        W        N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
>>>>> [  512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
>>>>> [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
>>>>
>>>> There was actually a NULL-test in pick_eevdf() before this commit,
>>>> but I removed it by intent as I found it impossible to be NULL after
>>>> examining 'all' the cases.
>>>>
>>>> Also cc Tiwei who once proposed to add this check back.
>>>> https://lore.kernel.org/all/[email protected]/
>>>
>>> Thanks for cc'ing me. That's the case I worried about and why I thought
>>> it might be worthwhile to add the sanity check back. I just sent out a
>>> new version of the above patch with updated commit log and error message.
>>
>> I assuming the real problem is why it *can* be NULL at first place.
>> IMHO the NULL check with a fallback selection doesn't solve this, but
>> it indeed avoids kernel panic which is absolutely important.
>
> I tried to reproduce this issue locally but with no luck. It might

Me either, with a x86 machine. Staring at the dmesg provided by Oliver,
I guess the bug is arch-specific.

> be related to lkp's environment so I'm thinking of adding the following
> change into lkp's test repo to help narrow down: when the pick_eevdf() finds
> an NULL candidate, print the whole rb-tree, including each entity's vruntime,
> min_vruntime, deadline, etc. So we can figure out why we failed to find a proper
> entity, thoughts?

It helps, but not in all ways IMHO. There might be someone else manipulating
the rbtree under improper locking, and if it is the case, the time of debug
could be late.


2024-02-20 04:04:58

by Abel Wu

[permalink] [raw]
Subject: Re: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

On 2/19/24 8:49 PM, Chen Yu Wrote:
>
> While looking at pick_eevdf(), I have a thought.
> Currently the sched entity is sorted by their deadline. During task
> pickup, the pick_eevdf() scans for an candidate sched entity with the
> smallest deadline. Meanwhile this candidate sched entity must also be
> eligible.
>
> The scan is O(lgn) on average, and O(1) at best case. How about making the
> average scan even faster by sorting the sched entity not only by deadline,
> but also the eligibility? The idea is that, the eligible sched entity with
> smaller deadline is sorted at the front the tree. Otherwise, if the entity
> is not eligible, even if it has a smaller deadline, it should be sorted
> at the end of the tree.

Eligibility is dynamic due to the nature of weighted average vruntime.
IIUC if doing so like above, update_curr() should take the responsibility
to re-sort the tree which seems to be O(logN).

>
> After the change, pick_eevdf() get the leftmost sched entity at O(1) on
> average. Besides, it is guaranteed to return non-NULL sched entity in
> pick_eevdf(), which prevents suspicious NULL pointer exception in pick_eevdf().

It is guaranteed when doing pick that the rbtree is non-NULL, and given
that rq lock is held, I don't think the bug is inside pick_eevdf().

>
> For example, suppose there are two sched entities to be queued, se_a and se_b.
> Consider their eligibility and deadline, there are 6 combination:
>
> 1. se_a is eligible, se_b is eligible, se_a.deadline < se_b.deadline
> 2. se_a is eligible, se_b is eligible, se_a.deadline >= se_b.deadline
> 3. se_a is eligible, se_b is not eligible
> 4. se_a is not eligible, se_b is eligible
> 5. se_a is not eligible, se_b is not eligible, se_a.deadline < se_b.deadline
> 6. se_a is not eligible, se_b is not eligible, se_a.deadline >= se_b.deadline
>
> In scenario 1, 3, 5, sched_entity se_a should be sorted before se_b,
> so pick_eevdf() would pick se_a first.
>
> When enqueuing a new sched entity, it is regarded as eligible if its
> vlag is positive. In theory later in pick_eevdf(), the eligibility
> of this sched entity should be re-checked via entity_eligible(). But
> consider if the sched entity is eliglble when enqueued, it is very
> likely the same sched entity remains eligible when pick_eevdf(), because
> the V keeps moving forward but the vruntime of this sched entity remain
> unchanged - the vlag could get larger.
>
> Something like this untested:
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 533547e3c90a..831043cc1432 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -551,11 +551,19 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
> static inline bool entity_before(const struct sched_entity *a,
> const struct sched_entity *b)
> {
> - /*
> - * Tiebreak on vruntime seems unnecessary since it can
> - * hardly happen.
> - */
> - return (s64)(a->deadline - b->deadline) < 0;
> + bool eli_a, eli_b;
> +
> + eli_a = (a->vlag >= 0) ? true : false;
> + eli_b = (b->vlag >= 0) ? true : false;
> +
> + if ((eli_a && eli_b) || (!eli_a && !eli_b))
> + /*
> + * Tiebreak on vruntime seems unnecessary since it can
> + * hardly happen.
> + */
> + return (s64)(a->deadline - b->deadline) < 0;
> +
> + return eli_a ? 1 : 0;
> }
>
> static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
> @@ -877,10 +885,8 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
> */
> static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> {
> - struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
> struct sched_entity *se = __pick_first_entity(cfs_rq);
> struct sched_entity *curr = cfs_rq->curr;
> - struct sched_entity *best = NULL;
>
> /*
> * We can safely skip eligibility check if there is only one entity
> @@ -899,45 +905,8 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
> return curr;
>
> - /* Pick the leftmost entity if it's eligible */
> - if (se && entity_eligible(cfs_rq, se)) {
> - best = se;
> - goto found;
> - }
> -
> - /* Heap search for the EEVD entity */
> - while (node) {
> - struct rb_node *left = node->rb_left;
> -
> - /*
> - * Eligible entities in left subtree are always better
> - * choices, since they have earlier deadlines.
> - */
> - if (left && vruntime_eligible(cfs_rq,
> - __node_2_se(left)->min_vruntime)) {
> - node = left;
> - continue;
> - }
> -
> - se = __node_2_se(node);
> -
> - /*
> - * The left subtree either is empty or has no eligible
> - * entity, so check the current node since it is the one
> - * with earliest deadline that might be eligible.
> - */
> - if (entity_eligible(cfs_rq, se)) {
> - best = se;
> - break;
> - }
> -
> - node = node->rb_right;
> - }
> -found:
> - if (!best || (curr && entity_before(curr, best)))
> - best = curr;
> -
> - return best;
> + /* Pick the leftmost entity */
> + return se;
> }
>
> #ifdef CONFIG_SCHED_DEBUG

2024-02-20 09:19:12

by Chen Yu

[permalink] [raw]
Subject: Re: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

On 2024-02-20 at 12:04:33 +0800, Abel Wu wrote:
> On 2/19/24 8:49 PM, Chen Yu Wrote:
> >
> > While looking at pick_eevdf(), I have a thought.
> > Currently the sched entity is sorted by their deadline. During task
> > pickup, the pick_eevdf() scans for an candidate sched entity with the
> > smallest deadline. Meanwhile this candidate sched entity must also be
> > eligible.
> >
> > The scan is O(lgn) on average, and O(1) at best case. How about making the
> > average scan even faster by sorting the sched entity not only by deadline,
> > but also the eligibility? The idea is that, the eligible sched entity with
> > smaller deadline is sorted at the front the tree. Otherwise, if the entity
> > is not eligible, even if it has a smaller deadline, it should be sorted
> > at the end of the tree.
>
> Eligibility is dynamic due to the nature of weighted average vruntime.
> IIUC if doing so like above, update_curr() should take the responsibility
> to re-sort the tree which seems to be O(logN).
>
> >
> > After the change, pick_eevdf() get the leftmost sched entity at O(1) on
> > average. Besides, it is guaranteed to return non-NULL sched entity in
> > pick_eevdf(), which prevents suspicious NULL pointer exception in pick_eevdf().
>
> It is guaranteed when doing pick that the rbtree is non-NULL, and given
> that rq lock is held, I don't think the bug is inside pick_eevdf().
>

That's true, my suspect is that although the tree is not NULL, the eligible
check might return false negative thus no candidate is chosen, and pick_eevdf()
is the victim of that. Previously commit 8dafa9d0eb1a ("sched/eevdf: Fix min_deadline
heap integrity") did similar thing to fix the min_deadline. Without this change,
the tree might be traversed incorrectly and found a NULL candidate. And this is
why I'm thinking of printing the whole tree when NULL entity is chosen in
pick_eevdf().

thanks,
Chenyu

2024-02-21 02:17:24

by Oliver Sang

[permalink] [raw]
Subject: Re: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

hi, Chen Yu,

On Mon, Feb 19, 2024 at 08:35:21PM +0800, Chen, Yu C wrote:
> On 2024-01-31 at 20:28:19 +0800, Abel Wu wrote:
> > On 1/31/24 8:10 PM, Tiwei Bie Wrote:
> > > On 1/30/24 6:13 PM, Abel Wu wrote:
> > > > On 1/30/24 3:24 PM, kernel test robot Wrote:
> > > > >
> > > > > [? 512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
> > > > > [? 512.080897][ T8305] #PF: supervisor read access in kernel mode
> > > > > [? 512.081636][ T8305] #PF: error_code(0x0000) - not-present page
> > > > > [? 512.082337][ T8305] *pde = 00000000
> > > > > [? 512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
> > > > > [? 512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G??????? W??????? N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
> > > > > [? 512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> > > > > [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
> > > >
> > > > There was actually a NULL-test in pick_eevdf() before this commit,
> > > > but I removed it by intent as I found it impossible to be NULL after
> > > > examining 'all' the cases.
> > > >
> > > > Also cc Tiwei who once proposed to add this check back.
> > > > https://lore.kernel.org/all/[email protected]/
> > >
> > > Thanks for cc'ing me. That's the case I worried about and why I thought
> > > it might be worthwhile to add the sanity check back. I just sent out a
> > > new version of the above patch with updated commit log and error message.
> >
> > I assuming the real problem is why it *can* be NULL at first place.
> > IMHO the NULL check with a fallback selection doesn't solve this, but
> > it indeed avoids kernel panic which is absolutely important.
>
> I tried to reproduce this issue locally but with no luck. It might
> be related to lkp's environment so I'm thinking of adding the following
> change into lkp's test repo to help narrow down: when the pick_eevdf() finds
> an NULL candidate, print the whole rb-tree, including each entity's vruntime,
> min_vruntime, deadline, etc. So we can figure out why we failed to find a proper
> entity, thoughts?
>
> Example output:
>
> [ 325.425017] cfs_rq avg_vruntime:-798130176 avg_load:8192 min_vruntime:29296269335
> [ 325.425247] current se 0xff11000105858080 is on_rq, deadline:29299081075 min_vruntime:29296537185 vruntime:29297356572, load:1024
> [ 325.425514] Traverse rb-tree from left to right
> [ 325.425635] se 0xff11000101a28080 deadline:29293835648 min_vruntime:29290835648 vruntime:29290835648 eligible
> [ 325.425876] se 0xff11000113113000 deadline:29298566205 min_vruntime:29290835648 vruntime:29297726206 non-eligible
> [ 325.426121] se 0xff110001036cb000 deadline:29298613981 min_vruntime:29290835648 vruntime:29297094477 non-eligible
> [ 325.426358] se 0xff11000118148080 deadline:29298616409 min_vruntime:29297255015 vruntime:29297255015 non-eligible
> [ 325.426597] se 0xff11000111523000 deadline:29299066492 min_vruntime:29290835648 vruntime:29296949653 non-eligible
> [ 325.426836] se 0xff11000106173000 deadline:29299152426 min_vruntime:29296537185 vruntime:29296537185 non-eligible
> [ 325.427085] se 0xff11000105f03000 deadline:29299269335 min_vruntime:29296269335 vruntime:29296269335 eligible
> [ 325.427322] se 0xff110001043c0080 deadline:29299707737 min_vruntime:29296707737 vruntime:29296707737 non-eligible
> [ 325.427558] Traverse rb-tree from topdown
> [ 325.427648] left se 0xff110001036cb000 deadline:29298613981 min_vruntime:29290835648 vruntime:29297094477 left half eligible
> [ 325.427917] left se 0xff11000113113000 deadline:29298566205 min_vruntime:29290835648 vruntime:29297726206 left half eligible
> [ 325.428171] left se 0xff11000101a28080 deadline:29293835648 min_vruntime:29290835648 vruntime:29290835648 left half eligible
> [ 325.428440] middle se 0xff11000101a28080 deadline:29293835648 min_vruntime:29290835648 vruntime:29290835648 eligible
> [ 325.428679] Found best:0xff11000101a28080

I applied below patch upon 2227a957e1, captured such like:

[ 514.461242][ T8390] cfs_rq avg_vruntime:386638640128 avg_load:2048 min_vruntime:763383370431
[ 514.461431][ T8390]

..

[ 514.535935][ T8390] current on_rq se 0xc5851400, deadline:-10892060147320170 min_vruntime:-9622957955883918 vruntime:-10892060147608212, load:629
[ 514.536772][ T8390] Traverse rb-tree from left to right
[ 514.537138][ T8390] se 0xec1234e0 deadline:763384870431 min_vruntime:763383370431 vruntime:763383370431 non-eligible
[ 514.537835][ T8390] se 0xec4fcf20 deadline:763762447228 min_vruntime:763760947228 vruntime:763760947228 non-eligible
[ 514.538539][ T8390] Traverse rb-tree from topdown
[ 514.538877][ T8390] middle se 0xec1234e0 deadline:763384870431 min_vruntime:763383370431 vruntime:763383370431 non-eligible
[ 514.539605][ T8390] middle se 0xec4fcf20 deadline:763762447228 min_vruntime:763760947228 vruntime:763760947228 non-eligible
[ 514.540340][ T8390] Found best:0x0
[ 514.540613][ T8390] BUG: kernel NULL pointer dereference, address: 00000074


more details are in attached dmesg-debug.xz


>
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 533547e3c90a..ddacb3dc91b9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -856,6 +856,69 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
> return __node_2_se(left);
> }
>
> +static void debug_print_eevdf(struct cfs_rq *cfs_rq)
> +{
> + struct sched_entity *se, *curr = cfs_rq->curr;
> + struct sched_entity *best = NULL;
> + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
> + struct rb_node *n;
> +
> + printk("cfs_rq avg_vruntime:%lld avg_load:%lld min_vruntime:%lld\n",
> + cfs_rq->avg_vruntime, cfs_rq->avg_load, cfs_rq->min_vruntime);
> +
> + if (curr && curr->on_rq)
> + printk("current on_rq se 0x%lx, deadline:%lld min_vruntime:%lld vruntime:%lld, load:%ld\n",
> + (unsigned long)curr, curr->deadline, curr->min_vruntime,
> + curr->vruntime, scale_load_down(curr->load.weight));
> +
> + printk("Traverse rb-tree from left to right\n");
> + for (n = rb_first(&cfs_rq->tasks_timeline.rb_root); n != NULL; n = rb_next(n)) {
> + bool eli;
> +
> + se = __node_2_se(n);
> + eli = entity_eligible(cfs_rq, se);
> + printk(" se 0x%lx deadline:%lld min_vruntime:%lld vruntime:%lld %s\n",
> + (unsigned long)se, se->deadline, se->min_vruntime, se->vruntime,
> + eli ? "eligible" : "non-eligible");
> + }
> + printk("Traverse rb-tree from topdown\n");
> + while (node) {
> + struct rb_node *left = node->rb_left;
> + struct sched_entity *se_left = __node_2_se(left);
> + bool eli = false;
> +
> + if (left) {
> + eli = vruntime_eligible(cfs_rq,
> + __node_2_se(left)->min_vruntime);
> +
> + printk(" left se 0x%lx deadline:%lld min_vruntime:%lld vruntime:%lld %s\n",
> + (unsigned long)se_left, se_left->deadline, se_left->min_vruntime,
> + se_left->vruntime,
> + eli ? "left half eligible" : "left half non-eligible");
> + }
> +
> + if (left && eli) {
> + node = left;
> + continue;
> + }
> +
> + se = __node_2_se(node);
> +
> + eli = entity_eligible(cfs_rq, se);
> + printk(" middle se 0x%lx deadline:%lld min_vruntime:%lld vruntime:%lld %s\n",
> + (unsigned long)se, se->deadline, se->min_vruntime, se->vruntime,
> + eli ? "eligible" : "non-eligible");
> +
> + if (eli) {
> + best = se;
> + break;
> + }
> +
> + node = node->rb_right;
> + }
> + printk("Found best:0x%lx\n", (unsigned long)best);
> +}
> +
> /*
> * Earliest Eligible Virtual Deadline First
> *
> @@ -937,6 +1000,9 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> if (!best || (curr && entity_before(curr, best)))
> best = curr;
>
> + if (unlikely(!best))
> + debug_print_eevdf(cfs_rq);
> +
> return best;
> }
>
> --
> 2.34.1
>
> thanks,
> Chenyu


Attachments:
(No filename) (7.89 kB)
dmesg-debug.xz (57.83 kB)
Download all attachments

2024-02-21 02:33:01

by Oliver Sang

[permalink] [raw]
Subject: Re: [linus:master] [sched/eevdf] 2227a957e1: BUG:kernel_NULL_pointer_dereference,address

hi, Chen Yu,

On Mon, Feb 19, 2024 at 08:49:51PM +0800, Chen, Yu C wrote:
> On 2024-01-30 at 18:13:32 +0800, Abel Wu wrote:
> > On 1/30/24 3:24 PM, kernel test robot Wrote:
> > >
> > >
> > > Hello,
> > >
> > > (besides a previous performance report),
> > > kernel test robot noticed "BUG:kernel_NULL_pointer_dereference,address" on:
> > >
> > > commit: 2227a957e1d5b1941be4e4207879ec74f4bb37f8 ("sched/eevdf: Sort the rbtree by virtual deadline")
> > > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
> > >
> > > [test failed on linus/master 3a5879d495b226d0404098e3564462d5f1daa33b]
> > > [test failed on linux-next/master 01af33cc9894b4489fb68fa35c40e9fe85df63dc]
> > >
> > > in testcase: trinity
> > > version: trinity-i386-abe9de86-1_20230429
> > > with following parameters:
> > >
> > > runtime: 300s
> > > group: group-03
> > > nr_groups: 5
> > >
> > > test-description: Trinity is a linux system call fuzz tester.
> > > test-url: http://codemonkey.org.uk/projects/trinity/
> > >
> > >
> > > compiler: clang-17
> > > test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
> > >
> > > (please refer to attached dmesg/kmsg for entire log/backtrace)
> > >
> > >
> > >
> > > we found this issue happens in very random way (23 out of 999 runs).
> > > but keeps clean on parent.
> >
> > Thanks for reporting, I will try to reproduce the issue. Does the 'parent'
> > mean the same code branch without this commit?
> >
> > >
> > > 84db47ca7146d7bd 2227a957e1d5b1941be4e420787
> > > ---------------- ---------------------------
> > > fail:runs %reproduction fail:runs
> > > | | |
> > > :999 2% 23:999 dmesg.BUG:kernel_NULL_pointer_dereference,address
> > > :999 2% 23:999 dmesg.Kernel_panic-not_syncing:Fatal_exception
> > > :999 2% 23:999 dmesg.Oops:#[##]
> > >
> > >
> > >
> > >
> > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > the same patch/commit), kindly add following tags
> > > | Reported-by: kernel test robot <[email protected]>
> > > | Closes: https://lore.kernel.org/oe-lkp/[email protected]
> > >
> > >
> > > sorry for below parse failure which caused no real line numbers.
> > > we will follow further. the orgial dmesg could be fetch from below link.
> > >
> > >
> > > [ 512.079810][ T8305] BUG: kernel NULL pointer dereference, address: 0000002c
> > > [ 512.080897][ T8305] #PF: supervisor read access in kernel mode
> > > [ 512.081636][ T8305] #PF: error_code(0x0000) - not-present page
> > > [ 512.082337][ T8305] *pde = 00000000
> > > [ 512.082829][ T8305] Oops: 0000 [#1] PREEMPT SMP
> > > [ 512.083407][ T8305] CPU: 1 PID: 8305 Comm: watchdog Tainted: G W N 6.7.0-rc1-00006-g2227a957e1d5 #1 819e6d1a8b887f5f97adb4aed77d98b15504c836
> > > [ 512.084986][ T8305] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> > > [ 512.086203][ T8305] EIP: set_next_entity (fair.c:?)
> >
> > There was actually a NULL-test in pick_eevdf() before this commit,
> > but I removed it by intent as I found it impossible to be NULL after
> > examining 'all' the cases.
> >
> > Also cc Tiwei who once proposed to add this check back.
> > https://lore.kernel.org/all/[email protected]/
> >
> > Thanks,
> > Abel
> >
>
> While looking at pick_eevdf(), I have a thought.

I applied below patch upon v6.8-rc5. in 999 runs, we cannot reproduce previous
issues which still exists on v6.8-rc5.

commit 32dff117ae985799019211b78b3f2d26f7d30109
Author: Chen Yu <[email protected]>
Date: Wed Feb 14 19:32:11 2024 +0800

sched/eevdf: Sort the entity by both deadline and eligibility


v6.8-rc5 32dff117ae985799019211b78b3
---------------- ---------------------------
fail:runs %reproduction fail:runs
| | |
45:999 -5% :999 dmesg.BUG:kernel_NULL_pointer_dereference,address



> Currently the sched entity is sorted by their deadline. During task
> pickup, the pick_eevdf() scans for an candidate sched entity with the
> smallest deadline. Meanwhile this candidate sched entity must also be
> eligible.
>
> The scan is O(lgn) on average, and O(1) at best case. How about making the
> average scan even faster by sorting the sched entity not only by deadline,
> but also the eligibility? The idea is that, the eligible sched entity with
> smaller deadline is sorted at the front the tree. Otherwise, if the entity
> is not eligible, even if it has a smaller deadline, it should be sorted
> at the end of the tree.
>
> After the change, pick_eevdf() get the leftmost sched entity at O(1) on
> average. Besides, it is guaranteed to return non-NULL sched entity in
> pick_eevdf(), which prevents suspicious NULL pointer exception in pick_eevdf().
>
> For example, suppose there are two sched entities to be queued, se_a and se_b.
> Consider their eligibility and deadline, there are 6 combination:
>
> 1. se_a is eligible, se_b is eligible, se_a.deadline < se_b.deadline
> 2. se_a is eligible, se_b is eligible, se_a.deadline >= se_b.deadline
> 3. se_a is eligible, se_b is not eligible
> 4. se_a is not eligible, se_b is eligible
> 5. se_a is not eligible, se_b is not eligible, se_a.deadline < se_b.deadline
> 6. se_a is not eligible, se_b is not eligible, se_a.deadline >= se_b.deadline
>
> In scenario 1, 3, 5, sched_entity se_a should be sorted before se_b,
> so pick_eevdf() would pick se_a first.
>
> When enqueuing a new sched entity, it is regarded as eligible if its
> vlag is positive. In theory later in pick_eevdf(), the eligibility
> of this sched entity should be re-checked via entity_eligible(). But
> consider if the sched entity is eliglble when enqueued, it is very
> likely the same sched entity remains eligible when pick_eevdf(), because
> the V keeps moving forward but the vruntime of this sched entity remain
> unchanged - the vlag could get larger.
>
> Something like this untested:
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 533547e3c90a..831043cc1432 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -551,11 +551,19 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
> static inline bool entity_before(const struct sched_entity *a,
> const struct sched_entity *b)
> {
> - /*
> - * Tiebreak on vruntime seems unnecessary since it can
> - * hardly happen.
> - */
> - return (s64)(a->deadline - b->deadline) < 0;
> + bool eli_a, eli_b;
> +
> + eli_a = (a->vlag >= 0) ? true : false;
> + eli_b = (b->vlag >= 0) ? true : false;
> +
> + if ((eli_a && eli_b) || (!eli_a && !eli_b))
> + /*
> + * Tiebreak on vruntime seems unnecessary since it can
> + * hardly happen.
> + */
> + return (s64)(a->deadline - b->deadline) < 0;
> +
> + return eli_a ? 1 : 0;
> }
>
> static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
> @@ -877,10 +885,8 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
> */
> static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> {
> - struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
> struct sched_entity *se = __pick_first_entity(cfs_rq);
> struct sched_entity *curr = cfs_rq->curr;
> - struct sched_entity *best = NULL;
>
> /*
> * We can safely skip eligibility check if there is only one entity
> @@ -899,45 +905,8 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
> return curr;
>
> - /* Pick the leftmost entity if it's eligible */
> - if (se && entity_eligible(cfs_rq, se)) {
> - best = se;
> - goto found;
> - }
> -
> - /* Heap search for the EEVD entity */
> - while (node) {
> - struct rb_node *left = node->rb_left;
> -
> - /*
> - * Eligible entities in left subtree are always better
> - * choices, since they have earlier deadlines.
> - */
> - if (left && vruntime_eligible(cfs_rq,
> - __node_2_se(left)->min_vruntime)) {
> - node = left;
> - continue;
> - }
> -
> - se = __node_2_se(node);
> -
> - /*
> - * The left subtree either is empty or has no eligible
> - * entity, so check the current node since it is the one
> - * with earliest deadline that might be eligible.
> - */
> - if (entity_eligible(cfs_rq, se)) {
> - best = se;
> - break;
> - }
> -
> - node = node->rb_right;
> - }
> -found:
> - if (!best || (curr && entity_before(curr, best)))
> - best = curr;
> -
> - return best;
> + /* Pick the leftmost entity */
> + return se;
> }
>
> #ifdef CONFIG_SCHED_DEBUG
> --
> 2.25.1
>