2024-04-25 09:12:16

by syzbot

Subject: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

Hello,

syzbot found the following issue on:

HEAD commit: 977b1ef51866 Merge tag 'block-6.9-20240420' of git://git.k..
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=17080d20980000
kernel config: https://syzkaller.appspot.com/x/.config?x=f47e5e015c177e57
dashboard link: https://syzkaller.appspot.com/bug?extid=83e7f982ca045ab4405c
compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/549d1add1da9/disk-977b1ef5.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/3e8e501c8aa2/vmlinux-977b1ef5.xz
kernel image: https://storage.googleapis.com/syzbot-assets/d02f7cb905b8/bzImage-977b1ef5.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: [email protected]

======================================================
WARNING: possible circular locking dependency detected
6.9.0-rc4-syzkaller-00266-g977b1ef51866 #0 Not tainted
------------------------------------------------------
syz-executor.0/11241 is trying to acquire lock:
ffff888020a2c0d8 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334

but task is already holding lock:
ffff8880b943e658 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (&rq->__lock){-.-.}-{2:2}:
lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
_raw_spin_lock_nested+0x31/0x40 kernel/locking/spinlock.c:378
raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
raw_spin_rq_lock kernel/sched/sched.h:1385 [inline]
_raw_spin_rq_lock_irqsave kernel/sched/sched.h:1404 [inline]
rq_lock_irqsave kernel/sched/sched.h:1683 [inline]
class_rq_lock_irqsave_constructor kernel/sched/sched.h:1737 [inline]
sched_mm_cid_exit_signals+0x17b/0x4b0 kernel/sched/core.c:12005
exit_signals+0x2a1/0x5c0 kernel/signal.c:3016
do_exit+0x6a8/0x27e0 kernel/exit.c:837
__do_sys_exit kernel/exit.c:994 [inline]
__se_sys_exit kernel/exit.c:992 [inline]
__pfx___ia32_sys_exit+0x0/0x10 kernel/exit.c:992
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #0 (&sighand->siglock){-.-.}-{2:2}:
check_prev_add kernel/locking/lockdep.c:3134 [inline]
check_prevs_add kernel/locking/lockdep.c:3253 [inline]
validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
force_sig_fault_to_task kernel/signal.c:1733 [inline]
force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
__bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
strncpy_from_user+0x2c6/0x2f0 lib/strncpy_from_user.c:138
strncpy_from_user_nofault+0x71/0x140 mm/maccess.c:186
bpf_probe_read_user_str_common kernel/trace/bpf_trace.c:216 [inline]
____bpf_probe_read_compat_str kernel/trace/bpf_trace.c:311 [inline]
bpf_probe_read_compat_str+0xe9/0x180 kernel/trace/bpf_trace.c:307
bpf_prog_e42f6260c1b72fb3+0x3d/0x3f
bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline]
__bpf_prog_run include/linux/filter.h:657 [inline]
bpf_prog_run include/linux/filter.h:664 [inline]
__bpf_trace_run kernel/trace/bpf_trace.c:2381 [inline]
bpf_trace_run4+0x25a/0x490 kernel/trace/bpf_trace.c:2422
__traceiter_sched_switch+0x98/0xd0 include/trace/events/sched.h:222
trace_sched_switch include/trace/events/sched.h:222 [inline]
__schedule+0x2535/0x4a00 kernel/sched/core.c:6743
preempt_schedule_irq+0xfb/0x1c0 kernel/sched/core.c:7068
irqentry_exit+0x5e/0x90 kernel/entry/common.c:354
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
force_sig_fault+0x0/0x1d0
__bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
__put_user_handle_exception+0x0/0x10
__do_sys_gettimeofday kernel/time/time.c:147 [inline]
__se_sys_gettimeofday+0xd9/0x240 kernel/time/time.c:140
emulate_vsyscall+0xe23/0x1290 arch/x86/entry/vsyscall/vsyscall_64.c:247
do_user_addr_fault arch/x86/mm/fault.c:1346 [inline]
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x160/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
_end+0x6a9da000/0x0

other info that might help us debug this:

Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&rq->__lock);
                               lock(&sighand->siglock);
                               lock(&rq->__lock);
  lock(&sighand->siglock);

*** DEADLOCK ***

2 locks held by syz-executor.0/11241:
#0: ffff8880b943e658 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
#1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
#1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
#1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: __bpf_trace_run kernel/trace/bpf_trace.c:2380 [inline]
#1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: bpf_trace_run4+0x16e/0x490 kernel/trace/bpf_trace.c:2422

stack backtrace:
CPU: 0 PID: 11241 Comm: syz-executor.0 Not tainted 6.9.0-rc4-syzkaller-00266-g977b1ef51866 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187
check_prev_add kernel/locking/lockdep.c:3134 [inline]
check_prevs_add kernel/locking/lockdep.c:3253 [inline]
validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
force_sig_fault_to_task kernel/signal.c:1733 [inline]
force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
__bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
RIP: 0010:do_strncpy_from_user lib/strncpy_from_user.c:72 [inline]
RIP: 0010:strncpy_from_user+0x2c6/0x2f0 lib/strncpy_from_user.c:139
Code: cc cc cc cc e8 ab 95 b6 fc 45 31 ed eb e0 e8 a1 95 b6 fc 49 c7 c5 f2 ff ff ff eb d2 e8 93 95 b6 fc 49 c7 c5 f2 ff ff ff eb c4 <f3> 0f 1e fa e8 81 95 b6 fc eb a1 f3 0f 1e fa e8 76 95 b6 fc 4d 29
RSP: 0018:ffffc90009f9f5e0 EFLAGS: 00050046
RAX: 0000000000000002 RBX: ffff8880795c3584 RCX: ffff8880795c1e00
RDX: ffffc90004bf1000 RSI: 0000000000000001 RDI: 0000000000000000
RBP: 0000000000000001 R08: ffffffff84df6a34 R09: ffffffff82056cb7
R10: 0000000000000003 R11: ffff8880795c1e00 R12: 0000000000000000
R13: 0000000000000000 R14: ffffc90009f9f6a8 R15: 0000000000000000
strncpy_from_user_nofault+0x71/0x140 mm/maccess.c:186
bpf_probe_read_user_str_common kernel/trace/bpf_trace.c:216 [inline]
____bpf_probe_read_compat_str kernel/trace/bpf_trace.c:311 [inline]
bpf_probe_read_compat_str+0xe9/0x180 kernel/trace/bpf_trace.c:307
bpf_prog_e42f6260c1b72fb3+0x3d/0x3f
bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline]
__bpf_prog_run include/linux/filter.h:657 [inline]
bpf_prog_run include/linux/filter.h:664 [inline]
__bpf_trace_run kernel/trace/bpf_trace.c:2381 [inline]
bpf_trace_run4+0x25a/0x490 kernel/trace/bpf_trace.c:2422
__traceiter_sched_switch+0x98/0xd0 include/trace/events/sched.h:222
trace_sched_switch include/trace/events/sched.h:222 [inline]
__schedule+0x2535/0x4a00 kernel/sched/core.c:6743
preempt_schedule_irq+0xfb/0x1c0 kernel/sched/core.c:7068
irqentry_exit+0x5e/0x90 kernel/entry/common.c:354
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
RIP: 0010:force_sig_fault+0x0/0x1d0 kernel/signal.c:1737
Code: 9a 00 e9 31 ff ff ff e8 1e 7e 1a 0a 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <f3> 0f 1e fa 55 48 89 e5 41 57 41 56 41 55 41 54 53 48 83 e4 e0 48
RSP: 0018:ffffc90009f9fb78 EFLAGS: 00000286
RAX: ffffffff8141f9d7 RBX: 0000000000000000 RCX: 0000000000040000
RDX: 0000000000000019 RSI: 0000000000000001 RDI: 000000000000000b
RBP: ffffc90009f9fc78 R08: ffffffff8141f976 R09: ffffffff81423712
R10: 0000000000000014 R11: ffff8880795c1e00 R12: dffffc0000000000
R13: ffffc90009f9fd70 R14: 1ffff920013f3fae R15: 0000000000000002
__bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
RIP: 0010:__put_user_handle_exception+0x0/0x10 arch/x86/lib/putuser.S:125
Code: 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 01 cb 48 89 01 31 c9 0f 01 ca c3 cc cc cc cc 66 2e 0f 1f 84 00 00 00 00 00 66 90 <0f> 01 ca b9 f2 ff ff ff c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90
RSP: 0018:ffffc90009f9fd98 EFLAGS: 00050202
RAX: 000000006624d6a7 RBX: 0000000000000000 RCX: 0000000000000019
RDX: 0000000000000000 RSI: ffffffff8bcaca20 RDI: ffffffff8c1eb160
RBP: ffffc90009f9fe50 R08: ffffffff8fa7b6af R09: 1ffffffff1f4f6d5
R10: dffffc0000000000 R11: fffffbfff1f4f6d6 R12: ffffc90009f9fde0
R13: dffffc0000000000 R14: 1ffff920013f3fb8 R15: 0000000000000019
__do_sys_gettimeofday kernel/time/time.c:147 [inline]
__se_sys_gettimeofday+0xd9/0x240 kernel/time/time.c:140
emulate_vsyscall+0xe23/0x1290 arch/x86/entry/vsyscall/vsyscall_64.c:247
do_user_addr_fault arch/x86/mm/fault.c:1346 [inline]
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x160/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
RIP: 0033:_end+0x6a9da000/0x0
Code: Unable to access opcode bytes at 0xffffffffff5fffd6.
RSP: 002b:00007f9364a35b38 EFLAGS: 00010246
RAX: ffffffffffffffda RBX: 00007f9363dabf80 RCX: 00007f9363c7dea9
RDX: 00007f9364a35b40 RSI: 00007f9364a35c70 RDI: 0000000000000019
RBP: 00007f9363cca4a4 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000007 R11: 0000000000000246 R12: 0000000000000000
R13: 000000000000000b R14: 00007f9363dabf80 R15: 00007ffc4b734e88
</TASK>
----------------
Code disassembly (best guess):
0: cc int3
1: cc int3
2: cc int3
3: cc int3
4: e8 ab 95 b6 fc call 0xfcb695b4
9: 45 31 ed xor %r13d,%r13d
c: eb e0 jmp 0xffffffee
e: e8 a1 95 b6 fc call 0xfcb695b4
13: 49 c7 c5 f2 ff ff ff mov $0xfffffffffffffff2,%r13
1a: eb d2 jmp 0xffffffee
1c: e8 93 95 b6 fc call 0xfcb695b4
21: 49 c7 c5 f2 ff ff ff mov $0xfffffffffffffff2,%r13
28: eb c4 jmp 0xffffffee
* 2a: f3 0f 1e fa endbr64 <-- trapping instruction
2e: e8 81 95 b6 fc call 0xfcb695b4
33: eb a1 jmp 0xffffffd6
35: f3 0f 1e fa endbr64
39: e8 76 95 b6 fc call 0xfcb695b4
3e: 4d rex.WRB
3f: 29 .byte 0x29


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at [email protected].

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup


2024-04-25 17:54:43

by Jiri Olsa

Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On Thu, Apr 25, 2024 at 02:05:31AM -0700, syzbot wrote:
> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit: 977b1ef51866 Merge tag 'block-6.9-20240420' of git://git.k..
> git tree: upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=17080d20980000
> kernel config: https://syzkaller.appspot.com/x/.config?x=f47e5e015c177e57
> dashboard link: https://syzkaller.appspot.com/bug?extid=83e7f982ca045ab4405c
> compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40
>
> Unfortunately, I don't have any reproducer for this issue yet.
>
> Downloadable assets:
> disk image: https://storage.googleapis.com/syzbot-assets/549d1add1da9/disk-977b1ef5.raw.xz
> vmlinux: https://storage.googleapis.com/syzbot-assets/3e8e501c8aa2/vmlinux-977b1ef5.xz
> kernel image: https://storage.googleapis.com/syzbot-assets/d02f7cb905b8/bzImage-977b1ef5.xz
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: [email protected]
>
> ======================================================
> WARNING: possible circular locking dependency detected
> 6.9.0-rc4-syzkaller-00266-g977b1ef51866 #0 Not tainted
> ------------------------------------------------------
> syz-executor.0/11241 is trying to acquire lock:
> ffff888020a2c0d8 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
>
> but task is already holding lock:
> ffff8880b943e658 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #1 (&rq->__lock){-.-.}-{2:2}:
> lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
> _raw_spin_lock_nested+0x31/0x40 kernel/locking/spinlock.c:378
> raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
> raw_spin_rq_lock kernel/sched/sched.h:1385 [inline]
> _raw_spin_rq_lock_irqsave kernel/sched/sched.h:1404 [inline]
> rq_lock_irqsave kernel/sched/sched.h:1683 [inline]
> class_rq_lock_irqsave_constructor kernel/sched/sched.h:1737 [inline]
> sched_mm_cid_exit_signals+0x17b/0x4b0 kernel/sched/core.c:12005
> exit_signals+0x2a1/0x5c0 kernel/signal.c:3016
> do_exit+0x6a8/0x27e0 kernel/exit.c:837
> __do_sys_exit kernel/exit.c:994 [inline]
> __se_sys_exit kernel/exit.c:992 [inline]
> __pfx___ia32_sys_exit+0x0/0x10 kernel/exit.c:992
> do_syscall_x64 arch/x86/entry/common.c:52 [inline]
> do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
>
> -> #0 (&sighand->siglock){-.-.}-{2:2}:
> check_prev_add kernel/locking/lockdep.c:3134 [inline]
> check_prevs_add kernel/locking/lockdep.c:3253 [inline]
> validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
> __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
> lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
> __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
> _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
> force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
> force_sig_fault_to_task kernel/signal.c:1733 [inline]
> force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
> __bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> strncpy_from_user+0x2c6/0x2f0 lib/strncpy_from_user.c:138
> strncpy_from_user_nofault+0x71/0x140 mm/maccess.c:186
> bpf_probe_read_user_str_common kernel/trace/bpf_trace.c:216 [inline]
> ____bpf_probe_read_compat_str kernel/trace/bpf_trace.c:311 [inline]
> bpf_probe_read_compat_str+0xe9/0x180 kernel/trace/bpf_trace.c:307
> bpf_prog_e42f6260c1b72fb3+0x3d/0x3f
> bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline]
> __bpf_prog_run include/linux/filter.h:657 [inline]
> bpf_prog_run include/linux/filter.h:664 [inline]
> __bpf_trace_run kernel/trace/bpf_trace.c:2381 [inline]
> bpf_trace_run4+0x25a/0x490 kernel/trace/bpf_trace.c:2422
> __traceiter_sched_switch+0x98/0xd0 include/trace/events/sched.h:222
> trace_sched_switch include/trace/events/sched.h:222 [inline]
> __schedule+0x2535/0x4a00 kernel/sched/core.c:6743
> preempt_schedule_irq+0xfb/0x1c0 kernel/sched/core.c:7068
> irqentry_exit+0x5e/0x90 kernel/entry/common.c:354
> asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
> force_sig_fault+0x0/0x1d0
> __bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> __put_user_handle_exception+0x0/0x10
> __do_sys_gettimeofday kernel/time/time.c:147 [inline]
> __se_sys_gettimeofday+0xd9/0x240 kernel/time/time.c:140
> emulate_vsyscall+0xe23/0x1290 arch/x86/entry/vsyscall/vsyscall_64.c:247
> do_user_addr_fault arch/x86/mm/fault.c:1346 [inline]
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> exc_page_fault+0x160/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> _end+0x6a9da000/0x0
>

I tried and I can reproduce a similar splat (below), but no clue yet ;-)
I wonder whether the current->thread.sig_on_uaccess_err flag might affect
the second bpf fault behaviour.
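
For reference, a minimal sketch of the kind of BPF program implicated by
the splat below: a tracepoint program on sched_switch that reads user
memory. The section name, helper and address here are illustrative
assumptions, not the exact test program (only the "krava" prog name is
taken from the bpf_prog_..._krava frame in the splat):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("raw_tracepoint/sched_switch")
  int krava(struct bpf_raw_tracepoint_args *ctx)
  {
          char buf[16];

          /* User-memory read from sched_switch context: this can run
           * inside __schedule() with rq->__lock held and page faults
           * disabled, so a fault here must be fixed up, never turned
           * into force_sig_fault(). The address is arbitrary. */
          bpf_probe_read_user_str(buf, sizeof(buf), (void *)0x1000);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";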

jirka


---
[ 315.747916] [ BUG: Invalid wait context ]
[ 315.748511] 6.9.0-rc1+ #60 Tainted: G OE
[ 315.749227] -----------------------------
[ 315.749820] test_progs/1263 is trying to lock:
[ 315.750463][ T1263] ffff888109f06f58 (&sighand->siglock){....}-{3:3}, at: force_sig_info_to_task+0x25/0x140
[ 315.751630][ T1263] other info that might help us debug this:
[ 315.752337][ T1263] context-{5:5}
[ 315.752796][ T1263] 2 locks held by test_progs/1263:
[ 315.753426][ T1263] #0: ffff88846d808918 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0x11b/0x1040
[ 315.754531][ T1263] #1: ffffffff839a3700 (rcu_read_lock){....}-{1:3}, at: trace_call_bpf+0x6d/0x4a0
[ 315.755630][ T1263] stack backtrace:
[ 315.756133][ T1263] CPU: 2 PID: 1263 Comm: test_progs Tainted: G OE 6.9.0-rc1+ #60 e3139236695c37204e0f43029c0efe69ab334496
[ 315.757607][ T1263] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014
[ 315.758741][ T1263] Call Trace:
[ 315.759208][ T1263] <TASK>
[ 315.759631][ T1263] dump_stack_lvl+0x133/0x150
[ 315.760252][ T1263] __lock_acquire+0x98a/0x2420
[ 315.760876][ T1263] ? ring_buffer_lock_reserve+0x126/0x410
[ 315.761623][ T1263] lock_acquire+0x105/0x380
[ 315.762216][ T1263] ? force_sig_info_to_task+0x25/0x140
[ 315.762892][ T1263] ? kernelmode_fixup_or_oops+0x42/0x170
[ 315.763577][ T1263] ? trace_vbprintk+0x173/0x260
[ 315.764191][ T1263] _raw_spin_lock_irqsave+0x69/0xc0
[ 315.764842][ T1263] ? force_sig_info_to_task+0x25/0x140
[ 315.765519][ T1263] force_sig_info_to_task+0x25/0x140
[ 315.766182][ T1263] force_sig_fault+0x5a/0x80
[ 315.766767][ T1263] exc_page_fault+0x82/0x2a0
[ 315.767382][ T1263] asm_exc_page_fault+0x22/0x30
[ 315.768018][ T1263] RIP: 0010:strncpy_from_user+0xe5/0x140
[ 315.768713][ T1263] Code: d8 48 89 44 15 00 48 b8 08 06 05 04 03 02 01 00 48 0f af d8 48 c1 eb 38 48 8d 04 13 0f 01 ca 5b 5d 41 5c 41 5d c3 cc cc cc cc <48> 85 db 74 20 48 01 d3 eb 09 48 83 c2 01 48 39 da 74 15 41 8a 44
[ 315.770996][ T1263] RSP: 0000:ffffc90001947a40 EFLAGS: 00050002
[ 315.771780][ T1263] RAX: 0000000000000001 RBX: 0000000000000008 RCX: 0000000000000004
[ 315.772834][ T1263] RDX: 0000000000000000 RSI: 8080808080808080 RDI: ffffc90001947aa8
[ 315.773880][ T1263] RBP: ffffc90001947aa8 R08: fefefefefefefeff R09: 0000000000008a47
[ 315.774965][ T1263] R10: 0000000000000011 R11: 000000000001600b R12: 0000000000000008
[ 315.775830][ T1263] R13: 0000000000000001 R14: 0000000000000001 R15: ffffc90001633000
[ 315.776698][ T1263] strncpy_from_user_nofault+0x28/0x70
[ 315.778139][ T1263] bpf_probe_read_compat_str+0x51/0x90
[ 315.778707][ T1263] bpf_prog_9199568cec6305d9_krava+0x3c/0x40
[ 315.779314][ T1263] trace_call_bpf+0x127/0x4a0
[ 315.779801][ T1263] perf_trace_run_bpf_submit+0x4f/0xd0
[ 315.786332][ T1263] perf_trace_sched_switch+0x163/0x1a0
[ 315.786885][ T1263] __traceiter_sched_switch+0x3e/0x60
[ 315.787427][ T1263] __schedule+0x5eb/0x1040
[ 315.787891][ T1263] ? asm_sysvec_call_function_single+0x16/0x20
[ 315.788503][ T1263] ? preempt_schedule_thunk+0x16/0x30
[ 315.789041][ T1263] preempt_schedule_common+0x2c/0x70
[ 315.789568][ T1263] preempt_schedule_thunk+0x16/0x30
[ 315.790096][ T1263] _raw_spin_unlock_irqrestore+0x8e/0xa0
[ 315.790654][ T1263] force_sig_info_to_task+0xf3/0x140
[ 315.791174][ T1263] force_sig_fault+0x5a/0x80
[ 315.791642][ T1263] exc_page_fault+0x82/0x2a0
[ 315.792108][ T1263] asm_exc_page_fault+0x22/0x30
[ 315.792589][ T1263] RIP: 0010:_copy_to_user+0x45/0x60
[ 315.793116][ T1263] Code: 1b 9a ff 48 89 d8 48 01 e8 0f 92 c2 48 85 c0 78 28 0f b6 d2 48 85 d2 75 20 0f 01 cb 48 89 d9 48 89 ef 4c 89 e6 f3 a4 0f 1f 00 <0f> 01 ca 5b 48 89 c8 5d 41 5c c3 cc cc cc cc 48 89 d8 5b 5d 41 5c
[ 315.794884][ T1263] RSP: 0000:ffffc90001947e50 EFLAGS: 00050246
[ 315.795476][ T1263] RAX: 0000000000000009 RBX: 0000000000000008 RCX: 0000000000000008
[ 315.796256][ T1263] RDX: 0000000000000000 RSI: ffffffff85577050 RDI: 0000000000000001
[ 315.797012][ T1263] RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000001
[ 315.797753][ T1263] R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff85577050
[ 315.798518][ T1263] R13: ffffc90001947f58 R14: ffff88817d51b6c0 R15: 0000000000000060
[ 315.799287][ T1263] __x64_sys_gettimeofday+0xc9/0xe0
[ 315.799801][ T1263] ? 0xffffffffff600000
[ 315.800241][ T1263] emulate_vsyscall+0x1be/0x420
[ 315.800732][ T1263] ? 0xffffffffff600000
[ 315.801168][ T1263] do_user_addr_fault+0x4d3/0x8b0
[ 315.801651][ T1263] ? 0xffffffffff600000
[ 315.802067][ T1263] ? 0xffffffffff600000
[ 315.802460][ T1263] exc_page_fault+0x82/0x2a0
[ 315.802908][ T1263] asm_exc_page_fault+0x22/0x30
[ 315.803389][ T1263] RIP: 0033:__init_scratch_end+0x79200000/0xffffffffffa26000
[ 315.804112][ T1263] Code: Unable to access opcode bytes at 0xffffffffff5fffd6.
[ 315.804802][ T1263] RSP: 002b:00007ffc2bdaee98 EFLAGS: 00010246
[ 315.805352][ T1263] RAX: ffffffffffffffda RBX: 00007ffc2bdaf138 RCX: 0000000000000000
[ 315.806099][ T1263] RDX: 0000000000000002 RSI: 0000000000000001 RDI: 0000000000000000
[ 315.806863][ T1263] RBP: 00007ffc2bdaeef0 R08: 0000000000000064 R09: 0000000000000000
[ 315.808206][ T1263] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000004
[ 315.808929][ T1263] R13: 0000000000000000 R14: 00007f7ef5921000 R15: 0000000001402db0
[ 315.809628][ T1263] </TASK>

2024-04-27 20:00:53

by syzbot

Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

syzbot has found a reproducer for the following issue on:

HEAD commit: 5eb4573ea63d Merge tag 'soc-fixes-6.9-2' of git://git.kern..
git tree: upstream
console+strace: https://syzkaller.appspot.com/x/log.txt?x=17b2b240980000
kernel config: https://syzkaller.appspot.com/x/.config?x=3d46aa9d7a44f40d
dashboard link: https://syzkaller.appspot.com/bug?extid=83e7f982ca045ab4405c
compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40
syz repro: https://syzkaller.appspot.com/x/repro.syz?x=120f79ef180000
C reproducer: https://syzkaller.appspot.com/x/repro.c?x=13a1cd27180000

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/d647177a878d/disk-5eb4573e.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/977f32ca169c/vmlinux-5eb4573e.xz
kernel image: https://storage.googleapis.com/syzbot-assets/67f3b92c1012/bzImage-5eb4573e.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: [email protected]

======================================================
WARNING: possible circular locking dependency detected
6.9.0-rc5-syzkaller-00296-g5eb4573ea63d #0 Not tainted
------------------------------------------------------
syz-executor324/5151 is trying to acquire lock:
ffff88802a6c8018 (&sighand->siglock){....}-{2:2}, at: force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334

but task is already holding lock:
ffff8880b943e658 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #2 (&rq->__lock){-.-.}-{2:2}:
lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
_raw_spin_lock_nested+0x31/0x40 kernel/locking/spinlock.c:378
raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
raw_spin_rq_lock kernel/sched/sched.h:1387 [inline]
rq_lock kernel/sched/sched.h:1701 [inline]
task_fork_fair+0x61/0x1e0 kernel/sched/fair.c:12635
sched_cgroup_fork+0x37c/0x410 kernel/sched/core.c:4845
copy_process+0x2217/0x3df0 kernel/fork.c:2499
kernel_clone+0x223/0x870 kernel/fork.c:2797
user_mode_thread+0x132/0x1a0 kernel/fork.c:2875
rest_init+0x23/0x300 init/main.c:704
start_kernel+0x47a/0x500 init/main.c:1081
x86_64_start_reservations+0x2a/0x30 arch/x86/kernel/head64.c:507
x86_64_start_kernel+0x99/0xa0 arch/x86/kernel/head64.c:488
common_startup_64+0x13e/0x147

-> #1 (&p->pi_lock){-.-.}-{2:2}:
lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:553 [inline]
try_to_wake_up+0xb0/0x1470 kernel/sched/core.c:4262
signal_wake_up_state+0xb4/0x120 kernel/signal.c:773
signal_wake_up include/linux/sched/signal.h:448 [inline]
complete_signal+0x94a/0xcf0 kernel/signal.c:1065
__send_signal_locked+0xb1b/0xdc0 kernel/signal.c:1185
do_notify_parent+0xd96/0x10a0 kernel/signal.c:2143
exit_notify kernel/exit.c:757 [inline]
do_exit+0x1811/0x27e0 kernel/exit.c:898
do_group_exit+0x207/0x2c0 kernel/exit.c:1027
__do_sys_exit_group kernel/exit.c:1038 [inline]
__se_sys_exit_group kernel/exit.c:1036 [inline]
__x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1036
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #0 (&sighand->siglock){....}-{2:2}:
check_prev_add kernel/locking/lockdep.c:3134 [inline]
check_prevs_add kernel/locking/lockdep.c:3253 [inline]
validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
force_sig_fault_to_task kernel/signal.c:1733 [inline]
force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
__bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
rep_movs_alternative+0x22/0x70 arch/x86/lib/copy_user_64.S:48
copy_user_generic arch/x86/include/asm/uaccess_64.h:110 [inline]
raw_copy_from_user arch/x86/include/asm/uaccess_64.h:125 [inline]
__copy_from_user_inatomic include/linux/uaccess.h:87 [inline]
copy_from_user_nofault+0xbc/0x150 mm/maccess.c:125
bpf_probe_read_user_common kernel/trace/bpf_trace.c:179 [inline]
____bpf_probe_read_compat kernel/trace/bpf_trace.c:292 [inline]
bpf_probe_read_compat+0xe9/0x180 kernel/trace/bpf_trace.c:288
bpf_prog_1878750df62aa1fb+0x48/0x4a
bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline]
__bpf_prog_run include/linux/filter.h:657 [inline]
bpf_prog_run include/linux/filter.h:664 [inline]
__bpf_trace_run kernel/trace/bpf_trace.c:2381 [inline]
bpf_trace_run4+0x25a/0x490 kernel/trace/bpf_trace.c:2422
__traceiter_sched_switch+0x98/0xd0 include/trace/events/sched.h:222
trace_sched_switch include/trace/events/sched.h:222 [inline]
__schedule+0x2535/0x4a00 kernel/sched/core.c:6743
preempt_schedule_common+0x84/0xd0 kernel/sched/core.c:6925
preempt_schedule+0xe1/0xf0 kernel/sched/core.c:6949
preempt_schedule_thunk+0x1a/0x30 arch/x86/entry/thunk_64.S:12
__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline]
_raw_spin_unlock_irqrestore+0x130/0x140 kernel/locking/spinlock.c:194
spin_unlock_irqrestore include/linux/spinlock.h:406 [inline]
force_sig_info_to_task+0x41c/0x580 kernel/signal.c:1356
force_sig_fault_to_task kernel/signal.c:1733 [inline]
force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
__bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
__put_user_handle_exception+0x0/0x10
__do_sys_gettimeofday kernel/time/time.c:147 [inline]
__se_sys_gettimeofday+0xd9/0x240 kernel/time/time.c:140
emulate_vsyscall+0xe23/0x1290 arch/x86/entry/vsyscall/vsyscall_64.c:247
do_user_addr_fault arch/x86/mm/fault.c:1346 [inline]
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x160/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
_end+0x6a9da000/0x0

other info that might help us debug this:

Chain exists of:
&sighand->siglock --> &p->pi_lock --> &rq->__lock

Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&rq->__lock);
                               lock(&p->pi_lock);
                               lock(&rq->__lock);
  lock(&sighand->siglock);

*** DEADLOCK ***

2 locks held by syz-executor324/5151:
#0: ffff8880b943e658 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
#1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
#1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
#1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: __bpf_trace_run kernel/trace/bpf_trace.c:2380 [inline]
#1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: bpf_trace_run4+0x16e/0x490 kernel/trace/bpf_trace.c:2422

stack backtrace:
CPU: 0 PID: 5151 Comm: syz-executor324 Not tainted 6.9.0-rc5-syzkaller-00296-g5eb4573ea63d #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187
check_prev_add kernel/locking/lockdep.c:3134 [inline]
check_prevs_add kernel/locking/lockdep.c:3253 [inline]
validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
force_sig_fault_to_task kernel/signal.c:1733 [inline]
force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
__bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
RIP: 0010:rep_movs_alternative+0x22/0x70 arch/x86/lib/copy_user_64.S:50
Code: 90 90 90 90 90 90 90 90 f3 0f 1e fa 48 83 f9 40 73 40 83 f9 08 73 21 85 c9 74 0f 8a 06 88 07 48 ff c7 48 ff c6 48 ff c9 75 f1 <c3> cc cc cc cc 66 0f 1f 84 00 00 00 00 00 48 8b 06 48 89 07 48 83
RSP: 0000:ffffc90004137468 EFLAGS: 00050002
RAX: ffffffff8205ce4e RBX: dffffc0000000000 RCX: 0000000000000002
RDX: 0000000000000000 RSI: 0000000000000900 RDI: ffffc900041374e8
RBP: ffff88802d039784 R08: 0000000000000005 R09: ffffffff8205ce37
R10: 0000000000000003 R11: ffff88802d038000 R12: 1ffff11005a072f0
R13: 0000000000000900 R14: 0000000000000002 R15: ffffc900041374e8
copy_user_generic arch/x86/include/asm/uaccess_64.h:110 [inline]
raw_copy_from_user arch/x86/include/asm/uaccess_64.h:125 [inline]
__copy_from_user_inatomic include/linux/uaccess.h:87 [inline]
copy_from_user_nofault+0xbc/0x150 mm/maccess.c:125
bpf_probe_read_user_common kernel/trace/bpf_trace.c:179 [inline]
____bpf_probe_read_compat kernel/trace/bpf_trace.c:292 [inline]
bpf_probe_read_compat+0xe9/0x180 kernel/trace/bpf_trace.c:288
bpf_prog_1878750df62aa1fb+0x48/0x4a
bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline]
__bpf_prog_run include/linux/filter.h:657 [inline]
bpf_prog_run include/linux/filter.h:664 [inline]
__bpf_trace_run kernel/trace/bpf_trace.c:2381 [inline]
bpf_trace_run4+0x25a/0x490 kernel/trace/bpf_trace.c:2422
__traceiter_sched_switch+0x98/0xd0 include/trace/events/sched.h:222
trace_sched_switch include/trace/events/sched.h:222 [inline]
__schedule+0x2535/0x4a00 kernel/sched/core.c:6743
preempt_schedule_common+0x84/0xd0 kernel/sched/core.c:6925
preempt_schedule+0xe1/0xf0 kernel/sched/core.c:6949
preempt_schedule_thunk+0x1a/0x30 arch/x86/entry/thunk_64.S:12
__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline]
_raw_spin_unlock_irqrestore+0x130/0x140 kernel/locking/spinlock.c:194
spin_unlock_irqrestore include/linux/spinlock.h:406 [inline]
force_sig_info_to_task+0x41c/0x580 kernel/signal.c:1356
force_sig_fault_to_task kernel/signal.c:1733 [inline]
force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
__bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
RIP: 0010:__put_user_handle_exception+0x0/0x10 arch/x86/lib/putuser.S:125
Code: 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 01 cb 48 89 01 31 c9 0f 01 ca c3 cc cc cc cc 66 2e 0f 1f 84 00 00 00 00 00 66 90 <0f> 01 ca b9 f2 ff ff ff c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90
RSP: 0000:ffffc90004137d98 EFLAGS: 00050202
RAX: 00000000662d5943 RBX: 0000000000000000 RCX: 0000000000000019
RDX: 0000000000000000 RSI: ffffffff8bcaca20 RDI: ffffffff8c1eaba0
RBP: ffffc90004137e50 R08: ffffffff8fa7cd6f R09: 1ffffffff1f4f9ad
R10: dffffc0000000000 R11: fffffbfff1f4f9ae R12: ffffc90004137de0
R13: dffffc0000000000 R14: 1ffff92000826fb8 R15: 0000000000000019
__do_sys_gettimeofday kernel/time/time.c:147 [inline]
__se_sys_gettimeofday+0xd9/0x240 kernel/time/time.c:140
emulate_vsyscall+0xe23/0x1290 arch/x86/entry/vsyscall/vsyscall_64.c:247
do_user_addr_fault arch/x86/mm/fault.c:1346 [inline]
handle_page_fault arch/x86/mm/fault.c:1505 [inline]
exc_page_fault+0x160/0x8e0 arch/x86/mm/fault.c:1563
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
RIP: 0033:_end+0x6a9da000/0x0
Code: Unable to access opcode bytes at 0xffffffffff5fffd6.
RSP: 002b:00007fbb40c81c78 EFLAGS: 00010246
RAX: ffffffffffffffda RBX: 00007fbb40d73418 RCX: 00007fbb40ce97d9
RDX: 00007fbb40c81c80 RSI: 00007fbb40c81db0 RDI: 0000000000000019
RBP: 00007fbb40d73410 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000007 R11: 0000000000000246 R12: 00007fbb40d402b0
R13: 77735f6465686373 R14: 66aa589070d556b8 R15: 0400000000000004
</TASK>
syz-executor324[5151] vsyscall fault (exploit attempt?) ip:ffffffffff600000 cs:33 sp:7fbb40c81c78 ax:ffffffffffffffda si:7fbb40c81db0 di:19
----------------
Code disassembly (best guess):
0: 90 nop
1: 90 nop
2: 90 nop
3: 90 nop
4: 90 nop
5: 90 nop
6: 90 nop
7: 90 nop
8: f3 0f 1e fa endbr64
c: 48 83 f9 40 cmp $0x40,%rcx
10: 73 40 jae 0x52
12: 83 f9 08 cmp $0x8,%ecx
15: 73 21 jae 0x38
17: 85 c9 test %ecx,%ecx
19: 74 0f je 0x2a
1b: 8a 06 mov (%rsi),%al
1d: 88 07 mov %al,(%rdi)
1f: 48 ff c7 inc %rdi
22: 48 ff c6 inc %rsi
25: 48 ff c9 dec %rcx
28: 75 f1 jne 0x1b
* 2a: c3 ret <-- trapping instruction
2b: cc int3
2c: cc int3
2d: cc int3
2e: cc int3
2f: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
36: 00 00
38: 48 8b 06 mov (%rsi),%rax
3b: 48 89 07 mov %rax,(%rdi)
3e: 48 rex.W
3f: 83 .byte 0x83


---
If you want syzbot to run the reproducer, reply with:
#syz test: git://repo/address.git branch-or-commit-hash
If you attach or paste a git patch, syzbot will apply it before testing.

2024-04-27 23:13:38

by Hillf Danton

Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On Sat, 27 Apr 2024 13:00:42 -0700, syzbot wrote:
> syzbot has found a reproducer for the following issue on:
>
> HEAD commit: 5eb4573ea63d Merge tag 'soc-fixes-6.9-2' of git://git.kern..
> git tree: upstream
> console+strace: https://syzkaller.appspot.com/x/log.txt?x=17b2b240980000
> kernel config: https://syzkaller.appspot.com/x/.config?x=3d46aa9d7a44f40d
> dashboard link: https://syzkaller.appspot.com/bug?extid=83e7f982ca045ab4405c
> compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40
> syz repro: https://syzkaller.appspot.com/x/repro.syz?x=120f79ef180000
> C reproducer: https://syzkaller.appspot.com/x/repro.c?x=13a1cd27180000
>
> Downloadable assets:
> disk image: https://storage.googleapis.com/syzbot-assets/d647177a878d/disk-5eb4573e.raw.xz
> vmlinux: https://storage.googleapis.com/syzbot-assets/977f32ca169c/vmlinux-5eb4573e.xz
> kernel image: https://storage.googleapis.com/syzbot-assets/67f3b92c1012/bzImage-5eb4573e.xz
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: [email protected]
>
> ======================================================
> WARNING: possible circular locking dependency detected
> 6.9.0-rc5-syzkaller-00296-g5eb4573ea63d #0 Not tainted
> ------------------------------------------------------
> syz-executor324/5151 is trying to acquire lock:
> ffff88802a6c8018 (&sighand->siglock){....}-{2:2}, at: force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
>
> but task is already holding lock:
> ffff8880b943e658 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #2 (&rq->__lock){-.-.}-{2:2}:
> lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
> _raw_spin_lock_nested+0x31/0x40 kernel/locking/spinlock.c:378
> raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
> raw_spin_rq_lock kernel/sched/sched.h:1387 [inline]
> rq_lock kernel/sched/sched.h:1701 [inline]
> task_fork_fair+0x61/0x1e0 kernel/sched/fair.c:12635
> sched_cgroup_fork+0x37c/0x410 kernel/sched/core.c:4845
> copy_process+0x2217/0x3df0 kernel/fork.c:2499
> kernel_clone+0x223/0x870 kernel/fork.c:2797
> user_mode_thread+0x132/0x1a0 kernel/fork.c:2875
> rest_init+0x23/0x300 init/main.c:704
> start_kernel+0x47a/0x500 init/main.c:1081
> x86_64_start_reservations+0x2a/0x30 arch/x86/kernel/head64.c:507
> x86_64_start_kernel+0x99/0xa0 arch/x86/kernel/head64.c:488
> common_startup_64+0x13e/0x147
>
> -> #1 (&p->pi_lock){-.-.}-{2:2}:
> lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
> __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
> _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
> class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:553 [inline]
> try_to_wake_up+0xb0/0x1470 kernel/sched/core.c:4262
> signal_wake_up_state+0xb4/0x120 kernel/signal.c:773
> signal_wake_up include/linux/sched/signal.h:448 [inline]
> complete_signal+0x94a/0xcf0 kernel/signal.c:1065
> __send_signal_locked+0xb1b/0xdc0 kernel/signal.c:1185
> do_notify_parent+0xd96/0x10a0 kernel/signal.c:2143
> exit_notify kernel/exit.c:757 [inline]
> do_exit+0x1811/0x27e0 kernel/exit.c:898
> do_group_exit+0x207/0x2c0 kernel/exit.c:1027
> __do_sys_exit_group kernel/exit.c:1038 [inline]
> __se_sys_exit_group kernel/exit.c:1036 [inline]
> __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1036
> do_syscall_x64 arch/x86/entry/common.c:52 [inline]
> do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
>
> -> #0 (&sighand->siglock){....}-{2:2}:
> check_prev_add kernel/locking/lockdep.c:3134 [inline]
> check_prevs_add kernel/locking/lockdep.c:3253 [inline]
> validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
> __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
> lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
> __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
> _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
> force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
> force_sig_fault_to_task kernel/signal.c:1733 [inline]
> force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
> __bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]

Given a page fault with the runqueue locked, bpf makes trouble here
instead of helping anything.

> exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> rep_movs_alternative+0x22/0x70 arch/x86/lib/copy_user_64.S:48
> copy_user_generic arch/x86/include/asm/uaccess_64.h:110 [inline]
> raw_copy_from_user arch/x86/include/asm/uaccess_64.h:125 [inline]
> __copy_from_user_inatomic include/linux/uaccess.h:87 [inline]
> copy_from_user_nofault+0xbc/0x150 mm/maccess.c:125
> bpf_probe_read_user_common kernel/trace/bpf_trace.c:179 [inline]
> ____bpf_probe_read_compat kernel/trace/bpf_trace.c:292 [inline]
> bpf_probe_read_compat+0xe9/0x180 kernel/trace/bpf_trace.c:288
> bpf_prog_1878750df62aa1fb+0x48/0x4a
> bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline]
> __bpf_prog_run include/linux/filter.h:657 [inline]
> bpf_prog_run include/linux/filter.h:664 [inline]
> __bpf_trace_run kernel/trace/bpf_trace.c:2381 [inline]
> bpf_trace_run4+0x25a/0x490 kernel/trace/bpf_trace.c:2422
> __traceiter_sched_switch+0x98/0xd0 include/trace/events/sched.h:222
> trace_sched_switch include/trace/events/sched.h:222 [inline]
> __schedule+0x2535/0x4a00 kernel/sched/core.c:6743
> preempt_schedule_common+0x84/0xd0 kernel/sched/core.c:6925
> preempt_schedule+0xe1/0xf0 kernel/sched/core.c:6949
> preempt_schedule_thunk+0x1a/0x30 arch/x86/entry/thunk_64.S:12
> __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline]
> _raw_spin_unlock_irqrestore+0x130/0x140 kernel/locking/spinlock.c:194
> spin_unlock_irqrestore include/linux/spinlock.h:406 [inline]
> force_sig_info_to_task+0x41c/0x580 kernel/signal.c:1356
> force_sig_fault_to_task kernel/signal.c:1733 [inline]
> force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
> __bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> __put_user_handle_exception+0x0/0x10
> __do_sys_gettimeofday kernel/time/time.c:147 [inline]
> __se_sys_gettimeofday+0xd9/0x240 kernel/time/time.c:140
> emulate_vsyscall+0xe23/0x1290 arch/x86/entry/vsyscall/vsyscall_64.c:247
> do_user_addr_fault arch/x86/mm/fault.c:1346 [inline]
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> exc_page_fault+0x160/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> _end+0x6a9da000/0x0
>
> other info that might help us debug this:
>
> Chain exists of:
> &sighand->siglock --> &p->pi_lock --> &rq->__lock
>
> Possible unsafe locking scenario:
>
>        CPU0                    CPU1
>        ----                    ----
>   lock(&rq->__lock);
>                                lock(&p->pi_lock);
>                                lock(&rq->__lock);
>   lock(&sighand->siglock);
>
> *** DEADLOCK ***
>
> 2 locks held by syz-executor324/5151:
> #0: ffff8880b943e658 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x2a/0x140 kernel/sched/core.c:559
> #1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
> #1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
> #1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: __bpf_trace_run kernel/trace/bpf_trace.c:2380 [inline]
> #1: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: bpf_trace_run4+0x16e/0x490 kernel/trace/bpf_trace.c:2422
>
> stack backtrace:
> CPU: 0 PID: 5151 Comm: syz-executor324 Not tainted 6.9.0-rc5-syzkaller-00296-g5eb4573ea63d #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024
> Call Trace:
> <TASK>
> __dump_stack lib/dump_stack.c:88 [inline]
> dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
> check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187
> check_prev_add kernel/locking/lockdep.c:3134 [inline]
> check_prevs_add kernel/locking/lockdep.c:3253 [inline]
> validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
> __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
> lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
> __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
> _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
> force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
> force_sig_fault_to_task kernel/signal.c:1733 [inline]
> force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
> __bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> RIP: 0010:rep_movs_alternative+0x22/0x70 arch/x86/lib/copy_user_64.S:50
> Code: 90 90 90 90 90 90 90 90 f3 0f 1e fa 48 83 f9 40 73 40 83 f9 08 73 21 85 c9 74 0f 8a 06 88 07 48 ff c7 48 ff c6 48 ff c9 75 f1 <c3> cc cc cc cc 66 0f 1f 84 00 00 00 00 00 48 8b 06 48 89 07 48 83
> RSP: 0000:ffffc90004137468 EFLAGS: 00050002
> RAX: ffffffff8205ce4e RBX: dffffc0000000000 RCX: 0000000000000002
> RDX: 0000000000000000 RSI: 0000000000000900 RDI: ffffc900041374e8
> RBP: ffff88802d039784 R08: 0000000000000005 R09: ffffffff8205ce37
> R10: 0000000000000003 R11: ffff88802d038000 R12: 1ffff11005a072f0
> R13: 0000000000000900 R14: 0000000000000002 R15: ffffc900041374e8
> copy_user_generic arch/x86/include/asm/uaccess_64.h:110 [inline]
> raw_copy_from_user arch/x86/include/asm/uaccess_64.h:125 [inline]
> __copy_from_user_inatomic include/linux/uaccess.h:87 [inline]
> copy_from_user_nofault+0xbc/0x150 mm/maccess.c:125
> bpf_probe_read_user_common kernel/trace/bpf_trace.c:179 [inline]
> ____bpf_probe_read_compat kernel/trace/bpf_trace.c:292 [inline]
> bpf_probe_read_compat+0xe9/0x180 kernel/trace/bpf_trace.c:288
> bpf_prog_1878750df62aa1fb+0x48/0x4a
> bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline]
> __bpf_prog_run include/linux/filter.h:657 [inline]
> bpf_prog_run include/linux/filter.h:664 [inline]
> __bpf_trace_run kernel/trace/bpf_trace.c:2381 [inline]
> bpf_trace_run4+0x25a/0x490 kernel/trace/bpf_trace.c:2422
> __traceiter_sched_switch+0x98/0xd0 include/trace/events/sched.h:222
> trace_sched_switch include/trace/events/sched.h:222 [inline]
> __schedule+0x2535/0x4a00 kernel/sched/core.c:6743
> preempt_schedule_common+0x84/0xd0 kernel/sched/core.c:6925
> preempt_schedule+0xe1/0xf0 kernel/sched/core.c:6949
> preempt_schedule_thunk+0x1a/0x30 arch/x86/entry/thunk_64.S:12
> __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline]
> _raw_spin_unlock_irqrestore+0x130/0x140 kernel/locking/spinlock.c:194
> spin_unlock_irqrestore include/linux/spinlock.h:406 [inline]
> force_sig_info_to_task+0x41c/0x580 kernel/signal.c:1356
> force_sig_fault_to_task kernel/signal.c:1733 [inline]
> force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
> __bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> RIP: 0010:__put_user_handle_exception+0x0/0x10 arch/x86/lib/putuser.S:125
> Code: 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 01 cb 48 89 01 31 c9 0f 01 ca c3 cc cc cc cc 66 2e 0f 1f 84 00 00 00 00 00 66 90 <0f> 01 ca b9 f2 ff ff ff c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90
> RSP: 0000:ffffc90004137d98 EFLAGS: 00050202
> RAX: 00000000662d5943 RBX: 0000000000000000 RCX: 0000000000000019
> RDX: 0000000000000000 RSI: ffffffff8bcaca20 RDI: ffffffff8c1eaba0
> RBP: ffffc90004137e50 R08: ffffffff8fa7cd6f R09: 1ffffffff1f4f9ad
> R10: dffffc0000000000 R11: fffffbfff1f4f9ae R12: ffffc90004137de0
> R13: dffffc0000000000 R14: 1ffff92000826fb8 R15: 0000000000000019
> __do_sys_gettimeofday kernel/time/time.c:147 [inline]
> __se_sys_gettimeofday+0xd9/0x240 kernel/time/time.c:140
> emulate_vsyscall+0xe23/0x1290 arch/x86/entry/vsyscall/vsyscall_64.c:247
> do_user_addr_fault arch/x86/mm/fault.c:1346 [inline]
> handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> exc_page_fault+0x160/0x8e0 arch/x86/mm/fault.c:1563
> asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> RIP: 0033:_end+0x6a9da000/0x0
> Code: Unable to access opcode bytes at 0xffffffffff5fffd6.
> RSP: 002b:00007fbb40c81c78 EFLAGS: 00010246
> RAX: ffffffffffffffda RBX: 00007fbb40d73418 RCX: 00007fbb40ce97d9
> RDX: 00007fbb40c81c80 RSI: 00007fbb40c81db0 RDI: 0000000000000019
> RBP: 00007fbb40d73410 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000007 R11: 0000000000000246 R12: 00007fbb40d402b0
> R13: 77735f6465686373 R14: 66aa589070d556b8 R15: 0400000000000004
> </TASK>
> syz-executor324[5151] vsyscall fault (exploit attempt?) ip:ffffffffff600000 cs:33 sp:7fbb40c81c78 ax:ffffffffffffffda si:7fbb40c81db0 di:19
> ----------------
> Code disassembly (best guess):
> 0: 90 nop
> 1: 90 nop
> 2: 90 nop
> 3: 90 nop
> 4: 90 nop
> 5: 90 nop
> 6: 90 nop
> 7: 90 nop
> 8: f3 0f 1e fa endbr64
> c: 48 83 f9 40 cmp $0x40,%rcx
> 10: 73 40 jae 0x52
> 12: 83 f9 08 cmp $0x8,%ecx
> 15: 73 21 jae 0x38
> 17: 85 c9 test %ecx,%ecx
> 19: 74 0f je 0x2a
> 1b: 8a 06 mov (%rsi),%al
> 1d: 88 07 mov %al,(%rdi)
> 1f: 48 ff c7 inc %rdi
> 22: 48 ff c6 inc %rsi
> 25: 48 ff c9 dec %rcx
> 28: 75 f1 jne 0x1b
> * 2a: c3 ret <-- trapping instruction
> 2b: cc int3
> 2c: cc int3
> 2d: cc int3
> 2e: cc int3
> 2f: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
> 36: 00 00
> 38: 48 8b 06 mov (%rsi),%rax
> 3b: 48 89 07 mov %rax,(%rdi)
> 3e: 48 rex.W
> 3f: 83 .byte 0x83
>
>
> ---
> If you want syzbot to run the reproducer, reply with:
> #syz test: git://repo/address.git branch-or-commit-hash
> If you attach or paste a git patch, syzbot will apply it before testing.
>

2024-04-28 20:01:52

by Linus Torvalds

Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On Sat, 27 Apr 2024 at 16:13, Hillf Danton <[email protected]> wrote:
>
> > -> #0 (&sighand->siglock){....}-{2:2}:
> > check_prev_add kernel/locking/lockdep.c:3134 [inline]
> > check_prevs_add kernel/locking/lockdep.c:3253 [inline]
> > validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
> > __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
> > lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
> > __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
> > _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
> > force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
> > force_sig_fault_to_task kernel/signal.c:1733 [inline]
> > force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
> > __bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
> > handle_page_fault arch/x86/mm/fault.c:1505 [inline]
>
> Given page fault with runqueue locked, bpf makes trouble instead of
> helping anything in this case.

That's not the odd thing here.

Look, the callchain is:

> > exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
> > asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> > rep_movs_alternative+0x22/0x70 arch/x86/lib/copy_user_64.S:48
> > copy_user_generic arch/x86/include/asm/uaccess_64.h:110 [inline]
> > raw_copy_from_user arch/x86/include/asm/uaccess_64.h:125 [inline]
> > __copy_from_user_inatomic include/linux/uaccess.h:87 [inline]
> > copy_from_user_nofault+0xbc/0x150 mm/maccess.c:125

IOW, this is all doing a copy from user with page faults disabled, and
it shouldn't have caused a signal to be sent, so the whole
__bad_area_nosemaphore -> force_sig_fault path is bad.
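
For reference, the _nofault helpers wrap the user access roughly like
this (a simplified sketch of mm/maccess.c, not the verbatim source):

  long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
  {
          long ret = -EFAULT;

          if (access_ok(src, size)) {
                  pagefault_disable();   /* faults get fixed up, not serviced */
                  ret = __copy_from_user_inatomic(dst, src, size);
                  pagefault_enable();
          }
          return ret ? -EFAULT : 0;
  }

With pagefault_disable() in effect, a fault on the user access is
supposed to hit the exception table and make the copy return -EFAULT;
it should never get anywhere near __bad_area_nosemaphore.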

The *problem* here is that the page fault doesn't actually happen on a
user access, it happens on the *ret* instruction in
rep_movs_alternative itself (which doesn't have an exception fixup,
obviously, because no exception is supposed to happen there!):

RIP: 0010:rep_movs_alternative+0x22/0x70 arch/x86/lib/copy_user_64.S:50
Code: 90 90 90 90 90 90 90 90 f3 0f 1e fa 48 83 f9 40 73 40 83 f9 08
73 21 85 c9 74 0f 8a 06 88 07 48 ff c7 48 ff c6 48 ff c9 75 f1 <c3> cc
cc cc cc 66 0f 1f 84 00 00 00 00 00 48 8b 06 48 89 07 48 83
RSP: 0000:ffffc90004137468 EFLAGS: 00050002
RAX: ffffffff8205ce4e RBX: dffffc0000000000 RCX: 0000000000000002
RDX: 0000000000000000 RSI: 0000000000000900 RDI: ffffc900041374e8
RBP: ffff88802d039784 R08: 0000000000000005 R09: ffffffff8205ce37
R10: 0000000000000003 R11: ffff88802d038000 R12: 1ffff11005a072f0
R13: 0000000000000900 R14: 0000000000000002 R15: ffffc900041374e8

where decoding that "Code:" line gives this:

0: f3 0f 1e fa endbr64
4: 48 83 f9 40 cmp $0x40,%rcx
8: 73 40 jae 0x4a
a: 83 f9 08 cmp $0x8,%ecx
d: 73 21 jae 0x30
f: 85 c9 test %ecx,%ecx
11: 74 0f je 0x22
13: 8a 06 mov (%rsi),%al
15: 88 07 mov %al,(%rdi)
17: 48 ff c7 inc %rdi
1a: 48 ff c6 inc %rsi
1d: 48 ff c9 dec %rcx
20: 75 f1 jne 0x13
22:* c3 ret <-- trapping instruction

but I have no idea why the 'ret' instruction would take a page fault.
It really shouldn't.

Now, it's not like 'ret' instructions can't take page faults, but it
sure shouldn't happen in the *kernel*. The reasons for page faults on
'ret' instructions are:

- the instruction itself takes a page fault

- the stack pointer is bogus

- possibly because the stack *contents* are bogus (at least some x86
instructions that jump will check the destination in the jump
instruction itself, although I didn't think 'ret' was one of them)

but for the kernel, none of these actually seem to be the case
normally. And even abnormally I don't see this being an issue, since
the exception backtrace is happily shown (ie the stack looks all
good).

So this dump is just *WEIRD*.

End result: the problem is not about any kind of deadlock on circular
locking. That's just the symptom of that odd page fault that shouldn't
have happened, and that I don't quite see how it happened.

Linus

2024-04-28 20:22:56

by Linus Torvalds

Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On Sun, 28 Apr 2024 at 13:01, Linus Torvalds
<[email protected]> wrote:
>
> The *problem* here is that the page fault doesn't actually happen on a
> user access, it happens on the *ret* instruction in
> rep_movs_alternative itself (which doesn't have an exception fixup,
> obviously, because no exception is supposed to happen there!):

Actually, there's another page fault deeper in that call chain:

asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
RIP: 0010:__put_user_handle_exception+0x0/0x10 arch/x86/lib/putuser.S:125
Code: 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 01 cb 48 89 01 31
c9 0f 01 ca c3 cc cc cc cc 66 2e 0f 1f 84 00 00 00 00 00 66 90 <0f> 01
ca b9 f2 ff ff ff c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90
RSP: 0000:ffffc90004137d98 EFLAGS: 00050202
RAX: 00000000662d5943 RBX: 0000000000000000 RCX: 0000000000000019
RDX: 0000000000000000 RSI: ffffffff8bcaca20 RDI: ffffffff8c1eaba0
RBP: ffffc90004137e50 R08: ffffffff8fa7cd6f R09: 1ffffffff1f4f9ad
R10: dffffc0000000000 R11: fffffbfff1f4f9ae R12: ffffc90004137de0
R13: dffffc0000000000 R14: 1ffff92000826fb8 R15: 0000000000000019
__do_sys_gettimeofday kernel/time/time.c:147 [inline]
__se_sys_gettimeofday+0xd9/0x240 kernel/time/time.c:140

which is also nonsensical, since that "<0f> 01 ca" code is just the
"CLAC" instruction (which is the first instruction of
__put_user_handle_exception, which is the exception fixup for the
__put_user() functions).
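
Decoding the fixup from that "Code:" line (reconstructed from the dumped
bytes; roughly what arch/x86/lib/putuser.S places there):

  __put_user_handle_exception:
        0f 01 ca                clac                    <-- reported fault
        b9 f2 ff ff ff          mov    $0xfffffff2,%ecx # -EFAULT
        c3                      ret

The fixup just clears AC, loads -EFAULT and returns; nothing in it
should be able to fault.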

So that seems to be the *first* problem spot, actually. It too is
incomprehensible to me. I must be missing something. A "clac"
instruction cannot take a page fault (except for the instruction fetch
itself, of course).

So if the page fault on the 'RET' instruction was odd, the page fault
on the CLAC is *really* odd.

That original page fault looks like it's just from one of the
put_user() calls in gettimeofday():

if (put_user(ts.tv_sec, &tv->tv_sec) ||
put_user(ts.tv_nsec / 1000, &tv->tv_usec))

and yes, they can fault, but I'm not seeing how that then points to
the CLAC in the exception handler.
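
For reference, the way this is *supposed* to work is the exception
table: in arch/x86/lib/putuser.S the user store carries an extable entry
that redirects any fault on it to the common fixup, roughly (from
memory, abbreviated):

                ASM_STAC                        /* open the user access window */
        4:      movq %rax,(%rcx)                /* the store - this can fault */
                xorl %ecx,%ecx                  /* success: return 0 */
                ASM_CLAC
                RET
                ...
                _ASM_EXTABLE_UA(4b, __put_user_handle_exception)

so a fault on the store resumes at __put_user_handle_exception, which
does CLAC, loads -EFAULT and returns. The fixup itself should never be a
faulting address.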

Linus

2024-04-28 23:23:24

by Hillf Danton

[permalink] [raw]
Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On Sun, 28 Apr 2024 13:01:19 -0700 Linus Torvalds wrote:
> On Sat, 27 Apr 2024 at 16:13, Hillf Danton <[email protected]> wrote:
> >
> > > -> #0 (&sighand->siglock){....}-{2:2}:
> > > check_prev_add kernel/locking/lockdep.c:3134 [inline]
> > > check_prevs_add kernel/locking/lockdep.c:3253 [inline]
> > > validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869
> > > __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
> > > lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754
> > > __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
> > > _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
> > > force_sig_info_to_task+0x68/0x580 kernel/signal.c:1334
> > > force_sig_fault_to_task kernel/signal.c:1733 [inline]
> > > force_sig_fault+0x12c/0x1d0 kernel/signal.c:1738
> > > __bad_area_nosemaphore+0x127/0x780 arch/x86/mm/fault.c:814
> > > handle_page_fault arch/x86/mm/fault.c:1505 [inline]
> >
> > Given a page fault with the runqueue lock held, bpf makes trouble
> > instead of helping anything in this case.
>
> That's not the odd thing here.
>
> Look, the callchain is:
>
> > > exc_page_fault+0x612/0x8e0 arch/x86/mm/fault.c:1563
> > > asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
> > > rep_movs_alternative+0x22/0x70 arch/x86/lib/copy_user_64.S:48
> > > copy_user_generic arch/x86/include/asm/uaccess_64.h:110 [inline]
> > > raw_copy_from_user arch/x86/include/asm/uaccess_64.h:125 [inline]
> > > __copy_from_user_inatomic include/linux/uaccess.h:87 [inline]
> > > copy_from_user_nofault+0xbc/0x150 mm/maccess.c:125
>
> IOW, this is all doing a copy from user with page faults disabled, and
> it shouldn't have caused a signal to be sent, so the whole
> __bad_area_nosemaphore -> force_sig_fault path is bad.
>
So playing games like copying from / putting to user space with the
runqueue lock held is questionable in the first place.

Plus, as per another syzbot report [1], bpf could make trouble with the
workqueue pool lock held.

[1] https://lore.kernel.org/lkml/[email protected]/

2024-04-29 00:51:00

by Linus Torvalds

[permalink] [raw]
Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On Sun, 28 Apr 2024 at 16:23, Hillf Danton <[email protected]> wrote:
>
> So playing games like copying from / putting to user space with the
> runqueue lock held is questionable in the first place.

No, that should be perfectly fine. In fact, it's even normal. It would
happen any time you have any kind of tracing thing, where looking up
the user mode frame involves doing user accesses with page faults
disabled.
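
The pattern is basically just this (a minimal sketch - the helper name
is made up, but pagefault_disable() and __copy_from_user_inatomic() are
the real interfaces, and this is roughly what copy_from_user_nofault()
does internally):

        /* Read one word of user memory without ever completing a fault:
         * if the page isn't there, the copy returns nonzero instead of
         * sleeping or raising a signal. */
        static long read_user_word_nofault(const void __user *uaddr,
                                           unsigned long *dst)
        {
                long ret;

                pagefault_disable();
                ret = __copy_from_user_inatomic(dst, uaddr, sizeof(*dst));
                pagefault_enable();

                return ret ? -EFAULT : 0;
        }

The fault handler sees that page faults are disabled, does the fixup,
and the caller just gets -EFAULT back. No signal should ever be
involved.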

The runqueue lock is irrelevant. As mentioned, it's only a symptom of
something else going wrong.

Now, judging by the syz reproducer, the trigger for this all is almost
certainly that

  bpf$BPF_RAW_TRACEPOINT_OPEN(0x11,
      &(0x7f00000000c0)={&(0x7f0000000080)='sched_switch\x00', r0}, 0x10)

and that probably causes the instability. But the immediate problem is
not the user space access, it's that something goes horribly wrong
*around* it.
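
(For reference, a sketch of the kind of program that line attaches -
hypothetical, written against the usual libbpf SEC()/bpf_helpers.h
conventions, with a made-up user address; it is not the actual
reproducer program:

        SEC("raw_tracepoint/sched_switch")      /* runs inside the scheduler */
        int on_sched_switch(struct bpf_raw_tracepoint_args *ctx)
        {
                char buf[16];

                /* nofault user-memory read from the context switch path */
                bpf_probe_read_user_str(buf, sizeof(buf),
                                        (void *)0x7f0000000080);
                return 0;
        }

so every context switch now does a user access with page faults
disabled.)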

> Plus, as per another syzbot report [1], bpf could make trouble with the
> workqueue pool lock held.

That seems to be entirely different. There's no unexplained page fault
in that case; that seems to be purely a "take lock in the wrong order"
issue.

Linus

2024-04-29 01:00:58

by Tetsuo Handa

[permalink] [raw]
Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On 2024/04/29 9:50, Linus Torvalds wrote:
> On Sun, 28 Apr 2024 at 16:23, Hillf Danton <[email protected]> wrote:
>>
>> So playing games like copying from / putting to user space with the
>> runqueue lock held is questionable in the first place.
>
> No, that should be perfectly fine. In fact, it's even normal. It would
> happen any time you have any kind of tracing thing, where looking up
> the user mode frame involves doing user accesses with page faults
> disabled.
>
> The runqueue lock is irrelevant. As mentioned, it's only a symptom of
> something else going wrong.
>
> Now, judging by the syz reproducer, the trigger for this all is almost
> certainly that
>
> bpf$BPF_RAW_TRACEPOINT_OPEN(0x11,
> &(0x7f00000000c0)={&(0x7f0000000080)='sched_switch\x00', r0}, 0x10)
>
> and that probably causes the instability. But the immediate problem is
> not the user space access, it's that something goes horribly wrong
> *around* it.

I can't recall the title of the commit, but I feel that things went very
wrong after a commit was introduced that allows running a tracing function
upon lock contention (i.e. running code when e.g. a spinlock could not be
taken). That commit forms random locking dependencies, resulting in a
flood of lockdep warnings.

>
>> Plus, as per another syzbot report [1], bpf could make trouble with the
>> workqueue pool lock held.
>
> That seems to be entirely different. There's no unexplained page fault
> in that case; that seems to be purely a "take lock in the wrong order"
> issue.
>
> Linus


2024-04-29 01:34:13

by Linus Torvalds

[permalink] [raw]
Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On Sun, 28 Apr 2024 at 17:50, Linus Torvalds
<[email protected]> wrote:
>
> But the immediate problem is
> not the user space access, it's that something goes horribly wrong
> *around* it.

Side note: that stack trace from hell actually has three nested page
faults, and I think that's actually the important thing here:

- the first page fault is from user space, and triggers the vsyscall emulation.

- the second page fault is from __do_sys_gettimeofday, and that
should just have caused the exception that then sets the return value
to -EFAULT

- the third nested page fault is due to _raw_spin_unlock_irqrestore
-> preempt_schedule -> trace_sched_switch, which then causes that bpf
trace program to run, which does that bpf_probe_read_compat, which
causes that page fault under pagefault_disable().

It's quite the nasty backtrace, and there's a lot going on.

And I think I finally see what may be going on. The problem is
literally the vsyscall emulation, which sets

        current->thread.sig_on_uaccess_err = 1;

and that causes the fixup_exception() code to send the signal
*despite* the exception being caught.

And I think that is in fact completely bogus. It's completely bogus
exactly because it sends that signal even when it *shouldn't* be sent
- like for the bpf user mode trace gathering.

In other words, I think the whole "sig_on_uaccess_err" thing is
entirely broken, because it makes any nested page-faults do all the
wrong things.

Now, arguably, I don't think anybody should enable vsyscall emulation
any more, but this test case clearly does.

I think we should just make the "send SIGSEGV" be something that the
vsyscall emulation does on its own, not this broken per-thread state
for something that isn't actually per thread.

The x86 page fault code actually tried to deal with the "incorrect
nesting" by having that

        if (in_interrupt())
                return;

which ignores the sig_on_uaccess_err case when it happens in
interrupts, but as shown by this example, these nested page faults do
not need to be about interrupts at all.

IOW, I think the only right thing is to remove that horrendously broken code.

The attached patch is ENTIRELY UNTESTED, but looks like the
ObviouslyCorrect(tm) thing to do.

NOTE! This broken code goes back to commit 4fc3490114bb ("x86-64: Set
siginfo and context on vsyscall emulation faults") in 2011, and back
then the reason was to get all the siginfo details right. Honestly, I
do not for a moment believe that it's worth getting the siginfo
details right here, but part of the commit says

This fixes issues with UML when vsyscall=emulate.

and so my patch to remove this garbage will probably break UML in this
situation.

I cannot find it in myself to care, since I do not believe that
anybody should be running with vsyscall=emulate in 2024 in the first
place, much less if you are doing things like UML. But let's see if
somebody screams.

Also, somebody should obviously test my COMPLETELY UNTESTED patch.

Did I make it clear enough that this is UNTESTED and just does
crapectomy on something that is clearly broken?

Linus "UNTESTED" Torvalds


Attachments:
patch.diff (3.91 kB)

2024-04-29 08:01:05

by Ingo Molnar

[permalink] [raw]
Subject: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code


* Linus Torvalds <[email protected]> wrote:

> The attached patch is ENTIRELY UNTESTED, but looks like the
> ObviouslyCorrect(tm) thing to do.
>
> NOTE! This broken code goes back to commit 4fc3490114bb ("x86-64: Set
> siginfo and context on vsyscall emulation faults") in 2011, and back
> then the reason was to get all the siginfo details right. Honestly, I
> do not for a moment believe that it's worth getting the siginfo
> details right here, but part of the commit says
>
> This fixes issues with UML when vsyscall=emulate.
>
> and so my patch to remove this garbage will probably break UML in this
> situation.
>
> I cannot find it in myself to care, since I do not believe that
> anybody should be running with vsyscall=emulate in 2024 in the first
> place, much less if you are doing things like UML. But let's see if
> somebody screams.
>
> Also, somebody should obviously test my COMPLETELY UNTESTED patch.
>
> Did I make it clear enough that this is UNTESTED and just does
> crapectomy on something that is clearly broken?
>
> Linus "UNTESTED" Torvalds

I did some Simple Testing™, and nothing seemed to break in any way visible
to me, and the diffstat is lovely:

3 files changed, 3 insertions(+), 56 deletions(-)

Might stick this into tip:x86/mm and see what happens?

I'd love to remove the rest of the vsyscall emulation code as well. I don't
think anyone cares about vsyscall emulation anymore (let alone in a UML
context); IIRC it requires an ancient glibc that I don't think we even
support anymore (but I'm unsure about the exact version cutoff).

I created a changelog from your email, editing parts of it, and added your
Not-Yet-Signed-off-by tag.

Thanks,

Ingo

===================================>
From: Linus Torvalds <[email protected]>
Date: Sun, 28 Apr 2024 18:33:41 -0700
Subject: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

The syzbot-reported stack trace from hell in this discussion thread
actually has three nested page faults:

https://lore.kernel.org/r/[email protected]

.. and I think that's actually the important thing here:

- the first page fault is from user space, and triggers the vsyscall
emulation.

- the second page fault is from __do_sys_gettimeofday(), and that should
just have caused the exception that then sets the return value to
-EFAULT

- the third nested page fault is due to _raw_spin_unlock_irqrestore() ->
preempt_schedule() -> trace_sched_switch(), which then causes a BPF
trace program to run, which does that bpf_probe_read_compat(), which
causes that page fault under pagefault_disable().

It's quite the nasty backtrace, and there's a lot going on.

The problem is literally the vsyscall emulation, which sets

        current->thread.sig_on_uaccess_err = 1;

and that causes the fixup_exception() code to send the signal *despite* the
exception being caught.

And I think that is in fact completely bogus. It's completely bogus
exactly because it sends that signal even when it *shouldn't* be sent -
like for the BPF user mode trace gathering.

In other words, I think the whole "sig_on_uaccess_err" thing is entirely
broken, because it makes any nested page-faults do all the wrong things.

Now, arguably, I don't think anybody should enable vsyscall emulation any
more, but this test case clearly does.

I think we should just make the "send SIGSEGV" be something that the
vsyscall emulation does on its own, not this broken per-thread state for
something that isn't actually per thread.

The x86 page fault code actually tried to deal with the "incorrect nesting"
by having that:

        if (in_interrupt())
                return;

which ignores the sig_on_uaccess_err case when it happens in interrupts,
but as shown by this example, these nested page faults do not need to be
about interrupts at all.

IOW, I think the only right thing is to remove that horrendously broken
code.

The attached patch looks like the ObviouslyCorrect(tm) thing to do.

NOTE! This broken code goes back to this commit in 2011:

4fc3490114bb ("x86-64: Set siginfo and context on vsyscall emulation faults")

.. and back then the reason was to get all the siginfo details right.
Honestly, I do not for a moment believe that it's worth getting the siginfo
details right here, but part of the commit says:

This fixes issues with UML when vsyscall=emulate.

.. and so my patch to remove this garbage will probably break UML in this
situation.

I do not believe that anybody should be running with vsyscall=emulate in
2024 in the first place, much less if you are doing things like UML. But
let's see if somebody screams.

Not-Yet-Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/CAHk-=wh9D6f7HUkDgZHKmDCHUQmp+Co89GP+b8+z+G56BKeyNg@mail.gmail.com
---
arch/x86/entry/vsyscall/vsyscall_64.c | 25 ++-----------------------
arch/x86/include/asm/processor.h | 1 -
arch/x86/mm/fault.c | 33 +--------------------------------
3 files changed, 3 insertions(+), 56 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index a3c0df11d0e6..3b0f61b2ea6d 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)

static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
- /*
- * XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_err, this could go away.
- */
-
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = &current->thread;

@@ -123,7 +118,6 @@ bool emulate_vsyscall(unsigned long error_code,
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;

@@ -234,12 +228,8 @@ bool emulate_vsyscall(unsigned long error_code,
goto do_ret; /* skip requested */

/*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
+ * With a real vsyscall, page faults cause SIGSEGV.
*/
- prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
- current->thread.sig_on_uaccess_err = 1;
-
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
@@ -262,23 +252,12 @@ bool emulate_vsyscall(unsigned long error_code,
break;
}

- current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
-
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
-
- /*
- * If we failed to generate a signal for any reason,
- * generate one here. (This should be impossible.)
- */
- if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
- !sigismember(&tsk->pending.signal, SIGSEGV)))
- goto sigsegv;
-
- return true; /* Don't emulate the ret. */
+ goto sigsegv;
}

regs->ax = ret;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 811548f131f4..78e51b0d6433 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -472,7 +472,6 @@ struct thread_struct {
unsigned long iopl_emul;

unsigned int iopl_warn:1;
- unsigned int sig_on_uaccess_err:1;

/*
* Protection Keys Register for Userspace. Loaded immediately on
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6b2ca8ba75b8..f26ecabc9424 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -724,39 +724,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
WARN_ON_ONCE(user_mode(regs));

/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- if (si_code == SEGV_PKUERR) {
- force_sig_pkuerr((void __user *)address, pkey);
- } else {
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
return;
- }

/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH

2024-04-29 10:49:17

by Hillf Danton

[permalink] [raw]
Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On Sun, 28 Apr 2024 18:33:41 -0700 Linus Torvalds wrote:
> I cannot find it in myself to care, since I do not believe that
> anybody should be running with vsyscall=emulate in 2024 in the first
> place, much less if you are doing things like UML. But let's see if
> somebody screams.
>
> Also, somebody should obviously test my COMPLETELY UNTESTED patch.

#syz test https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 5eb4573ea63d

arch/x86/entry/vsyscall/vsyscall_64.c | 25 ++-----------------------
arch/x86/include/asm/processor.h | 1 -
arch/x86/mm/fault.c | 33 +--------------------------------
3 files changed, 3 insertions(+), 56 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index a3c0df11d0e6..3b0f61b2ea6d 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)

static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
- /*
- * XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_err, this could go away.
- */
-
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = &current->thread;

@@ -123,7 +118,6 @@ bool emulate_vsyscall(unsigned long error_code,
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;

@@ -234,12 +228,8 @@ bool emulate_vsyscall(unsigned long error_code,
goto do_ret; /* skip requested */

/*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
+ * With a real vsyscall, page faults cause SIGSEGV.
*/
- prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
- current->thread.sig_on_uaccess_err = 1;
-
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
@@ -262,23 +252,12 @@ bool emulate_vsyscall(unsigned long error_code,
break;
}

- current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
-
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
-
- /*
- * If we failed to generate a signal for any reason,
- * generate one here. (This should be impossible.)
- */
- if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
- !sigismember(&tsk->pending.signal, SIGSEGV)))
- goto sigsegv;
-
- return true; /* Don't emulate the ret. */
+ goto sigsegv;
}

regs->ax = ret;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 811548f131f4..78e51b0d6433 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -472,7 +472,6 @@ struct thread_struct {
unsigned long iopl_emul;

unsigned int iopl_warn:1;
- unsigned int sig_on_uaccess_err:1;

/*
* Protection Keys Register for Userspace. Loaded immediately on
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 622d12ec7f08..bba4e020dd64 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -723,39 +723,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
WARN_ON_ONCE(user_mode(regs));

/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- if (si_code == SEGV_PKUERR) {
- force_sig_pkuerr((void __user *)address, pkey);
- } else {
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
return;
- }

/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH
--

2024-04-29 11:36:11

by syzbot

[permalink] [raw]
Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

Hello,

syzbot has tested the proposed patch and the reproducer did not trigger any issue:

Reported-and-tested-by: [email protected]

Tested on:

commit: 5eb4573e Merge tag 'soc-fixes-6.9-2' of git://git.kern..
git tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
console output: https://syzkaller.appspot.com/x/log.txt?x=14b4957f180000
kernel config: https://syzkaller.appspot.com/x/.config?x=3d46aa9d7a44f40d
dashboard link: https://syzkaller.appspot.com/bug?extid=83e7f982ca045ab4405c
compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40
patch: https://syzkaller.appspot.com/x/patch.diff?x=13471b80980000

Note: testing is done by a robot and is best-effort only.

2024-04-29 13:54:00

by Jiri Olsa

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

On Mon, Apr 29, 2024 at 10:00:51AM +0200, Ingo Molnar wrote:

SNIP

> The attached patch looks like the ObviouslyCorrect(tm) thing to do.
>
> NOTE! This broken code goes back to this commit in 2011:
>
> 4fc3490114bb ("x86-64: Set siginfo and context on vsyscall emulation faults")
>
> ... and back then the reason was to get all the siginfo details right.
> Honestly, I do not for a moment believe that it's worth getting the siginfo
> details right here, but part of the commit says:
>
> This fixes issues with UML when vsyscall=emulate.
>
> ... and so my patch to remove this garbage will probably break UML in this
> situation.
>
> I do not believe that anybody should be running with vsyscall=emulate in
> 2024 in the first place, much less if you are doing things like UML. But
> let's see if somebody screams.
>
> Not-Yet-Signed-off-by: Linus Torvalds <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
> Link: https://lore.kernel.org/r/CAHk-=wh9D6f7HUkDgZHKmDCHUQmp+Co89GP+b8+z+G56BKeyNg@mail.gmail.com

fwiw I can no longer trigger the invalid wait context bug
with this change

Tested-by: Jiri Olsa <[email protected]>

jirka

> ---
> arch/x86/entry/vsyscall/vsyscall_64.c | 25 ++-----------------------
> arch/x86/include/asm/processor.h | 1 -
> arch/x86/mm/fault.c | 33 +--------------------------------
> 3 files changed, 3 insertions(+), 56 deletions(-)
>
> diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
> index a3c0df11d0e6..3b0f61b2ea6d 100644
> --- a/arch/x86/entry/vsyscall/vsyscall_64.c
> +++ b/arch/x86/entry/vsyscall/vsyscall_64.c
> @@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
>
> static bool write_ok_or_segv(unsigned long ptr, size_t size)
> {
> - /*
> - * XXX: if access_ok, get_user, and put_user handled
> - * sig_on_uaccess_err, this could go away.
> - */
> -
> if (!access_ok((void __user *)ptr, size)) {
> struct thread_struct *thread = &current->thread;
>
> @@ -123,7 +118,6 @@ bool emulate_vsyscall(unsigned long error_code,
> struct task_struct *tsk;
> unsigned long caller;
> int vsyscall_nr, syscall_nr, tmp;
> - int prev_sig_on_uaccess_err;
> long ret;
> unsigned long orig_dx;
>
> @@ -234,12 +228,8 @@ bool emulate_vsyscall(unsigned long error_code,
> goto do_ret; /* skip requested */
>
> /*
> - * With a real vsyscall, page faults cause SIGSEGV. We want to
> - * preserve that behavior to make writing exploits harder.
> + * With a real vsyscall, page faults cause SIGSEGV.
> */
> - prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
> - current->thread.sig_on_uaccess_err = 1;
> -
> ret = -EFAULT;
> switch (vsyscall_nr) {
> case 0:
> @@ -262,23 +252,12 @@ bool emulate_vsyscall(unsigned long error_code,
> break;
> }
>
> - current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
> -
> check_fault:
> if (ret == -EFAULT) {
> /* Bad news -- userspace fed a bad pointer to a vsyscall. */
> warn_bad_vsyscall(KERN_INFO, regs,
> "vsyscall fault (exploit attempt?)");
> -
> - /*
> - * If we failed to generate a signal for any reason,
> - * generate one here. (This should be impossible.)
> - */
> - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
> - !sigismember(&tsk->pending.signal, SIGSEGV)))
> - goto sigsegv;
> -
> - return true; /* Don't emulate the ret. */
> + goto sigsegv;
> }
>
> regs->ax = ret;
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 811548f131f4..78e51b0d6433 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -472,7 +472,6 @@ struct thread_struct {
> unsigned long iopl_emul;
>
> unsigned int iopl_warn:1;
> - unsigned int sig_on_uaccess_err:1;
>
> /*
> * Protection Keys Register for Userspace. Loaded immediately on
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 6b2ca8ba75b8..f26ecabc9424 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -724,39 +724,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
> WARN_ON_ONCE(user_mode(regs));
>
> /* Are we prepared to handle this kernel fault? */
> - if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
> - /*
> - * Any interrupt that takes a fault gets the fixup. This makes
> - * the below recursive fault logic only apply to a faults from
> - * task context.
> - */
> - if (in_interrupt())
> - return;
> -
> - /*
> - * Per the above we're !in_interrupt(), aka. task context.
> - *
> - * In this case we need to make sure we're not recursively
> - * faulting through the emulate_vsyscall() logic.
> - */
> - if (current->thread.sig_on_uaccess_err && signal) {
> - sanitize_error_code(address, &error_code);
> -
> - set_signal_archinfo(address, error_code);
> -
> - if (si_code == SEGV_PKUERR) {
> - force_sig_pkuerr((void __user *)address, pkey);
> - } else {
> - /* XXX: hwpoison faults will set the wrong code. */
> - force_sig_fault(signal, si_code, (void __user *)address);
> - }
> - }
> -
> - /*
> - * Barring that, we can do the fixup and be happy.
> - */
> + if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
> return;
> - }
>
> /*
> * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
>

2024-04-29 14:18:23

by Tetsuo Handa

[permalink] [raw]
Subject: Re: [syzbot] [bpf?] [trace?] possible deadlock in force_sig_info_to_task

On 2024/04/29 9:50, Linus Torvalds wrote:
> On Sun, 28 Apr 2024 at 16:23, Hillf Danton <[email protected]> wrote:
>>
>> So playing games like copying from / putting to user space with the
>> runqueue lock held is questionable in the first place.
>
> The runqueue lock is irrelevant. As mentioned, it's only a symptom of
> something else going wrong.
>
>> Plus, as per another syzbot report [1], bpf could make trouble with the
>> workqueue pool lock held.
>
> That seems to be entirely different. There's no unexplained page fault
> in that case; that seems to be purely a "take lock in the wrong order"
> issue.

Another example is at https://lkml.kernel.org/r/[email protected] .
Since many callers might hold the runqueue lock while holding some other
locks, allowing BPF to run code which can take one of those locks while the
runqueue lock is held is asking for trouble. BPF programs are unexpected
lock grabbers from the point of view of built-in code. I think BPF should
not run code which might take such a lock while an atomic lock is already
held.
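
For example (a hypothetical program, just to illustrate the nesting;
written in the usual vmlinux.h/bpf_helpers.h style):

        struct {
                __uint(type, BPF_MAP_TYPE_HASH);
                __uint(max_entries, 1);
                __type(key, __u32);
                __type(value, __u64);
        } m SEC(".maps");

        SEC("raw_tracepoint/sched_switch")      /* called with rq->__lock held */
        int prog(void *ctx)
        {
                __u32 key = 0;
                __u64 val = 1;

                /* the hash map update takes a bucket lock, silently
                 * nesting it under whatever lock the tracepoint caller
                 * already holds */
                bpf_map_update_elem(&m, &key, &val, BPF_ANY);
                return 0;
        }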


2024-04-29 15:51:40

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

On Mon, 29 Apr 2024 at 01:00, Ingo Molnar <[email protected]> wrote:
>
> I did some Simple Testing™, and nothing seemed to break in any way visible
> to me, and the diffstat is lovely:
>
> 3 files changed, 3 insertions(+), 56 deletions(-)
>
> Might stick this into tip:x86/mm and see what happens?

Well, Hillf had it go through the syzbot testing, and Jiri seems to
have tested it on his setup too, so it looks like it's all good, and
you can change the "Not-Yet-Signed-off-by" to be a proper sign-off
from me.

It would be good to have some UML testing done, but at the same time I
do think that anybody running UML on modern kernels should be running
a modern user-mode setup too, so while the exact SIGSEGV details may
have been an issue in 2011, I don't think it's reasonable to think
that it's an issue in 2024.

Linus

2024-04-29 18:55:41

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

On Mon, 29 Apr 2024 at 08:51, Linus Torvalds
<[email protected]> wrote:
>
> Well, Hillf had it go through the syzbot testing, and Jiri seems to
> have tested it on his setup too, so it looks like it's all good, and
> you can change the "Not-Yet-Signed-off-by" to be a proper sign-off
> from me.

Side note: having looked more at this, I suspect we have room for
further cleanups in this area.

In particular, I think the page fault emulation code should be moved
from do_user_addr_fault() to do_kern_addr_fault(), and the horrible
hack that is fault_in_kernel_space() should be removed (it is what now
makes a vsyscall page fault be treated as a user address, and the only
_reason_ for that is that we do the vsyscall handling in the wrong
place).

I also think that the vsyscall emulation code should just be cleaned
up - instead of looking up the system call number and then calling the
__x64_xyz() system call stub, I think we should just write out the
code in-place. That would get the SIGSEGV cases right too, and I think
it would actually clean up the code. We already do almost everything
but the (trivial) low-level ops anyway.
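
For the gettimeofday case that would look something like this (a sketch
of the direction only, not a tested patch - it also ignores the
timezone argument that the real vsyscall has to fill in):

        case 0: {
                struct __kernel_old_timeval __user *utv =
                        (void __user *)regs->di;
                struct timespec64 ts;

                ktime_get_real_ts64(&ts);
                /* an -EFAULT is now generated right here, not via
                 * per-thread fault-handler state */
                if (put_user(ts.tv_sec, &utv->tv_sec) ||
                    put_user(ts.tv_nsec / 1000, &utv->tv_usec))
                        ret = -EFAULT;
                else
                        ret = 0;
                break;
        }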

But I think my patch to remove the 'sig_on_uaccess_err' should just go
in first, since it fixes a real and present issue. And then if
somebody has the energy - or if it turns out that we actually need to
get the SIGSEGV siginfo details right - we can do the other cleanups.
They are mostly unrelated, but the current sig_on_uaccess_err code
just makes everything more complicated and needs to go.

Linus

2024-04-29 19:09:15

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

On Mon, 29 Apr 2024 at 11:47, Linus Torvalds
<[email protected]> wrote:
>
> In particular, I think the page fault emulation code should be moved
> from do_user_addr_fault() to do_kern_addr_fault(), and the horrible
> hack that is fault_in_kernel_space() should be removed (it is what now
> makes a vsyscall page fault be treated as a user address, and the only
> _reason_ for that is that we do the vsyscall handling in the wrong
> place).

Final note: we should also remove the XONLY option entirely, and
remove all the strange page table handling we currently do for it.

It won't work anyway on future CPUs with LASS, and we *have* to
emulate things (and not in the page fault path, I think LASS will
cause a GP fault).

I think the LASS patches ended up just disabling LASS if people wanted
vsyscall, which is probably the worst case.

Again, this is more of a "I think we have more work to do", and should
all happen after that sig_on_uaccess_err stuff is gone.

I guess that patch to rip out sig_on_uaccess_err needs to go into 6.9
and even be marked for stable, since it most definitely breaks some
stuff currently. Even if that "some stuff" is pretty esoteric (ie
"vsyscall=emulate" together with tracing).

Linus

2024-04-29 23:30:17

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

On Mon, Apr 29, 2024 at 12:07 PM Linus Torvalds
<[email protected]> wrote:
>
> On Mon, 29 Apr 2024 at 11:47, Linus Torvalds
> <[email protected]> wrote:
> >
> > In particular, I think the page fault emulation code should be moved
> > from do_user_addr_fault() to do_kern_addr_fault(), and the horrible
> > hack that is fault_in_kernel_space() should be removed (it is what now
> > makes a vsyscall page fault be treated as a user address, and the only
> > _reason_ for that is that we do the vsyscall handling in the wrong
> > place).
>
> Final note: we should also remove the XONLY option entirely, and
> remove all the strange page table handling we currently do for it.
>
> It won't work anyway on future CPUs with LASS, and we *have* to
> emulate things (and not in the page fault path, I think LASS will
> cause a GP fault).

What strange page table handling do we do for XONLY?

EMULATE actually involves page tables. XONLY is just in the "gate
area" (which is more or less just a procfs thing) and the page fault
code.

So I think we should remove EMULATE before removing XONLY. We already
tried pretty hard to get everyone to stop using EMULATE.

2024-04-29 23:31:10

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

On Mon, Apr 29, 2024 at 6:51 AM Jiri Olsa <[email protected]> wrote:
>
> On Mon, Apr 29, 2024 at 10:00:51AM +0200, Ingo Molnar wrote:
>
> SNIP
>
> > The attached patch looks like the ObviouslyCorrect(tm) thing to do.
> >
> > NOTE! This broken code goes back to this commit in 2011:
> >
> > 4fc3490114bb ("x86-64: Set siginfo and context on vsyscall emulation faults")
> >
> > ... and back then the reason was to get all the siginfo details right.
> > Honestly, I do not for a moment believe that it's worth getting the siginfo
> > details right here, but part of the commit says:
> >
> > This fixes issues with UML when vsyscall=emulate.
> >
> > ... and so my patch to remove this garbage will probably break UML in this
> > situation.
> >
> > I do not believe that anybody should be running with vsyscall=emulate in
> > 2024 in the first place, much less if you are doing things like UML. But
> > let's see if somebody screams.
> >
> > Not-Yet-Signed-off-by: Linus Torvalds <[email protected]>
> > Signed-off-by: Ingo Molnar <[email protected]>
> > Link: https://lore.kernel.org/r/CAHk-=wh9D6f7HUkDgZHKmDCHUQmp+Co89GP+b8+z+G56BKeyNg@mail.gmail.com
>
> fwiw I can no longer trigger the invalid wait context bug
> with this change
>
> Tested-by: Jiri Olsa <[email protected]>

Acked-by: Andy Lutomirski <[email protected]>

2024-04-30 00:06:09

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

On Mon, 29 Apr 2024 at 16:30, Andy Lutomirski <[email protected]> wrote:
>
> What strange page table handling do we do for XONLY?

Ahh, I misread set_vsyscall_pgtable_user_bits(). It's used for EMULATE
not for XONLY.

And the code in pti_setup_vsyscall() is just wrong, and does it for all cases.

> So I think we should remove EMULATE before removing XONLY.

Ok, looking at that again, I don't disagree. I misread that XONLY as
mapping it executable, but it is actually just mapping it readable.

Yes, let's remove EMULATE, and keep XONLY.

Linus

2024-04-30 06:10:31

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code


* Linus Torvalds <[email protected]> wrote:

> I guess that patch to rip out sig_on_uaccess_err needs to go into 6.9 and
> even be marked for stable, since it most definitely breaks some stuff
> currently. Even if that "some stuff" is pretty esoteric (ie
> "vsyscall=emulate" together with tracing).

Yeah - I just put it into tip:x86/urgent as-is, with the various Tested-by
and Acked-by tags added, and we'll send it to you later this week if all
goes well.

Thanks,

Ingo

Subject: [tip: x86/urgent] x86/mm: Remove broken vsyscall emulation code from the page fault code

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: c9e1dc9825319392b44d3c22493dc543075933b9
Gitweb: https://git.kernel.org/tip/c9e1dc9825319392b44d3c22493dc543075933b9
Author: Linus Torvalds <[email protected]>
AuthorDate: Mon, 29 Apr 2024 10:00:51 +02:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Tue, 30 Apr 2024 08:08:30 +02:00

x86/mm: Remove broken vsyscall emulation code from the page fault code

The syzbot-reported stack trace from hell in this discussion thread
actually has three nested page faults:

https://lore.kernel.org/r/[email protected]

.. and I think that's actually the important thing here:

- the first page fault is from user space, and triggers the vsyscall
emulation.

- the second page fault is from __do_sys_gettimeofday(), and that should
just have caused the exception that then sets the return value to
-EFAULT

- the third nested page fault is due to _raw_spin_unlock_irqrestore() ->
preempt_schedule() -> trace_sched_switch(), which then causes a BPF
trace program to run, which does that bpf_probe_read_compat(), which
causes that page fault under pagefault_disable().

It's quite the nasty backtrace, and there's a lot going on.

The problem is literally the vsyscall emulation, which sets

        current->thread.sig_on_uaccess_err = 1;

and that causes the fixup_exception() code to send the signal *despite* the
exception being caught.

And I think that is in fact completely bogus. It's completely bogus
exactly because it sends that signal even when it *shouldn't* be sent -
like for the BPF user mode trace gathering.

In other words, I think the whole "sig_on_uaccess_err" thing is entirely
broken, because it makes any nested page-faults do all the wrong things.

Now, arguably, I don't think anybody should enable vsyscall emulation any
more, but this test case clearly does.

I think we should just make the "send SIGSEGV" be something that the
vsyscall emulation does on its own, not this broken per-thread state for
something that isn't actually per thread.

The x86 page fault code actually tried to deal with the "incorrect nesting"
by having that:

        if (in_interrupt())
                return;

which ignores the sig_on_uaccess_err case when it happens in interrupts,
but as shown by this example, these nested page faults do not need to be
about interrupts at all.

IOW, I think the only right thing is to remove that horrendously broken
code.

The attached patch looks like the ObviouslyCorrect(tm) thing to do.

NOTE! This broken code goes back to this commit in 2011:

4fc3490114bb ("x86-64: Set siginfo and context on vsyscall emulation faults")

.. and back then the reason was to get all the siginfo details right.
Honestly, I do not for a moment believe that it's worth getting the siginfo
details right here, but part of the commit says:

This fixes issues with UML when vsyscall=emulate.

.. and so my patch to remove this garbage will probably break UML in this
situation.

I do not believe that anybody should be running with vsyscall=emulate in
2024 in the first place, much less if you are doing things like UML. But
let's see if somebody screams.

Reported-and-tested-by: [email protected]
Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Tested-by: Jiri Olsa <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/CAHk-=wh9D6f7HUkDgZHKmDCHUQmp+Co89GP+b8+z+G56BKeyNg@mail.gmail.com
---
arch/x86/entry/vsyscall/vsyscall_64.c | 25 +-------------------
arch/x86/include/asm/processor.h | 1 -
arch/x86/mm/fault.c | 33 +--------------------------
3 files changed, 3 insertions(+), 56 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index a3c0df1..3b0f61b 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)

static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
- /*
- * XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_err, this could go away.
- */
-
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = &current->thread;

@@ -123,7 +118,6 @@ bool emulate_vsyscall(unsigned long error_code,
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;

@@ -234,12 +228,8 @@ bool emulate_vsyscall(unsigned long error_code,
goto do_ret; /* skip requested */

/*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
+ * With a real vsyscall, page faults cause SIGSEGV.
*/
- prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
- current->thread.sig_on_uaccess_err = 1;
-
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
@@ -262,23 +252,12 @@ bool emulate_vsyscall(unsigned long error_code,
break;
}

- current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
-
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
-
- /*
- * If we failed to generate a signal for any reason,
- * generate one here. (This should be impossible.)
- */
- if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
- !sigismember(&tsk->pending.signal, SIGSEGV)))
- goto sigsegv;
-
- return true; /* Don't emulate the ret. */
+ goto sigsegv;
}

regs->ax = ret;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 811548f..78e51b0 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -472,7 +472,6 @@ struct thread_struct {
unsigned long iopl_emul;

unsigned int iopl_warn:1;
- unsigned int sig_on_uaccess_err:1;

/*
* Protection Keys Register for Userspace. Loaded immediately on
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 622d12e..bba4e02 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -723,39 +723,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
WARN_ON_ONCE(user_mode(regs));

/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- if (si_code == SEGV_PKUERR) {
- force_sig_pkuerr((void __user *)address, pkey);
- } else {
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
return;
- }

/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH

2024-04-30 14:54:25

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code

Hi Ingo,

kernel test robot noticed the following build warnings:

[auto build test WARNING on linux/master]
[also build test WARNING on tip/x86/mm linus/master v6.9-rc6 next-20240430]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Ingo-Molnar/x86-mm-Remove-broken-vsyscall-emulation-code-from-the-page-fault-code/20240430-135258
base: linux/master
patch link: https://lore.kernel.org/r/Zi9Ts1HcqiKzy9GX%40gmail.com
patch subject: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20240430/[email protected]/config)
compiler: gcc-13 (Ubuntu 13.2.0-4ubuntu3) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240430/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All warnings (new ones prefixed by >>):

arch/x86/entry/vsyscall/vsyscall_64.c: In function 'emulate_vsyscall':
>> arch/x86/entry/vsyscall/vsyscall_64.c:118:29: warning: variable 'tsk' set but not used [-Wunused-but-set-variable]
118 | struct task_struct *tsk;
| ^~~
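
That warning is expected with the patch applied: 'tsk' is now only
assigned, never read. The trivial follow-up (a sketch, hunk offsets
omitted) would be to delete the declaration and its assignment:

        --- a/arch/x86/entry/vsyscall/vsyscall_64.c
        +++ b/arch/x86/entry/vsyscall/vsyscall_64.c
        @@ bool emulate_vsyscall(unsigned long error_code,
        -       struct task_struct *tsk;
                unsigned long caller;
        @@
        -       tsk = current;
        -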


vim +/tsk +118 arch/x86/entry/vsyscall/vsyscall_64.c

4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 114
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 115 bool emulate_vsyscall(unsigned long error_code,
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 116 struct pt_regs *regs, unsigned long address)
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 117 {
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 @118 struct task_struct *tsk;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 119 unsigned long caller;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 120 int vsyscall_nr, syscall_nr, tmp;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 121 long ret;
fa697140f9a201 arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-05 122 unsigned long orig_dx;
7460ed2844ffad arch/x86_64/kernel/vsyscall.c John Stultz 2007-02-16 123
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 124 /* Write faults or kernel-privilege faults never get fixed up. */
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 125 if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 126 return false;
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 127
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 128 if (!(error_code & X86_PF_INSTR)) {
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 129 /* Failed vsyscall read */
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 130 if (vsyscall_mode == EMULATE)
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 131 return false;
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 132
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 133 /*
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 134 * User code tried and failed to read the vsyscall page.
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 135 */
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 136 warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 137 return false;
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 138 }
918ce325098a4e arch/x86/entry/vsyscall/vsyscall_64.c Andy Lutomirski 2019-06-26 139
c9712944b2a123 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-07-13 140 /*
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 141 * No point in checking CS -- the only way to get here is a user mode
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 142 * trap to a high address, which means that we're in 64-bit user code.
c9712944b2a123 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-07-13 143 */
7460ed2844ffad arch/x86_64/kernel/vsyscall.c John Stultz 2007-02-16 144
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 145 WARN_ON_ONCE(address != regs->ip);
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 146
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 147 if (vsyscall_mode == NONE) {
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 148 warn_bad_vsyscall(KERN_INFO, regs,
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 149 "vsyscall attempted with vsyscall=none");
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 150 return false;
c9712944b2a123 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-07-13 151 }
7460ed2844ffad arch/x86_64/kernel/vsyscall.c John Stultz 2007-02-16 152
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 153 vsyscall_nr = addr_to_vsyscall_nr(address);
c149a665ac488e arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-03 154
c149a665ac488e arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-03 155 trace_emulate_vsyscall(vsyscall_nr);
c149a665ac488e arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-03 156
c9712944b2a123 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-07-13 157 if (vsyscall_nr < 0) {
c9712944b2a123 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-07-13 158 warn_bad_vsyscall(KERN_WARNING, regs,
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 159 "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 160 goto sigsegv;
7460ed2844ffad arch/x86_64/kernel/vsyscall.c John Stultz 2007-02-16 161 }
7460ed2844ffad arch/x86_64/kernel/vsyscall.c John Stultz 2007-02-16 162
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 163 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 164 warn_bad_vsyscall(KERN_WARNING, regs,
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 165 "vsyscall with bad stack (exploit attempt?)");
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 166 goto sigsegv;
^1da177e4c3f41 arch/x86_64/kernel/vsyscall.c Linus Torvalds 2005-04-16 167 }
^1da177e4c3f41 arch/x86_64/kernel/vsyscall.c Linus Torvalds 2005-04-16 168
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 169 tsk = current;
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 170
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 171 /*
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 172 * Check for access_ok violations and find the syscall nr.
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 173 *
46ed99d1b7c929 arch/x86/kernel/vsyscall_64.c Emil Goode 2012-04-01 174 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 175 * 64-bit, so we don't need to special-case it here. For all the
46ed99d1b7c929 arch/x86/kernel/vsyscall_64.c Emil Goode 2012-04-01 176 * vsyscalls, NULL means "don't write anything" not "write it at
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 177 * address 0".
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 178 */
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 179 switch (vsyscall_nr) {
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 180 case 0:
ddccf40fe82b7a arch/x86/entry/vsyscall/vsyscall_64.c Arnd Bergmann 2017-11-23 181 if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 182 !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 183 ret = -EFAULT;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 184 goto check_fault;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 185 }
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 186
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 187 syscall_nr = __NR_gettimeofday;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 188 break;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 189
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 190 case 1:
21346564ccad17 arch/x86/entry/vsyscall/vsyscall_64.c Arnd Bergmann 2019-11-05 191 if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 192 ret = -EFAULT;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 193 goto check_fault;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 194 }
5651721edec25b arch/x86/kernel/vsyscall_64.c Will Drewry 2012-07-13 195
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 196 syscall_nr = __NR_time;
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 197 break;
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 198
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 199 case 2:
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 200 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 201 !write_ok_or_segv(regs->si, sizeof(unsigned))) {
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 202 ret = -EFAULT;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 203 goto check_fault;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 204 }
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 205
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 206 syscall_nr = __NR_getcpu;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 207 break;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 208 }
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 209
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 210 /*
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 211 * Handle seccomp. regs->ip must be the original value.
5fb94e9ca333f0 arch/x86/entry/vsyscall/vsyscall_64.c Mauro Carvalho Chehab 2018-05-08 212 * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 213 *
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 214 * We could optimize the seccomp disabled case, but performance
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 215 * here doesn't matter.
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 216 */
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 217 regs->orig_ax = syscall_nr;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 218 regs->ax = -ENOSYS;
fefad9ef58ffc2 arch/x86/entry/vsyscall/vsyscall_64.c Christian Brauner 2019-09-24 219 tmp = secure_computing();
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 220 if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 221 warn_bad_vsyscall(KERN_DEBUG, regs,
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 222 "seccomp tried to change syscall nr or ip");
fcb116bc43c8c3 arch/x86/entry/vsyscall/vsyscall_64.c Eric W. Biederman 2021-11-18 223 force_exit_sig(SIGSYS);
695dd0d634df89 arch/x86/entry/vsyscall/vsyscall_64.c Eric W. Biederman 2021-10-20 224 return true;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 225 }
26893107aa717c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2014-11-04 226 regs->orig_ax = -1;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 227 if (tmp)
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 228 goto do_ret; /* skip requested */
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 229
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 230 /*
198d72414c92c7 arch/x86/entry/vsyscall/vsyscall_64.c Ingo Molnar 2024-04-29 231 * With a real vsyscall, page faults cause SIGSEGV.
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 232 */
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 233 ret = -EFAULT;
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 234 switch (vsyscall_nr) {
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 235 case 0:
fa697140f9a201 arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-05 236 /* this decodes regs->di and regs->si on its own */
d5a00528b58cdb arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-09 237 ret = __x64_sys_gettimeofday(regs);
5651721edec25b arch/x86/kernel/vsyscall_64.c Will Drewry 2012-07-13 238 break;
5651721edec25b arch/x86/kernel/vsyscall_64.c Will Drewry 2012-07-13 239
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 240 case 1:
fa697140f9a201 arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-05 241 /* this decodes regs->di on its own */
d5a00528b58cdb arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-09 242 ret = __x64_sys_time(regs);
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 243 break;
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 244
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 245 case 2:
fa697140f9a201 arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-05 246 /* while we could clobber regs->dx, we didn't in the past... */
fa697140f9a201 arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-05 247 orig_dx = regs->dx;
fa697140f9a201 arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-05 248 regs->dx = 0;
fa697140f9a201 arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-05 249 /* this decodes regs->di, regs->si and regs->dx on its own */
d5a00528b58cdb arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-09 250 ret = __x64_sys_getcpu(regs);
fa697140f9a201 arch/x86/entry/vsyscall/vsyscall_64.c Dominik Brodowski 2018-04-05 251 regs->dx = orig_dx;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 252 break;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 253 }
8c73626ab28527 arch/x86/kernel/vsyscall_64.c John Stultz 2010-07-13 254
87b526d349b04c arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2012-10-01 255 check_fault:
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 256 if (ret == -EFAULT) {
4fc3490114bb15 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-11-07 257 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 258 warn_bad_vsyscall(KERN_INFO, regs,
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 259 "vsyscall fault (exploit attempt?)");
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 260 goto sigsegv;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 261 }
8c73626ab28527 arch/x86/kernel/vsyscall_64.c John Stultz 2010-07-13 262
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 263 regs->ax = ret;
8c73626ab28527 arch/x86/kernel/vsyscall_64.c John Stultz 2010-07-13 264
5651721edec25b arch/x86/kernel/vsyscall_64.c Will Drewry 2012-07-13 265 do_ret:
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 266 /* Emulate a ret instruction. */
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 267 regs->ip = caller;
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 268 regs->sp += 8;
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 269 return true;
c08c820508233b arch/x86_64/kernel/vsyscall.c Vojtech Pavlik 2006-09-26 270
5cec93c216db77 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-06-05 271 sigsegv:
3cf5d076fb4d48 arch/x86/entry/vsyscall/vsyscall_64.c Eric W. Biederman 2019-05-23 272 force_sig(SIGSEGV);
3ae36655b97a03 arch/x86/kernel/vsyscall_64.c Andy Lutomirski 2011-08-10 273 return true;
^1da177e4c3f41 arch/x86_64/kernel/vsyscall.c Linus Torvalds 2005-04-16 274 }
^1da177e4c3f41 arch/x86_64/kernel/vsyscall.c Linus Torvalds 2005-04-16 275

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2024-05-01 07:43:21

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86/mm: Remove broken vsyscall emulation code from the page fault code


* Ingo Molnar <[email protected]> wrote:

>
> * Linus Torvalds <[email protected]> wrote:
>
> > I guess that patch to rip out sig_on_uaccess_err needs to go into 6.9 and
> > even be marked for stable, since it most definitely breaks some stuff
> > currently. Even if that "some stuff" is pretty esoteric (ie
> > "vsyscall=emulate" together with tracing).
>
> Yeah - I just put it into tip:x86/urgent as-is, with the various Tested-by
> and Acked-by tags added, and we'll send it to you later this week if all
> goes well.

Update: I've added the delta patch below to the fix, because 'tsk' is now
unused in emulate_vsyscall().

Thanks,

Ingo

arch/x86/entry/vsyscall/vsyscall_64.c | 3 ---
1 file changed, 3 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 3b0f61b2ea6d..2fb7d53cf333 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -115,7 +115,6 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{
- struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
long ret;
@@ -166,8 +165,6 @@ bool emulate_vsyscall(unsigned long error_code,
goto sigsegv;
}

- tsk = current;
-
/*
* Check for access_ok violations and find the syscall nr.
*


Subject: [tip: x86/urgent] x86/mm: Remove broken vsyscall emulation code from the page fault code

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 02b670c1f88e78f42a6c5aee155c7b26960ca054
Gitweb: https://git.kernel.org/tip/02b670c1f88e78f42a6c5aee155c7b26960ca054
Author: Linus Torvalds <[email protected]>
AuthorDate: Mon, 29 Apr 2024 10:00:51 +02:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 01 May 2024 09:41:43 +02:00

x86/mm: Remove broken vsyscall emulation code from the page fault code

The syzbot-reported stack trace from hell in this discussion thread
actually has three nested page faults:

https://lore.kernel.org/r/[email protected]

.. and I think that's actually the important thing here:

- the first page fault is from user space, and triggers the vsyscall
emulation.

- the second page fault is from __do_sys_gettimeofday(), and that should
just have triggered the exception fixup that then sets the return value
to -EFAULT.

- the third nested page fault is due to _raw_spin_unlock_irqrestore() ->
preempt_schedule() -> trace_sched_switch(), which then causes a BPF
trace program to run, which does that bpf_probe_read_compat(), which
causes that page fault under pagefault_disable().

It's quite the nasty backtrace, and there's a lot going on.
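
To make that last step concrete, the probe-read helpers follow this
pattern (a schematic sketch, not verbatim kernel source); the whole point
of pagefault_disable() is that a fault taken inside it gets fixed up
quietly, with the BPF program just seeing an error:

	pagefault_disable();
	/* a fault here must be fixed up silently - no signal */
	ret = __copy_from_user_inatomic(dst, unsafe_ptr, size);
	pagefault_enable();
	if (ret)
		memset(dst, 0, size);	/* BPF just gets zeroes/-EFAULT */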

The problem is literally the vsyscall emulation, which sets

current->thread.sig_on_uaccess_err = 1;

and that causes the fixup_exception() code to send the signal *despite* the
exception being caught.

And I think that is in fact completely bogus, exactly because it sends
that signal even when it *shouldn't* be sent - like for the BPF user-mode
trace gathering.

In other words, I think the whole "sig_on_uaccess_err" thing is entirely
broken, because it makes any nested page-faults do all the wrong things.
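
To see the window involved, the pre-patch emulate_vsyscall() wrapped the
entire syscall dispatch in that per-thread flag (condensed from the code
removed by the patch below):

	prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
	current->thread.sig_on_uaccess_err = 1;

	ret = -EFAULT;
	switch (vsyscall_nr) {
		/* ... __x64_sys_gettimeofday()/time()/getcpu() ... */
	}

	current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;

Any fault taken anywhere in that window - including one taken by an
unrelated BPF program on a nested fault - saw the flag set.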

Now, arguably, I don't think anybody should enable vsyscall emulation any
more, but this test case clearly does.

I think we should just make the "send SIGSEGV" be something that the
vsyscall emulation does on its own, not this broken per-thread state for
something that isn't actually per thread.

The x86 page fault code actually tried to deal with the "incorrect
nesting" by having this check:

	if (in_interrupt())
		return;

which ignores the sig_on_uaccess_err case when it happens in interrupts,
but as shown by this example, these nested page faults do not need to be
about interrupts at all.
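
For completeness, the removed logic in the page fault code looked roughly
like this (condensed from the fault.c hunk below):

	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
		if (in_interrupt())
			return;

		if (current->thread.sig_on_uaccess_err && signal)
			force_sig_fault(signal, si_code,
					(void __user *)address);
		return;
	}

The trace_sched_switch() -> BPF path in this report runs in task context,
so in_interrupt() is false and the bogus signal was sent anyway.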

IOW, I think the only right thing is to remove that horrendously broken
code.

The attached patch looks like the ObviouslyCorrect(tm) thing to do.
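
Condensed, the emulation now handles a bad pointer entirely on its own:

	check_fault:
		if (ret == -EFAULT) {
			/* Bad news -- userspace fed a bad pointer to a vsyscall. */
			warn_bad_vsyscall(KERN_INFO, regs,
					  "vsyscall fault (exploit attempt?)");
			goto sigsegv;
		}
	...
	sigsegv:
		force_sig(SIGSEGV);
		return true;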

NOTE! This broken code goes back to this commit in 2011:

4fc3490114bb ("x86-64: Set siginfo and context on vsyscall emulation faults")

.. and back then the reason was to get all the siginfo details right.
Honestly, I do not for a moment believe that it's worth getting the siginfo
details right here, but part of the commit says:

This fixes issues with UML when vsyscall=emulate.

.. and so my patch to remove this garbage will probably break UML in this
situation.

I do not believe that anybody should be running with vsyscall=emulate in
2024 in the first place, much less if you are doing things like UML. But
let's see if somebody screams.

Reported-and-tested-by: [email protected]
Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Tested-by: Jiri Olsa <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/CAHk-=wh9D6f7HUkDgZHKmDCHUQmp+Co89GP+b8+z+G56BKeyNg@mail.gmail.com
---
 arch/x86/entry/vsyscall/vsyscall_64.c | 28 +---------------------
 arch/x86/include/asm/processor.h      |  1 -
 arch/x86/mm/fault.c                   | 33 +--------------------------
 3 files changed, 3 insertions(+), 59 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index a3c0df1..2fb7d53 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)

static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
- /*
- * XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_err, this could go away.
- */
-
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = &current->thread;

@@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{
- struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;

@@ -172,8 +165,6 @@ bool emulate_vsyscall(unsigned long error_code,
goto sigsegv;
}

- tsk = current;
-
/*
* Check for access_ok violations and find the syscall nr.
*
@@ -234,12 +225,8 @@ bool emulate_vsyscall(unsigned long error_code,
goto do_ret; /* skip requested */

/*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
+ * With a real vsyscall, page faults cause SIGSEGV.
*/
- prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
- current->thread.sig_on_uaccess_err = 1;
-
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
@@ -262,23 +249,12 @@ bool emulate_vsyscall(unsigned long error_code,
break;
}

- current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
-
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
-
- /*
- * If we failed to generate a signal for any reason,
- * generate one here. (This should be impossible.)
- */
- if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
- !sigismember(&tsk->pending.signal, SIGSEGV)))
- goto sigsegv;
-
- return true; /* Don't emulate the ret. */
+ goto sigsegv;
}

regs->ax = ret;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 811548f..78e51b0 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -472,7 +472,6 @@ struct thread_struct {
unsigned long iopl_emul;

unsigned int iopl_warn:1;
- unsigned int sig_on_uaccess_err:1;

/*
* Protection Keys Register for Userspace. Loaded immediately on
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 622d12e..bba4e02 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -723,39 +723,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
WARN_ON_ONCE(user_mode(regs));

/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- if (si_code == SEGV_PKUERR) {
- force_sig_pkuerr((void __user *)address, pkey);
- } else {
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
return;
- }

/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH