On Thu, Jun 11, 2020 at 11:56 AM Naresh Kamboju
<[email protected]> wrote:
>
> While running perf test and selftest x86 single_step_syscall_32 on
> i386 kernel linux
> next 5.7.0-next-20200610 kernel warning noticed.
>
> steps to reproduce:
> --------------------------
> perf test
> and
> cd /opt/kselftests/default-in-kernel/x86
> ./single_step_syscall_32
>
> perf warning log:
> ----------------------
> [ 57.260865] ------------[ cut here ]------------
> [ 57.266576] IRQs not disabled as expected
> [ 57.270583] WARNING: CPU: 1 PID: 500 at
> /usr/src/kernel/arch/x86/entry/common.c:624
> idtentry_exit_cond_rcu+0x92/0xc0
> [ 57.281092] Modules linked in: x86_pkg_temp_thermal fuse
> [ 57.286406] CPU: 1 PID: 500 Comm: perf Not tainted 5.7.0-next-20200610 #1
> [ 57.293190] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS
> 2.2 05/23/2018
> [ 57.300577] EIP: idtentry_exit_cond_rcu+0x92/0xc0
> [ 57.305280] Code: 8b 89 d8 05 00 00 85 c9 74 ae 80 3d b1 64 2c d4
> 00 75 a5 68 94 2d fb d3 89 55 f8 89 45 fc c6 05 b1 64 2c d4 01 e8 8e
> f5 2b ff <0f> 0b 58 8b 55 f8 8b 45 fc eb 83 8d 76 00 e8 5b fd ff ff c9
> c3 89
> [ 57.324017] EAX: 0000001d EBX: 0d00022a ECX: 00000027 EDX: f5b9e14c
> [ 57.330274] ESI: f2a2ffb4 EDI: ffff0ff4 EBP: f2a2ff8c ESP: f2a2ff80
> [ 57.336531] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010096
> [ 57.343345] CR0: 80050033 CR2: 08700a58 CR3: 14ad7000 CR4: 003406d0
> [ 57.349608] DR0: 080dfb80 DR1: 080dfc00 DR2: 08700a58 DR3: 00000000
> [ 57.355866] DR6: fffe0ff0 DR7: 0d00062a
> [ 57.359697] Call Trace:
> [ 57.362143] exc_debug+0x84/0x1b0
> [ 57.365487] ? exc_int3+0x1d0/0x1d0
> [ 57.368981] handle_exception+0x145/0x145
> [ 57.372991] EIP: 0x80dfbcd
> [ 57.375694] Code: Bad RIP value.
> [ 57.378918] EAX: 00000000 EBX: 00000005 ECX: 00002400 EDX: 00000000
> [ 57.385175] ESI: 00000003 EDI: 00000004 EBP: bfd59798 ESP: bfd59770
> [ 57.391431] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000246
> [ 57.398215] irq event stamp: 1896
A regrettable property of the current entry code structure is that we
lose any real indication of the vector. Presumably this is #DB, hence
exc_debug. I don't know what perf has to do with it.
I'll bang on this after lunch if no one beats me to it.
On Thu, Jun 11, 2020 at 12:10:50PM -0700, Andy Lutomirski wrote:
> On Thu, Jun 11, 2020 at 11:56 AM Naresh Kamboju
> <[email protected]> wrote:
> >
> > While running perf test and selftest x86 single_step_syscall_32 on
> > i386 kernel linux
> > next 5.7.0-next-20200610 kernel warning noticed.
> >
> > steps to reproduce:
> > --------------------------
> > perf test
> > and
> > cd /opt/kselftests/default-in-kernel/x86
> > ./single_step_syscall_32
> >
> > perf warning log:
> > ----------------------
> > [ 57.260865] ------------[ cut here ]------------
> > [ 57.266576] IRQs not disabled as expected
> > [ 57.270583] WARNING: CPU: 1 PID: 500 at
> > /usr/src/kernel/arch/x86/entry/common.c:624
> > idtentry_exit_cond_rcu+0x92/0xc0
> > [ 57.281092] Modules linked in: x86_pkg_temp_thermal fuse
> > [ 57.286406] CPU: 1 PID: 500 Comm: perf Not tainted 5.7.0-next-20200610 #1
> > [ 57.293190] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS
> > 2.2 05/23/2018
> > [ 57.300577] EIP: idtentry_exit_cond_rcu+0x92/0xc0
> > [ 57.305280] Code: 8b 89 d8 05 00 00 85 c9 74 ae 80 3d b1 64 2c d4
> > 00 75 a5 68 94 2d fb d3 89 55 f8 89 45 fc c6 05 b1 64 2c d4 01 e8 8e
> > f5 2b ff <0f> 0b 58 8b 55 f8 8b 45 fc eb 83 8d 76 00 e8 5b fd ff ff c9
> > c3 89
> > [ 57.324017] EAX: 0000001d EBX: 0d00022a ECX: 00000027 EDX: f5b9e14c
> > [ 57.330274] ESI: f2a2ffb4 EDI: ffff0ff4 EBP: f2a2ff8c ESP: f2a2ff80
> > [ 57.336531] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010096
> > [ 57.343345] CR0: 80050033 CR2: 08700a58 CR3: 14ad7000 CR4: 003406d0
> > [ 57.349608] DR0: 080dfb80 DR1: 080dfc00 DR2: 08700a58 DR3: 00000000
> > [ 57.355866] DR6: fffe0ff0 DR7: 0d00062a
> > [ 57.359697] Call Trace:
> > [ 57.362143] exc_debug+0x84/0x1b0
> > [ 57.365487] ? exc_int3+0x1d0/0x1d0
> > [ 57.368981] handle_exception+0x145/0x145
> > [ 57.372991] EIP: 0x80dfbcd
> > [ 57.375694] Code: Bad RIP value.
> > [ 57.378918] EAX: 00000000 EBX: 00000005 ECX: 00002400 EDX: 00000000
> > [ 57.385175] ESI: 00000003 EDI: 00000004 EBP: bfd59798 ESP: bfd59770
> > [ 57.391431] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000246
> > [ 57.398215] irq event stamp: 1896
>
> A regrettable property of the current entry code structure is that we
> lose any real indication of the vector. Presumably this is #DB, hence
> exc_debug. I don't know what perf has to do with it.
>
> I'll bang on this after lunch if no one beats me to it.
Puzzling, CR3 seems to suggest this is !user_mode(), but either way #DB
has either idtentry_enter_user() or nmi_enter().
On Thu, Jun 11, 2020 at 12:25 PM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Jun 11, 2020 at 12:10:50PM -0700, Andy Lutomirski wrote:
> > On Thu, Jun 11, 2020 at 11:56 AM Naresh Kamboju
> > <[email protected]> wrote:
> > >
> > > While running perf test and selftest x86 single_step_syscall_32 on
> > > i386 kernel linux
> > > next 5.7.0-next-20200610 kernel warning noticed.
> > >
> > > steps to reproduce:
> > > --------------------------
> > > perf test
> > > and
> > > cd /opt/kselftests/default-in-kernel/x86
> > > ./single_step_syscall_32
> > >
> > > perf warning log:
> > > ----------------------
> > > [ 57.260865] ------------[ cut here ]------------
> > > [ 57.266576] IRQs not disabled as expected
> > > [ 57.270583] WARNING: CPU: 1 PID: 500 at
> > > /usr/src/kernel/arch/x86/entry/common.c:624
> > > idtentry_exit_cond_rcu+0x92/0xc0
> > > [ 57.281092] Modules linked in: x86_pkg_temp_thermal fuse
> > > [ 57.286406] CPU: 1 PID: 500 Comm: perf Not tainted 5.7.0-next-20200610 #1
> > > [ 57.293190] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS
> > > 2.2 05/23/2018
> > > [ 57.300577] EIP: idtentry_exit_cond_rcu+0x92/0xc0
> > > [ 57.305280] Code: 8b 89 d8 05 00 00 85 c9 74 ae 80 3d b1 64 2c d4
> > > 00 75 a5 68 94 2d fb d3 89 55 f8 89 45 fc c6 05 b1 64 2c d4 01 e8 8e
> > > f5 2b ff <0f> 0b 58 8b 55 f8 8b 45 fc eb 83 8d 76 00 e8 5b fd ff ff c9
> > > c3 89
> > > [ 57.324017] EAX: 0000001d EBX: 0d00022a ECX: 00000027 EDX: f5b9e14c
> > > [ 57.330274] ESI: f2a2ffb4 EDI: ffff0ff4 EBP: f2a2ff8c ESP: f2a2ff80
> > > [ 57.336531] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010096
> > > [ 57.343345] CR0: 80050033 CR2: 08700a58 CR3: 14ad7000 CR4: 003406d0
> > > [ 57.349608] DR0: 080dfb80 DR1: 080dfc00 DR2: 08700a58 DR3: 00000000
> > > [ 57.355866] DR6: fffe0ff0 DR7: 0d00062a
> > > [ 57.359697] Call Trace:
> > > [ 57.362143] exc_debug+0x84/0x1b0
> > > [ 57.365487] ? exc_int3+0x1d0/0x1d0
> > > [ 57.368981] handle_exception+0x145/0x145
> > > [ 57.372991] EIP: 0x80dfbcd
> > > [ 57.375694] Code: Bad RIP value.
> > > [ 57.378918] EAX: 00000000 EBX: 00000005 ECX: 00002400 EDX: 00000000
> > > [ 57.385175] ESI: 00000003 EDI: 00000004 EBP: bfd59798 ESP: bfd59770
> > > [ 57.391431] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000246
> > > [ 57.398215] irq event stamp: 1896
> >
> > A regrettable property of the current entry code structure is that we
> > lose any real indication of the vector. Presumably this is #DB, hence
> > exc_debug. I don't know what perf has to do with it.
> >
> > I'll bang on this after lunch if no one beats me to it.
>
> Puzzling, CR3 seems to suggest this is !user_mode(), but either way #DB
> has either idtentry_enter_user() or nmi_enter().
>
I just got this splat on 32-bit while running my perf abuser here:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/misc-tests.git/tree/perf_lots_of_nmi.c
[ 21.874114] traps: PANIC: double fault, error_code: 0x0
[ 21.874127] double fault: 0000 [#1] SMP PTI
[ 21.874127] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0+ #117
[ 21.874128] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31
04/01/2014
[ 21.874128] EIP: asm_exc_invalid_op+0x6/0x10
[ 21.874129] Code: a1 07 00 00 0f 01 ca fc 6a 00 68 f0 4c d7 cf e9
91 07 00 00 0f 01 ca fc 6a 00 68 90 4f d7 cf e9 81 07 00 00 0f 01 ca
fc 6a 00 <68> 30 4d d7 cf e9 71 07 00 00 0f 01 ca fc 6a 00 68 60 55 d7
cf e9
[ 21.874129] EAX: 00000000 EBX: cff9902c ECX: 000000e0 EDX: 00000000
[ 21.874129] ESI: cff9be9c EDI: cfd74d30 EBP: cff99014 ESP: cff99000
[ 21.874130] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210046
[ 21.874130] CR0: 80050033 CR2: cff98ffc CR3: 10108000 CR4: 00340ef0
[ 21.874130] Call Trace:
[ 21.874131] Modules linked in:
[ 21.889116] ---[ end trace aa191c9e5c534faf ]---
[ 21.889116] EIP: asm_exc_invalid_op+0x6/0x10
[ 21.889117] Code: a1 07 00 00 0f 01 ca fc 6a 00 68 f0 4c d7 cf e9
91 07 00 00 0f 01 ca fc 6a 00 68 90 4f d7 cf e9 81 07 00 00 0f 01 ca
fc 6a 00 <68> 30 4d d7 cf e9 71 07 00 00 0f 01 ca fc 6a 00 68 60 55 d7
cf e9
[ 21.889117] EAX: 00000000 EBX: cff9902c ECX: 000000e0 EDX: 00000000
[ 21.889117] ESI: cff9be9c EDI: cfd74d30 EBP: cff99014 ESP: cff99000
[ 21.889127] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210046
[ 21.889128] CR0: 80050033 CR2: cff98ffc CR3: 10108000 CR4: 00340ef0
[ 21.889128] Kernel panic - not syncing: Fatal exception in interrupt
[ 21.889338] Kernel Offset: 0xe600000 from 0xc1000000 (relocation
range: 0xc0000000-0xf7dfdfff)
All code
========
0: a1 07 00 00 0f mov 0xf000007,%eax
5: 01 ca add %ecx,%edx
7: fc cld
8: 6a 00 push $0x0
a: 68 f0 4c d7 cf push $0xcfd74cf0
f: e9 91 07 00 00 jmp 0x7a5
14: 0f 01 ca clac
17: fc cld
18: 6a 00 push $0x0
1a: 68 90 4f d7 cf push $0xcfd74f90
1f: e9 81 07 00 00 jmp 0x7a5
24: 0f 01 ca clac
27: fc cld
28: 6a 00 push $0x0
2a:* 68 30 4d d7 cf push $0xcfd74d30 <--
trapping instruction
2f: e9 71 07 00 00 jmp 0x7a5
34: 0f 01 ca clac
37: fc cld
38: 6a 00 push $0x0
3a: 68 60 55 d7 cf push $0xcfd75560
This is on my private branch, which has some extra changes, but I
don't think they're relevant. I think we broke NMI :( I did a nopti
run and got a different explosion:
[ 14.618581] BUG: unable to handle page fault for address: 0021004a
[ 14.620149] #PF: supervisor write access in kernel mode
[ 14.621134] #PF: error_code(0x0002) - not-present page
[ 14.622028] *pdpt = 0000000035b2f001 *pde = 0000000000000000
[ 14.623047] Oops: 0002 [#1] SMP NOPTI
[ 14.623684] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0+ #117
[ 14.624733] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31
04/01/2014
[ 14.627013] EIP: run_timer_softirq+0x14a/0x470
[ 14.627793] Code: 3b 63 00 8b 45 b8 8b 00 89 45 bc 8b 45 bc 85 c0
74 5c 8b 45 b8 8b 18 89 5f 04 3e 8d 74 26 00 8b 03 8b 53 04 89 02 85
c0 74 03 <89> 50 04 c7 43 04 00 00 00 00 8b 53 0c c7 03 22 01 00 00 f6
43 12
[ 14.630874] EAX: 00210046 EBX: f60f3ee4 ECX: fffba48e EDX: f602bf88
[ 14.631913] ESI: f602bf88 EDI: f65c8080 EBP: f602bfbc ESP: f602bf68
[ 14.632970] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210002
[ 14.634248] CR0: 80050033 CR2: 0021004a CR3: 3583b120 CR4: 00340ef0
[ 14.635300] Call Trace:
[ 14.635710] <SOFTIRQ>
[ 14.636232] __do_softirq+0xac/0x25b
[ 14.636885] ? __entry_text_end+0x8/0x8
[ 14.637786] call_on_stack+0x40/0x50
[ 14.638397] </SOFTIRQ>
[ 14.638813] ? irq_exit_rcu+0x95/0xa0
[ 14.639513] ? sysvec_apic_timer_interrupt+0x27/0x40
[ 14.640424] ? handle_exception+0x15d/0x15d
[ 14.641272] ? __cpuidle_text_start+0x8/0x8
[ 14.642167] ? default_idle+0x23/0x160
I have to run, so I'll keep digging later.
On Thu, Jun 11, 2020 at 4:22 PM Andy Lutomirski <[email protected]> wrote:
>
> On Thu, Jun 11, 2020 at 12:25 PM Peter Zijlstra <[email protected]> wrote:
> >
> > On Thu, Jun 11, 2020 at 12:10:50PM -0700, Andy Lutomirski wrote:
> > > On Thu, Jun 11, 2020 at 11:56 AM Naresh Kamboju
> > > <[email protected]> wrote:
> > > >
> > > > While running perf test and selftest x86 single_step_syscall_32 on
> > > > i386 kernel linux
> > > > next 5.7.0-next-20200610 kernel warning noticed.
> > > >
> > > > steps to reproduce:
> > > > --------------------------
> > > > perf test
> > > > and
> > > > cd /opt/kselftests/default-in-kernel/x86
> > > > ./single_step_syscall_32
> > > >
> > > > perf warning log:
> > > > ----------------------
> > > > [ 57.260865] ------------[ cut here ]------------
> > > > [ 57.266576] IRQs not disabled as expected
> > > > [ 57.270583] WARNING: CPU: 1 PID: 500 at
> > > > /usr/src/kernel/arch/x86/entry/common.c:624
> > > > idtentry_exit_cond_rcu+0x92/0xc0
> > > > [ 57.281092] Modules linked in: x86_pkg_temp_thermal fuse
> > > > [ 57.286406] CPU: 1 PID: 500 Comm: perf Not tainted 5.7.0-next-20200610 #1
> > > > [ 57.293190] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS
> > > > 2.2 05/23/2018
> > > > [ 57.300577] EIP: idtentry_exit_cond_rcu+0x92/0xc0
> > > > [ 57.305280] Code: 8b 89 d8 05 00 00 85 c9 74 ae 80 3d b1 64 2c d4
> > > > 00 75 a5 68 94 2d fb d3 89 55 f8 89 45 fc c6 05 b1 64 2c d4 01 e8 8e
> > > > f5 2b ff <0f> 0b 58 8b 55 f8 8b 45 fc eb 83 8d 76 00 e8 5b fd ff ff c9
> > > > c3 89
> > > > [ 57.324017] EAX: 0000001d EBX: 0d00022a ECX: 00000027 EDX: f5b9e14c
> > > > [ 57.330274] ESI: f2a2ffb4 EDI: ffff0ff4 EBP: f2a2ff8c ESP: f2a2ff80
> > > > [ 57.336531] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010096
> > > > [ 57.343345] CR0: 80050033 CR2: 08700a58 CR3: 14ad7000 CR4: 003406d0
> > > > [ 57.349608] DR0: 080dfb80 DR1: 080dfc00 DR2: 08700a58 DR3: 00000000
> > > > [ 57.355866] DR6: fffe0ff0 DR7: 0d00062a
> > > > [ 57.359697] Call Trace:
> > > > [ 57.362143] exc_debug+0x84/0x1b0
> > > > [ 57.365487] ? exc_int3+0x1d0/0x1d0
> > > > [ 57.368981] handle_exception+0x145/0x145
> > > > [ 57.372991] EIP: 0x80dfbcd
> > > > [ 57.375694] Code: Bad RIP value.
> > > > [ 57.378918] EAX: 00000000 EBX: 00000005 ECX: 00002400 EDX: 00000000
> > > > [ 57.385175] ESI: 00000003 EDI: 00000004 EBP: bfd59798 ESP: bfd59770
> > > > [ 57.391431] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000246
> > > > [ 57.398215] irq event stamp: 1896
> > >
> > > A regrettable property of the current entry code structure is that we
> > > lose any real indication of the vector. Presumably this is #DB, hence
> > > exc_debug. I don't know what perf has to do with it.
> > >
> > > I'll bang on this after lunch if no one beats me to it.
> >
> > Puzzling, CR3 seems to suggest this is !user_mode(), but either way #DB
> > has either idtentry_enter_user() or nmi_enter().
> >
>
> I just got this splat on 32-bit while running my perf abuser here:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/misc-tests.git/tree/perf_lots_of_nmi.c
>
> [ 21.874114] traps: PANIC: double fault, error_code: 0x0
Two bugs here.
1. We had an issue with WARN. Patch sent.
2. idtentry.h has, for x86_32:
# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY
This is nonsense. It's getting late over here and I'd rather focus on
the more interesting RCU issue, so that's all from me today.
--Andy
Andy Lutomirski <[email protected]> writes:
> On Thu, Jun 11, 2020 at 4:22 PM Andy Lutomirski <[email protected]> wrote:
>
> Two bugs here.
>
> 1. We had an issue with WARN. Patch sent.
Grabbed it
> 2. idtentry.h has, for x86_32:
>
> # define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY
>
> This is nonsense. It's getting late over here and I'd rather focus on
> the more interesting RCU issue, so that's all from me today.
Well, this might be nonsense, but it's exactly matching the current code
in mainline which, e.g. for #DB does:
SYM_CODE_START(debug)
/*
* Entry from sysenter is now handled in common_exception
*/
ASM_CLAC
pushl $0
pushl $do_debug
jmp common_exception
SYM_CODE_END(debug)
There is no IST on 32bit, never was. We do software stack switching for
device interrupts, but that's a different story.
Thanks,
tglx
> On Jun 12, 2020, at 2:01 AM, Thomas Gleixner <[email protected]> wrote:
>
> Andy Lutomirski <[email protected]> writes:
>> On Thu, Jun 11, 2020 at 4:22 PM Andy Lutomirski <[email protected]> wrote:
>>
>> Two bugs here.
>>
>> 1. We had an issue with WARN. Patch sent.
>
> Grabbed it
>
>> 2. idtentry.h has, for x86_32:
>>
>> # define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY
>>
>> This is nonsense. It's getting late over here and I'd rather focus on
>> the more interesting RCU issue, so that's all from me today.
>
> Well, this might be nonsense, but it's exactly matching the current code
> in mainline which, e.g. for #DB does:
>
> SYM_CODE_START(debug)
> /*
> * Entry from sysenter is now handled in common_exception
> */
> ASM_CLAC
> pushl $0
> pushl $do_debug
> jmp common_exception
> SYM_CODE_END(debug)
>
> There is no IST on 32bit, never was. We do software stack switching for
> device interrupts, but that's a different story.
>
DEFINE_IDTENTRY does the idtentry_enter_cond_rcu() dance, which isn’t intended to be safe from NMI context. It should probably map to DEFINE_IDTENTRY_RAW() instead. The specific issue is that NMI ends up there, and at least DEFINE_IDTENTRY_NMI should be raw.
I haven’t tried this at all, nor have I dug through all the users of these macros to check what they expect. Perhaps we should not have the _IST one defined at all on 32 bit and rename it to DEFINE_IDTENTRY_IST_RAW on 64 bit to make it more clear what’s going on when reading the C code.
Or maybe I’m too sleepy and I’m nuts. But I don’t think I am.
> Thanks,
>
> tglx
Andy Lutomirski <[email protected]> writes:
>> On Jun 12, 2020, at 2:01 AM, Thomas Gleixner <[email protected]> wrote:
>> There is no IST on 32bit, never was. We do software stack switching for
>> device interrupts, but that's a different story.
>>
>
> DEFINE_IDTENTRY does the idtentry_enter_cond_rcu() dance, which isn’t
> intended to be safe from NMI context. It should probably map to
> DEFINE_IDTENTRY_RAW() instead. The specific issue is that NMI ends up
> there, and at least DEFINE_IDTENTRY_NMI should be raw.
Yes, you are right. That's clearly broken.
> I haven’t tried this at all, nor have I dug through all the users of
> these macros to check what they expect. Perhaps we should not have
> the _IST one defined at all on 32 bit and rename it to
> DEFINE_IDTENTRY_IST_RAW on 64 bit to make it more clear what’s going
> on when reading the C code.
Let me go over it with a fine comb.
> Or maybe I’m too sleepy and I’m nuts. But I don’t think I am.
/me politely refrains from commenting
I have reported this warning on linux-next and now it is happening on
linux mainline tree.
May I know , are we missing a fix patch on linus 's tree ?
- Naresh
---
While running selftest x86 single_step_syscall_32 on
i386 kernel linux 5.8.0-rc3 kernel warning noticed.
steps to reproduce:
--------------------------
perf test
and
cd /opt/kselftests/default-in-kernel/x86
./single_step_syscall_32
crash dump,
[ 1324.774385] kselftest: Running tests in x86
[ 1324.830187] ------------[ cut here ]------------
[ 1324.834820] IRQs not disabled as expected
[ 1324.838838] WARNING: CPU: 2 PID: 5365 at
/usr/src/kernel/arch/x86/entry/common.c:645
idtentry_exit_cond_rcu+0x92/0xc0
[ 1324.849448] Modules linked in: test_sysctl nf_tables act_mirred
cls_u32 sch_netem sch_ingress mpls_iptunnel mpls_router sch_etf sch_fq
sit x86_pkg_temp_thermal fuse [last unloaded: test_user_copy]
[ 1324.866909] CPU: 2 PID: 5365 Comm: single_step_sys Not tainted 5.8.0-rc3 #1
[ 1324.873866] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS
2.0b 07/27/2017
[ 1324.881340] EIP: idtentry_exit_cond_rcu+0x92/0xc0
[ 1324.886045] Code: 8b 89 d8 05 00 00 85 c9 74 ae 80 3d 3f 1d f8 de
00 75 a5 68 3c 6f c5 de 89 55 f8 89 45 fc c6 05 3f 1d f8 de 01 e8 7e
17 23 ff <0f> 0b 58 8b 55 f8 8b 45 fc eb 83 8d 76 00 e8 5b fd ff ff c9
c3 89
[ 1324.904790] EAX: 0000001d EBX: 00000000 ECX: 00000027 EDX: f594c14c
[ 1324.911055] ESI: f1da3fb4 EDI: ffff4ff0 EBP: f1da3f8c ESP: f1da3f80
[ 1324.917312] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010096
[ 1324.924090] CR0: 80050033 CR2: 0883a16c CR3: 26c06000 CR4: 003406d0
[ 1324.930347] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[ 1324.936622] DR6: fffe0ff0 DR7: 00000400
[ 1324.940460] Call Trace:
[ 1324.942906] exc_debug+0x84/0x1b0
[ 1324.946224] ? exc_int3+0x1d0/0x1d0
[ 1324.949716] handle_exception+0x145/0x145
[ 1324.953720] EIP: 0x8048900
[ 1324.956423] Code: Bad RIP value.
[ 1324.959646] EAX: 00000005 EBX: 00000000 ECX: bfe19f0c EDX: bfe19e8c
[ 1324.965903] ESI: b7e3a000 EDI: 00000000 EBP: bfe1a488 ESP: bfe19e7c
[ 1324.972161] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000282
[ 1324.978942] irq event stamp: 3694
[ 1324.982257] hardirqs last enabled at (3693): [<de8bf6b9>]
idtentry_exit_user+0x39/0x60
[ 1324.990251] hardirqs last disabled at (3694): [<ddb6ea62>]
vprintk_emit+0x42/0x290
[ 1324.997814] softirqs last enabled at (3688): [<ddab1d98>]
fpu__clear+0x78/0xd0
[ 1325.005110] softirqs last disabled at (3686): [<ddab1d41>]
fpu__clear+0x21/0xd0
[ 1325.012406] ---[ end trace d68e89e2115323e2 ]---
[ 1325.017075] ------------[ cut here ]------------
[ 1325.021688] IRQs not disabled as expected
[ 1325.025695] WARNING: CPU: 2 PID: 5365 at
/usr/src/kernel/arch/x86/entry/common.c:240
__prepare_exit_to_usermode+0x164/0x1b0
[ 1325.036811] Modules linked in: test_sysctl nf_tables act_mirred
cls_u32 sch_netem sch_ingress mpls_iptunnel mpls_router sch_etf sch_fq
sit x86_pkg_temp_thermal fuse [last unloaded: test_user_copy]
[ 1325.054258] CPU: 2 PID: 5365 Comm: single_step_sys Tainted: G
W 5.8.0-rc3 #1
[ 1325.062624] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS
2.0b 07/27/2017
[ 1325.070099] EIP: __prepare_exit_to_usermode+0x164/0x1b0
[ 1325.075315] Code: d8 05 00 00 85 db 0f 84 08 ff ff ff 80 3d 40 1d
f8 de 00 0f 85 fb fe ff ff 68 3c 6f c5 de c6 05 40 1d f8 de 01 e8 7c
ee 0e 00 <0f> 0b 59 e9 e2 fe ff ff 8d 74 26 00 68 e4 6e c5 de e8 66 ee
0e 00
[ 1325.094053] EAX: 0000001d EBX: 00000001 ECX: 00000027 EDX: f594c14c
[ 1325.100309] ESI: f5766c00 EDI: f1da3fb4 EBP: f1da3f74 ESP: f1da3f64
[ 1325.106583] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010086
[ 1325.113378] CR0: 80050033 CR2: 0804d084 CR3: 26c06000 CR4: 003406d0
[ 1325.119637] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[ 1325.125902] DR6: fffe0ff0 DR7: 00000400
[ 1325.129731] Call Trace:
[ 1325.132179] prepare_exit_to_usermode+0x8/0x30
[ 1325.136624] idtentry_exit_cond_rcu+0xa5/0xc0
[ 1325.140983] ? idtentry_exit_user+0x39/0x60
[ 1325.145168] exc_debug+0x84/0x1b0
[ 1325.148480] ? exc_int3+0x1d0/0x1d0
[ 1325.151970] handle_exception+0x145/0x145
[ 1325.155973] EIP: 0x8048900
[ 1325.158677] Code: Bad RIP value.
[ 1325.161901] EAX: 00000005 EBX: 00000000 ECX: bfe19f0c EDX: bfe19e8c
[ 1325.168160] ESI: b7e3a000 EDI: 00000000 EBP: bfe1a488 ESP: bfe19e7c
[ 1325.174423] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000282
[ 1325.181205] irq event stamp: 3808
[ 1325.184520] hardirqs last enabled at (3807): [<de8bf6b9>]
idtentry_exit_user+0x39/0x60
[ 1325.192512] hardirqs last disabled at (3808): [<ddb6ea62>]
vprintk_emit+0x42/0x290
[ 1325.200079] softirqs last enabled at (3802): [<ddab1d98>]
fpu__clear+0x78/0xd0
[ 1325.207384] softirqs last disabled at (3800): [<ddab1d41>]
fpu__clear+0x21/0xd0
[ 1325.214688] ---[ end trace d68e89e2115323e3 ]---
On Thu, Jul 02, 2020 at 07:57:54PM +0530, Naresh Kamboju wrote:
> I have reported this warning on linux-next and now it is happening on
> linux mainline tree.
> May I know , are we missing a fix patch on linus 's tree ?
>
> - Naresh
> ---
> While running selftest x86 single_step_syscall_32 on
> i386 kernel linux 5.8.0-rc3 kernel warning noticed.
>
> steps to reproduce:
> --------------------------
> perf test
> and
> cd /opt/kselftests/default-in-kernel/x86
> ./single_step_syscall_32
>
> crash dump,
> [ 1324.774385] kselftest: Running tests in x86
> [ 1324.830187] ------------[ cut here ]------------
> [ 1324.834820] IRQs not disabled as expected
> [ 1324.838838] WARNING: CPU: 2 PID: 5365 at
> /usr/src/kernel/arch/x86/entry/common.c:645
> idtentry_exit_cond_rcu+0x92/0xc0
How's this?
DEFINE_IDTENTRY_DEBUG() is DEFINE_IDTENTRY_RAW on x86_64
DEFINE_IDTENTRY on i386
calling exc_debug_*() from DEFINE_IDTENTRY() does a double layer of
idtentry_{enter,exit}*() functions.
Also, handle_debug() is still a trainwreck, we should never get to
cond_local_irq_enable() when !user.
Completely untested...
---
arch/x86/kernel/traps.c | 49 +++++++++++++++++++++++--------------------------
1 file changed, 23 insertions(+), 26 deletions(-)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b0195771c932..47d6b46e1564 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -806,8 +806,17 @@ static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
* If DR6 is zero, no point in trying to handle it. The kernel is
* not using INT1.
*/
- if (!user && !dr6)
- return;
+ if (!user) {
+ /*
+ * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
+ * watchpoint at the same time then that will still be handled.
+ */
+ if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
+ dr6 &= ~DR_STEP;
+
+ if (!dr6)
+ return;
+ }
/*
* If dr6 has no reason to give us about the origin of this trap,
@@ -859,25 +868,29 @@ static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
cond_local_irq_disable(regs);
}
+#ifdef CONFIG_X86_64
static __always_inline void exc_debug_kernel(struct pt_regs *regs,
unsigned long dr6)
{
bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();
- /*
- * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
- * watchpoint at the same time then that will still be handled.
- */
- if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
- dr6 &= ~DR_STEP;
-
handle_debug(regs, dr6, false);
instrumentation_end();
idtentry_exit_nmi(regs, irq_state);
}
+/* IST stack entry */
+DEFINE_IDTENTRY_DEBUG(exc_debug)
+{
+ unsigned long dr6, dr7;
+
+ debug_enter(&dr6, &dr7);
+ exc_debug_kernel(regs, dr6);
+ debug_exit(dr7);
+}
+
static __always_inline void exc_debug_user(struct pt_regs *regs,
unsigned long dr6)
{
@@ -890,17 +903,6 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
idtentry_exit_user(regs);
}
-#ifdef CONFIG_X86_64
-/* IST stack entry */
-DEFINE_IDTENTRY_DEBUG(exc_debug)
-{
- unsigned long dr6, dr7;
-
- debug_enter(&dr6, &dr7);
- exc_debug_kernel(regs, dr6);
- debug_exit(dr7);
-}
-
/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
@@ -917,12 +919,7 @@ DEFINE_IDTENTRY_DEBUG(exc_debug)
unsigned long dr6, dr7;
debug_enter(&dr6, &dr7);
-
- if (user_mode(regs))
- exc_debug_user(regs, dr6);
- else
- exc_debug_kernel(regs, dr6);
-
+ handle_debug(regs, dr6, user_mode(regs));
debug_exit(dr7);
}
#endif
On Thu, 2 Jul 2020 at 20:33, Peter Zijlstra <[email protected]> wrote:
>
> How's this?
>
> DEFINE_IDTENTRY_DEBUG() is DEFINE_IDTENTRY_RAW on x86_64
> DEFINE_IDTENTRY on i386
>
> calling exc_debug_*() from DEFINE_IDTENTRY() does a double layer of
> idtentry_{enter,exit}*() functions.
>
> Also, handle_debug() is still a trainwreck, we should never get to
> cond_local_irq_enable() when !user.
>
> Completely untested...
Reported-by: Naresh Kamboju <[email protected]>
Tested-by: Naresh Kamboju <[email protected]>
The patch did not apply smooth.
I have manually edited the patch [1] and applied it on top of linus
master branch
and tested on x86_64 and i386 and test pass and the reported WARNING gone.
ref:
patch link
[1] https://pastebin.com/mBHkP1A6
Test jobs links,
https://lkft.validation.linaro.org/scheduler/job/1538367#L1218
https://lkft.validation.linaro.org/scheduler/job/1538368#L1271
- Naresh
On Thu, Jul 2, 2020 at 8:03 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Jul 02, 2020 at 07:57:54PM +0530, Naresh Kamboju wrote:
> > I have reported this warning on linux-next and now it is happening on
> > linux mainline tree.
> > May I know , are we missing a fix patch on linus 's tree ?
> >
> > - Naresh
> > ---
> > While running selftest x86 single_step_syscall_32 on
> > i386 kernel linux 5.8.0-rc3 kernel warning noticed.
> >
> > steps to reproduce:
> > --------------------------
> > perf test
> > and
> > cd /opt/kselftests/default-in-kernel/x86
> > ./single_step_syscall_32
> >
> > crash dump,
> > [ 1324.774385] kselftest: Running tests in x86
> > [ 1324.830187] ------------[ cut here ]------------
> > [ 1324.834820] IRQs not disabled as expected
> > [ 1324.838838] WARNING: CPU: 2 PID: 5365 at
> > /usr/src/kernel/arch/x86/entry/common.c:645
> > idtentry_exit_cond_rcu+0x92/0xc0
>
> How's this?
>
> DEFINE_IDTENTRY_DEBUG() is DEFINE_IDTENTRY_RAW on x86_64
> DEFINE_IDTENTRY on i386
>
> calling exc_debug_*() from DEFINE_IDTENTRY() does a double layer of
> idtentry_{enter,exit}*() functions.
>
> Also, handle_debug() is still a trainwreck, we should never get to
> cond_local_irq_enable() when !user.
>
> Completely untested...
>
> ---
> arch/x86/kernel/traps.c | 49 +++++++++++++++++++++++--------------------------
> 1 file changed, 23 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index b0195771c932..47d6b46e1564 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -806,8 +806,17 @@ static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
> * If DR6 is zero, no point in trying to handle it. The kernel is
> * not using INT1.
> */
> - if (!user && !dr6)
> - return;
> + if (!user) {
> + /*
> + * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
> + * watchpoint at the same time then that will still be handled.
> + */
> + if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
> + dr6 &= ~DR_STEP;
> +
> + if (!dr6)
> + return;
> + }
>
> /*
> * If dr6 has no reason to give us about the origin of this trap,
> @@ -859,25 +868,29 @@ static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
> cond_local_irq_disable(regs);
> }
>
> +#ifdef CONFIG_X86_64
> static __always_inline void exc_debug_kernel(struct pt_regs *regs,
> unsigned long dr6)
> {
> bool irq_state = idtentry_enter_nmi(regs);
> instrumentation_begin();
>
> - /*
> - * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
> - * watchpoint at the same time then that will still be handled.
> - */
> - if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
> - dr6 &= ~DR_STEP;
> -
> handle_debug(regs, dr6, false);
>
> instrumentation_end();
> idtentry_exit_nmi(regs, irq_state);
> }
>
> +/* IST stack entry */
> +DEFINE_IDTENTRY_DEBUG(exc_debug)
> +{
> + unsigned long dr6, dr7;
> +
> + debug_enter(&dr6, &dr7);
> + exc_debug_kernel(regs, dr6);
> + debug_exit(dr7);
> +}
> +
> static __always_inline void exc_debug_user(struct pt_regs *regs,
> unsigned long dr6)
> {
> @@ -890,17 +903,6 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
> idtentry_exit_user(regs);
> }
>
> -#ifdef CONFIG_X86_64
> -/* IST stack entry */
> -DEFINE_IDTENTRY_DEBUG(exc_debug)
> -{
> - unsigned long dr6, dr7;
> -
> - debug_enter(&dr6, &dr7);
> - exc_debug_kernel(regs, dr6);
> - debug_exit(dr7);
> -}
> -
> /* User entry, runs on regular task stack */
> DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
> {
> @@ -917,12 +919,7 @@ DEFINE_IDTENTRY_DEBUG(exc_debug)
> unsigned long dr6, dr7;
>
> debug_enter(&dr6, &dr7);
> -
> - if (user_mode(regs))
> - exc_debug_user(regs, dr6);
> - else
> - exc_debug_kernel(regs, dr6);
> -
> + handle_debug(regs, dr6, user_mode(regs));
> debug_exit(dr7);
> }
> #endif
I'll fold something like this into my pile of pending fixes.