On Mon, 2020-10-05 at 17:43 +0100, Mark Rutland wrote:
> The current initialization of the per-cpu offset register is difficult
> to follow and this initialization is not always early enough for
> upcoming instrumentation with KCSAN, where the instrumentation callbacks
> use the per-cpu offset.
>
> To make it possible to support KCSAN, and to simplify reasoning about
> early bringup code, let's initialize the per-cpu offset earlier, before
> we run any C code that may consume it. To do so, this patch adds a new
> init_this_cpu_offset() helper that's called before the usual
> primary/secondary start functions. For consistency, this is also used to
> re-initialize the per-cpu offset after the runtime per-cpu areas have
> been allocated (which can change CPU0's offset).
>
> So that init_this_cpu_offset() isn't subject to any instrumentation that
> might consume the per-cpu offset, it is marked with noinstr, preventing
> instrumentation.
>
> Signed-off-by: Mark Rutland <[email protected]>
> Cc: Catalin Marinas <[email protected]>
> Cc: James Morse <[email protected]>
> Cc: Will Deacon <[email protected]>
Reverting this commit on top of today's linux-next fixed an issue where
Thunder X2 is unable to boot:
.config: https://gitlab.com/cailca/linux-mm/-/blob/master/arm64.config
EFI stub: Booting Linux Kernel...
EFI stub: EFI_RNG_PROTOCOL unavailable, KASLR will be disabled
EFI stub: Using DTB from configuration table
EFI stub: Exiting boot services and installing virtual address map...
It hung here for more than 10 minutes, even with "earlycon", before I gave up.
With the revert, it boots again and prints the following lines almost immediately.
[ 0.000000][ T0] Booting Linux on physical CPU 0x0000000000 [0x431f0af1]
[ 0.000000][ T0] Linux version 5.9.0-rc8-next-20201008+ (gcc (GCC) 8.3.1 20191121 (Red Hat 8.3.1-5), GNU ld version 2.30-79.el8) #6 SMP Thu Oct 8 20:57:40 EDT 2020
[ 0.000000][ T0] efi: EFI v2.70 by American Megatrends
[ 0.000000][ T0] efi: ESRT=0xf9224418 SMBIOS=0xfcca0000 SMBIOS 3.0=0xfcc90000 ACPI 2.0=0xf9720000 MEMRESERVE=0xfc965918
[ 0.000000][ T0] esrt: Reserving ESRT space from 0x00000000f9224418 to 0x00000000f9224450.
[ 0.000000][ T0] ACPI: Early table checksum verification disabled
[ 0.000000][ T0] ACPI: RSDP 0x00000000F9720000 000024 (v02 HPE )
[ 0.000000][ T0] ACPI: XSDT 0x00000000F9720028 0000DC (v01 HPE ServerCL 01072009 AMI 00010013)
[ 0.000000][ T0] ACPI: FACP 0x00000000F9720108 000114 (v06 HPE ServerCL 01072009 AMI 00010013)
[ 0.000000][ T0] ACPI: DSDT 0x00000000F9720220 000714 (v02 HPE ServerCL 20150406 INTL 20170831)
[ 0.000000][ T0] ACPI: FIDT 0x00000000F9720938 00009C (v01 HPE ServerCL 01072009 AMI 00010013)
...
# lscpu
Architecture: aarch64
Byte Order: Little Endian
CPU(s): 224
On-line CPU(s) list: 0-223
Thread(s) per core: 4
Core(s) per socket: 28
Socket(s): 2
NUMA node(s): 2
Vendor ID: Cavium
Model: 1
Model name: ThunderX2 99xx
Stepping: 0x1
BogoMIPS: 400.00
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 32768K
NUMA node0 CPU(s): 0-111
NUMA node1 CPU(s): 112-223
Flags: fp asimd aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm
> ---
> arch/arm64/include/asm/cpu.h | 2 ++
> arch/arm64/kernel/head.S | 3 +++
> arch/arm64/kernel/setup.c | 12 ++++++------
> arch/arm64/kernel/smp.c | 13 ++++++++-----
> 4 files changed, 19 insertions(+), 11 deletions(-)
>
> Since v1[1]:
>
> * Fix typos
> * Rebase atop v5.9-rc4
>
> Mark.
>
> [1] https://lore.kernel.org/r/[email protected]
>
> diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
> index 7faae6ff3ab4d..d9d60b18e8116 100644
> --- a/arch/arm64/include/asm/cpu.h
> +++ b/arch/arm64/include/asm/cpu.h
> @@ -68,4 +68,6 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info);
> void update_cpu_features(int cpu, struct cpuinfo_arm64 *info,
> struct cpuinfo_arm64 *boot);
>
> +void init_this_cpu_offset(void);
> +
> #endif /* __ASM_CPU_H */
> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
> index 037421c66b147..2720e6ec68140 100644
> --- a/arch/arm64/kernel/head.S
> +++ b/arch/arm64/kernel/head.S
> @@ -452,6 +452,8 @@ SYM_FUNC_START_LOCAL(__primary_switched)
> bl __pi_memset
> dsb ishst // Make zero page visible to PTW
>
> + bl init_this_cpu_offset
> +
> #ifdef CONFIG_KASAN
> bl kasan_early_init
> #endif
> @@ -758,6 +760,7 @@ SYM_FUNC_START_LOCAL(__secondary_switched)
> ptrauth_keys_init_cpu x2, x3, x4, x5
> #endif
>
> + bl init_this_cpu_offset
> b secondary_start_kernel
> SYM_FUNC_END(__secondary_switched)
>
> diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
> index 53acbeca4f574..fde4396418add 100644
> --- a/arch/arm64/kernel/setup.c
> +++ b/arch/arm64/kernel/setup.c
> @@ -87,12 +87,6 @@ void __init smp_setup_processor_id(void)
> u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
> set_cpu_logical_map(0, mpidr);
>
> - /*
> - * clear __my_cpu_offset on boot CPU to avoid hang caused by
> - * using percpu variable early, for example, lockdep will
> - * access percpu variable inside lock_release
> - */
> - set_my_cpu_offset(0);
> pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n",
> (unsigned long)mpidr, read_cpuid_id());
> }
> @@ -281,6 +275,12 @@ u64 cpu_logical_map(int cpu)
> return __cpu_logical_map[cpu];
> }
>
> +void noinstr init_this_cpu_offset(void)
> +{
> + unsigned int cpu = task_cpu(current);
> + set_my_cpu_offset(per_cpu_offset(cpu));
> +}
> +
> void __init __no_sanitize_address setup_arch(char **cmdline_p)
> {
> init_mm.start_code = (unsigned long) _text;
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 355ee9eed4dde..7714310fba226 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -192,10 +192,7 @@ asmlinkage notrace void secondary_start_kernel(void)
> u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
> struct mm_struct *mm = &init_mm;
> const struct cpu_operations *ops;
> - unsigned int cpu;
> -
> - cpu = task_cpu(current);
> - set_my_cpu_offset(per_cpu_offset(cpu));
> + unsigned int cpu = smp_processor_id();
>
> /*
> * All kernel threads share the same mm context; grab a
> @@ -435,7 +432,13 @@ void __init smp_cpus_done(unsigned int max_cpus)
>
> void __init smp_prepare_boot_cpu(void)
> {
> - set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
> + /*
> + * Now that setup_per_cpu_areas() has allocated the runtime per-cpu
> + * areas it is only safe to read the CPU0 boot-time area, and we must
> + * reinitialize the offset to point to the runtime area.
> + */
> + init_this_cpu_offset();
> +
> cpuinfo_store_boot_cpu();
>
> /*
On Thu, Oct 08, 2020 at 09:18:24PM -0400, Qian Cai wrote:
> On Mon, 2020-10-05 at 17:43 +0100, Mark Rutland wrote:
> > The current initialization of the per-cpu offset register is difficult
> > to follow and this initialization is not always early enough for
> > upcoming instrumentation with KCSAN, where the instrumentation callbacks
> > use the per-cpu offset.
> >
> > To make it possible to support KCSAN, and to simplify reasoning about
> > early bringup code, let's initialize the per-cpu offset earlier, before
> > we run any C code that may consume it. To do so, this patch adds a new
> > init_this_cpu_offset() helper that's called before the usual
> > primary/secondary start functions. For consistency, this is also used to
> > re-initialize the per-cpu offset after the runtime per-cpu areas have
> > been allocated (which can change CPU0's offset).
> >
> > So that init_this_cpu_offset() isn't subject to any instrumentation that
> > might consume the per-cpu offset, it is marked with noinstr, preventing
> > instrumentation.
> >
> > Signed-off-by: Mark Rutland <[email protected]>
> > Cc: Catalin Marinas <[email protected]>
> > Cc: James Morse <[email protected]>
> > Cc: Will Deacon <[email protected]>
>
> Reverting this commit on top of today's linux-next fixed an issue where
> Thunder X2 is unable to boot:
>
> .config: https://gitlab.com/cailca/linux-mm/-/blob/master/arm64.config
>
> EFI stub: Booting Linux Kernel...
> EFI stub: EFI_RNG_PROTOCOL unavailable, KASLR will be disabled
> EFI stub: Using DTB from configuration table
> EFI stub: Exiting boot services and installing virtual address map...
>
> It hung here for more than 10 minutes, even with "earlycon", before I gave up.
> With the revert, it boots again and prints the following lines almost immediately.
>
> [ 0.000000][ T0] Booting Linux on physical CPU 0x0000000000 [0x431f0af1]
> [ 0.000000][ T0] Linux version 5.9.0-rc8-next-20201008+ (gcc (GCC) 8.3.1 20191121 (Red Hat 8.3.1-5), GNU ld version 2.30-79.el8) #6 SMP Thu Oct 8 20:57:40 EDT 2020
> [ 0.000000][ T0] efi: EFI v2.70 by American Megatrends
> [ 0.000000][ T0] efi: ESRT=0xf9224418 SMBIOS=0xfcca0000 SMBIOS 3.0=0xfcc90000 ACPI 2.0=0xf9720000 MEMRESERVE=0xfc965918
> [ 0.000000][ T0] esrt: Reserving ESRT space from 0x00000000f9224418 to 0x00000000f9224450.
> [ 0.000000][ T0] ACPI: Early table checksum verification disabled
> [ 0.000000][ T0] ACPI: RSDP 0x00000000F9720000 000024 (v02 HPE )
> [ 0.000000][ T0] ACPI: XSDT 0x00000000F9720028 0000DC (v01 HPE ServerCL 01072009 AMI 00010013)
> [ 0.000000][ T0] ACPI: FACP 0x00000000F9720108 000114 (v06 HPE ServerCL 01072009 AMI 00010013)
> [ 0.000000][ T0] ACPI: DSDT 0x00000000F9720220 000714 (v02 HPE ServerCL 20150406 INTL 20170831)
> [ 0.000000][ T0] ACPI: FIDT 0x00000000F9720938 00009C (v01 HPE ServerCL 01072009 AMI 00010013)
Interesting...
Could you provide a disassembly of init_this_cpu_offset() please?
I was hoping to send the 5.10 pull request today, so I'll probably have
to revert this change for now.
Will
Hi Qian,
On Fri, Oct 09, 2020 at 09:51:15AM +0100, Will Deacon wrote:
> On Thu, Oct 08, 2020 at 09:18:24PM -0400, Qian Cai wrote:
> > On Mon, 2020-10-05 at 17:43 +0100, Mark Rutland wrote:
> > > The current initialization of the per-cpu offset register is difficult
> > > to follow and this initialization is not always early enough for
> > > upcoming instrumentation with KCSAN, where the instrumentation callbacks
> > > use the per-cpu offset.
> > >
> > > To make it possible to support KCSAN, and to simplify reasoning about
> > > early bringup code, let's initialize the per-cpu offset earlier, before
> > > we run any C code that may consume it. To do so, this patch adds a new
> > > init_this_cpu_offset() helper that's called before the usual
> > > primary/secondary start functions. For consistency, this is also used to
> > > re-initialize the per-cpu offset after the runtime per-cpu areas have
> > > been allocated (which can change CPU0's offset).
> > >
> > > So that init_this_cpu_offset() isn't subject to any instrumentation that
> > > might consume the per-cpu offset, it is marked with noinstr, preventing
> > > instrumentation.
> > >
> > > Signed-off-by: Mark Rutland <[email protected]>
> > > Cc: Catalin Marinas <[email protected]>
> > > Cc: James Morse <[email protected]>
> > > Cc: Will Deacon <[email protected]>
> >
> > Reverting this commit on top of today's linux-next fixed an issue where
> > Thunder X2 is unable to boot:
> >
> > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/arm64.config
Sorry about this. :/
Will, to save you reading all the below, I think the right thing to do
for now is to revert this.
Building with that config, I see a boot-time hang on the primary CPU
under QEMU TCG, with or without VHE (in both cases, it is stuck in a
recursive synchronous exception). With the patch reverted, the kernel
boots.
Looking at the assembly, task_cpu() gets instrumented (which puts this
patch on dodgy ground generally and I think warrants the revert), but as
it's instrumented with KASAN_INLINE that doesn't immediately explain the
issue since the shadow should be up and so we shouldn't call the report
function. I'll dig into this some more.
Assembly dumps below; for init_this_cpu_offset the reference to
page_wait_table+0x4d00 is generating the address for __per_cpu_offset,
but objdump doesn't have enough info to resolve that nicely.
| ffffa00011143bc0 <init_this_cpu_offset>:
| ffffa00011143bc0: a9bf7bfd stp x29, x30, [sp, #-16]!
| ffffa00011143bc4: d5384100 mrs x0, sp_el0
| ffffa00011143bc8: 910003fd mov x29, sp
| ffffa00011143bcc: 97bb29b9 bl ffffa0001000e2b0 <task_cpu>
| ffffa00011143bd0: f0004ea1 adrp x1, ffffa00011b1a000 <page_wait_table+0x4d00>
| ffffa00011143bd4: 913c6021 add x1, x1, #0xf18
| ffffa00011143bd8: f8605820 ldr x0, [x1, w0, uxtw #3]
| ffffa00011143bdc: 97bb29b1 bl ffffa0001000e2a0 <set_my_cpu_offset>
| ffffa00011143be0: a8c17bfd ldp x29, x30, [sp], #16
| ffffa00011143be4: d65f03c0 ret
| ffffa0001000e2b0 <task_cpu>:
| ffffa0001000e2b0: a9be7bfd stp x29, x30, [sp, #-32]!
| ffffa0001000e2b4: d2d40001 mov x1, #0xa00000000000 // #175921860444160
| ffffa0001000e2b8: f2fbffe1 movk x1, #0xdfff, lsl #48
| ffffa0001000e2bc: 910003fd mov x29, sp
| ffffa0001000e2c0: f9000bf3 str x19, [sp, #16]
| ffffa0001000e2c4: aa0003f3 mov x19, x0
| ffffa0001000e2c8: 91012000 add x0, x0, #0x48
| ffffa0001000e2cc: 52800062 mov w2, #0x3 // #3
| ffffa0001000e2d0: d343fc03 lsr x3, x0, #3
| ffffa0001000e2d4: 38e16861 ldrsb w1, [x3, x1]
| ffffa0001000e2d8: 7100003f cmp w1, #0x0
| ffffa0001000e2dc: 7a411041 ccmp w2, w1, #0x1, ne // ne = any
| ffffa0001000e2e0: 540000aa b.ge ffffa0001000e2f4 <task_cpu+0x44> // b.tcont
| ffffa0001000e2e4: b9404a60 ldr w0, [x19, #72]
| ffffa0001000e2e8: f9400bf3 ldr x19, [sp, #16]
| ffffa0001000e2ec: a8c27bfd ldp x29, x30, [sp], #32
| ffffa0001000e2f0: d65f03c0 ret
| ffffa0001000e2f4: 9415af1f bl ffffa00010579f70 <__asan_report_load4_noabort>
| ffffa0001000e2f8: 17fffffb b ffffa0001000e2e4 <task_cpu+0x34>
| ffffa0001000e2fc: d503201f nop
| ffffa0001000e2a0 <set_my_cpu_offset>:
| ffffa0001000e2a0: d518d080 msr tpidr_el1, x0
| ffffa0001000e2a4: d65f03c0 ret
| ffffa0001000e2a8: d503201f nop
| ffffa0001000e2ac: d503201f nop
Thanks,
Mark.