From: Lai Jiangshan <[email protected]>
Add some C equivalent functions of the ASM macros and implement the whole
error_entry() as C code.
The patches are picked and re-made from the huge patchset
https://lore.kernel.org/lkml/[email protected]/
which converts a large chunk of ASM code to C code.
The C version is generally more readable and easier to update and
improve.
This smaller patchset converts error_entry() only.
The equivalent ASM macros are not removed because they are still used by
the IST exceptions.
No functional change intended and comments are also copied.
The compiler generates very similar code for the C code and the original
ASM code, except for minor differences.
The compiler uses tail-call optimization for the call to sync_regs(): it
emits "JMP sync_regs" while the ASM code uses "CALL+RET".
The compiler generates "AND $0xe7,%ah" (3 bytes) for the code
"cr3 = user_cr3 & ~PTI_USER_PGTABLE_AND_PCID_MASK" while the ASM code in
SWITCH_TO_KERNEL_CR3() results in "AND $0xffffffffffffe7ff,%rax" (6 bytes).
The compiler generates lengthier code for "cr3 |= X86_CR3_PCID_NOFLUSH"
because it uses "MOVABS+OR" (13 bytes) rather than a single
"BTS" (5 bytes).
The code generated from ALTERNATIVE and from static_cpu_has() also
differs, depending on which alternative instructions ALTERNATIVE uses.
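For reference, the sequences being compared look roughly like this (a
condensed sketch: the ASM side paraphrases SWITCH_TO_KERNEL_CR3 from
calling.h rather than quoting the literal macro expansion, and the exact
encodings depend on compiler and config):

	/* C version (switch_to_kernel_cr3(), patch 5) */
	cr3 = user_cr3 & ~PTI_USER_PGTABLE_AND_PCID_MASK;	/* AND $0xe7,%ah (3 bytes) */
	if (static_cpu_has(X86_FEATURE_PCID))
		cr3 |= X86_CR3_PCID_NOFLUSH;			/* MOVABS+OR (13 bytes) */

	/* ASM version (SWITCH_TO_KERNEL_CR3, condensed) */
	bts	$X86_CR3_PCID_NOFLUSH_BIT, %rax			/* BTS (5 bytes), via ALTERNATIVE */
	andq	$(~PTI_USER_PGTABLE_AND_PCID_MASK), %rax	/* AND (6 bytes) */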
[V2]: https://lore.kernel.org/lkml/[email protected]/
[V1]: https://lore.kernel.org/lkml/[email protected]/
Changed from V2:
Fix conflict in arch/x86/include/asm/proto.h in patch7
Changed from V1:
remove unneeded cleanup in patch2
Changed from the old huge patchset:
squash some patches
Lai Jiangshan (7):
x86/entry: Introduce __entry_text for entry code written in C
x86/entry: Move PTI_USER_* to arch/x86/include/asm/processor-flags.h
x86: Mark __native_read_cr3() & native_write_cr3() as __always_inline
x86/entry: Add arch/x86/entry/entry64.c for C entry code
x86/entry: Add the C version of SWITCH_TO_KERNEL_CR3 as
switch_to_kernel_cr3()
x86/traps: Add fence_swapgs_{user,kernel}_entry() and
user_entry_swapgs_and_fence()
x86/entry: Implement the whole error_entry() as C code
arch/x86/entry/Makefile | 3 +-
arch/x86/entry/calling.h | 10 --
arch/x86/entry/entry64.c | 137 +++++++++++++++++++++++++
arch/x86/entry/entry_64.S | 85 +--------------
arch/x86/include/asm/idtentry.h | 3 +
arch/x86/include/asm/processor-flags.h | 15 +++
arch/x86/include/asm/proto.h | 1 +
arch/x86/include/asm/special_insns.h | 4 +-
arch/x86/include/asm/traps.h | 1 +
arch/x86/kernel/traps.c | 2 -
include/linux/compiler_types.h | 8 +-
11 files changed, 169 insertions(+), 100 deletions(-)
create mode 100644 arch/x86/entry/entry64.c
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
Some entry code will be implemented in C files.
Introduce __entry_text to set them in .entry.text section.
The new __entry_text disables instrumentation in the same way as noinstr,
so __noinstr_section() is added and shared by noinstr and the new
__entry_text.
Note: entry code cannot access %gs before the GS base is switched to the
kernel GS base, so the stack protector cannot be used in the C entry
code. __entry_text does not disable the stack protector, however, since
some compilers lack a function-level attribute for disabling it; it is
disabled at the file level instead.
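For illustration, a C entry function is then marked along these lines
(this is how error_entry() is declared later in this series):

	asmlinkage __visible __entry_text
	struct pt_regs *error_entry(struct pt_regs *eregs)
	{
		...
	}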
Cc: Borislav Petkov <[email protected]>
Reviewed-by: Miguel Ojeda <[email protected]>
Reviewed-by: Kees Cook <[email protected]>
Suggested-by: Nick Desaulniers <[email protected]>
Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/include/asm/idtentry.h | 3 +++
include/linux/compiler_types.h | 8 +++++---
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 72184b0b2219..acc4c99f801c 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -13,6 +13,9 @@
#include <asm/irq_stack.h>
+/* Entry code written in C. */
+#define __entry_text __noinstr_section(".entry.text")
+
/**
* DECLARE_IDTENTRY - Declare functions for simple IDT entry points
* No error code pushed by hardware
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index d08dfcb0ac68..bd9d9d19dc9b 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -225,9 +225,11 @@ struct ftrace_likely_data {
#endif
/* Section for code which can't be instrumented at all */
-#define noinstr \
- noinline notrace __attribute((__section__(".noinstr.text"))) \
- __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage
+#define __noinstr_section(section) \
+ noinline notrace __section(section) __no_kcsan \
+ __no_sanitize_address __no_profile __no_sanitize_coverage
+
+#define noinstr __noinstr_section(".noinstr.text")
#endif /* __KERNEL__ */
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
These constants will also be used in C code.
Move them to arch/x86/include/asm/processor-flags.h, which already
defines the related X86_CR3_PTI_PCID_USER_BIT.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/calling.h | 10 ----------
arch/x86/include/asm/processor-flags.h | 15 +++++++++++++++
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 29b36e9e4e74..331a44994cc0 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -142,16 +142,6 @@ For 32-bit we have the following conventions - kernel is built with
#ifdef CONFIG_PAGE_TABLE_ISOLATION
-/*
- * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
- * halves:
- */
-#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
-#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT)
-#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT
-#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
-#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
-
.macro SET_NOFLUSH_BIT reg:req
bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 02c2cbda4a74..4dd2fbbc861a 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -4,6 +4,7 @@
#include <uapi/asm/processor-flags.h>
#include <linux/mem_encrypt.h>
+#include <asm/page_types.h>
#ifdef CONFIG_VM86
#define X86_VM_MASK X86_EFLAGS_VM
@@ -50,7 +51,21 @@
#endif
#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
# define X86_CR3_PTI_PCID_USER_BIT 11
+
+#ifdef CONFIG_X86_64
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
+#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT)
+#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT
+#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
+#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
+#endif
+
#endif
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
Mark __native_read_cr3() & native_write_cr3() as __always_inline so
that, when the caller is non-instrumentable code in the .entry.text
section, they are inlined there as well rather than instrumented or
placed in another section.
This prepares for __native_read_cr3() and native_write_cr3() to be used
in the C entry code that handles KPTI.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/include/asm/special_insns.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 45b18eb94fa1..dbaee50abb3c 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -42,14 +42,14 @@ static __always_inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val) : "memory");
}
-static inline unsigned long __native_read_cr3(void)
+static __always_inline unsigned long __native_read_cr3(void)
{
unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER);
return val;
}
-static inline void native_write_cr3(unsigned long val)
+static __always_inline void native_write_cr3(unsigned long val)
{
asm volatile("mov %0,%%cr3": : "r" (val) : "memory");
}
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
Add a C file, "entry64.c", to hold the C entry code for traps and faults,
which will follow the same logic as the existing ASM code in entry_64.S.
The file is as low level as entry_64.S and its code may run in
environments where the GS base is a user-controlled value, the CR3 is the
KPTI user CR3, or both.
None of the code in this file should be instrumentable. Most
instrumentation facilities can be disabled with the per-function
attributes included in the __noinstr_section macro, but some compilers
cannot disable the stack protector at function granularity, so it is
disabled for the whole file in the Makefile.
Suggested-by: Joerg Roedel <[email protected]>
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/Makefile | 3 ++-
arch/x86/entry/entry64.c | 14 ++++++++++++++
2 files changed, 16 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/entry/entry64.c
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 7fec5dcf6438..792f7009ff32 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -10,13 +10,14 @@ KCOV_INSTRUMENT := n
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_common.o += -fno-stack-protector
+CFLAGS_entry64.o += -fno-stack-protector
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
obj-y += common.o
+obj-$(CONFIG_X86_64) += entry64.o
obj-y += vdso/
obj-y += vsyscall/
obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
-
diff --git a/arch/x86/entry/entry64.c b/arch/x86/entry/entry64.c
new file mode 100644
index 000000000000..ace73861c2a0
--- /dev/null
+++ b/arch/x86/entry/entry64.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek <[email protected]>
+ * Copyright (C) 2022 Lai Jiangshan, Ant Group
+ *
+ * Handle entries and exits for hardware traps and faults.
+ *
+ * It is as low level as entry_64.S and its code can be running in the
+ * environments that the GS base is a user controlled value, or the CR3
+ * is the PTI user CR3 or both.
+ */
+#include <asm/traps.h>
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
Add the C version switch_to_kernel_cr3() which implements the macro
SWITCH_TO_KERNEL_CR3() in arch/x86/entry/calling.h.
No functional difference intended.
Note:
The compiler generates "AND $0xe7,%ah" (3 bytes) for the code
"cr3 = user_cr3 & ~PTI_USER_PGTABLE_AND_PCID_MASK" while the ASM code in
SWITCH_TO_KERNEL_CR3() results in "AND $0xffffffffffffe7ff,%rax" (6 bytes).
The compiler generates lengthier code for "cr3 |= X86_CR3_PCID_NOFLUSH"
because it uses "MOVABS+OR" (13 bytes) rather than a single
"BTS" (5 bytes).
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/entry64.c | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/arch/x86/entry/entry64.c b/arch/x86/entry/entry64.c
index ace73861c2a0..bd77cc8373ce 100644
--- a/arch/x86/entry/entry64.c
+++ b/arch/x86/entry/entry64.c
@@ -12,3 +12,27 @@
* is the PTI user CR3 or both.
*/
#include <asm/traps.h>
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static __always_inline void pti_switch_to_kernel_cr3(unsigned long user_cr3)
+{
+ /*
+ * Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3
+ * at kernel pagetables:
+ */
+ unsigned long cr3 = user_cr3 & ~PTI_USER_PGTABLE_AND_PCID_MASK;
+
+ if (static_cpu_has(X86_FEATURE_PCID))
+ cr3 |= X86_CR3_PCID_NOFLUSH;
+
+ native_write_cr3(cr3);
+}
+
+static __always_inline void switch_to_kernel_cr3(void)
+{
+ if (static_cpu_has(X86_FEATURE_PTI))
+ pti_switch_to_kernel_cr3(__native_read_cr3());
+}
+#else
+static __always_inline void switch_to_kernel_cr3(void) {}
+#endif
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
Add the C versions fence_swapgs_{user,kernel}_entry() in entry64.c,
which are equivalent to the ASM macros FENCE_SWAPGS_{USER,KERNEL}_ENTRY.
fence_swapgs_user_entry is used in the user entry swapgs code path,
to prevent a speculative swapgs when coming from kernel space.
fence_swapgs_kernel_entry is used in the kernel entry code path,
to prevent the swapgs from getting speculatively skipped when
coming from user space.
Add the C user_entry_swapgs_and_fence() which implements the ASM code:
swapgs
FENCE_SWAPGS_USER_ENTRY
It will be used in the user entry swapgs code path, doing the swapgs and
lfence to prevent a speculative swapgs when coming from kernel space.
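As a usage sketch, the user-mode path of the C error_entry() added later
in this series combines these helpers as:

	user_entry_swapgs_and_fence();
	switch_to_kernel_cr3();

	/* Put pt_regs onto the task stack. */
	return sync_regs(eregs);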
Cc: Josh Poimboeuf <[email protected]>
Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/entry64.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/arch/x86/entry/entry64.c b/arch/x86/entry/entry64.c
index bd77cc8373ce..f7f23800cee4 100644
--- a/arch/x86/entry/entry64.c
+++ b/arch/x86/entry/entry64.c
@@ -36,3 +36,33 @@ static __always_inline void switch_to_kernel_cr3(void)
#else
static __always_inline void switch_to_kernel_cr3(void) {}
#endif
+
+/*
+ * Mitigate Spectre v1 for conditional swapgs code paths.
+ *
+ * fence_swapgs_user_entry is used in the user entry swapgs code path, to
+ * prevent a speculative swapgs when coming from kernel space. It must be
+ * used with switch_to_kernel_cr3() in the same path.
+ *
+ * fence_swapgs_kernel_entry is used in the kernel entry code path without
+ * CR3 write or with conditional CR3 write only, to prevent the swapgs from
+ * getting speculatively skipped when coming from user space.
+ *
+ * user_entry_swapgs_and_fence is a wrapper of swapgs and fence for user entry
+ * code path.
+ */
+static __always_inline void fence_swapgs_user_entry(void)
+{
+ alternative("", "lfence", X86_FEATURE_FENCE_SWAPGS_USER);
+}
+
+static __always_inline void fence_swapgs_kernel_entry(void)
+{
+ alternative("", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL);
+}
+
+static __always_inline void user_entry_swapgs_and_fence(void)
+{
+ native_swapgs();
+ fence_swapgs_user_entry();
+}
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
Implement error_entry() as C code in arch/x86/entry/entry64.c and
replace the ASM version of error_entry().
The CR3 and the GS base might still be the user values at the start of
the function, so it calls only __always_inline C functions until the GS
base and CR3 have been switched.
No functional change intended and comments are also copied.
The C version is generally more readable and easier to update and
improve.
Note:
To avoid using goto, the C code has two call sites of sync_regs():
it calls sync_regs() directly after fixup_bad_iret() returns, while the
ASM code uses a JMP instruction to jump back to the first call site.
The compiler uses tail-call optimization for the call to sync_regs(): it
emits "JMP sync_regs" while the ASM code uses "CALL+RET".
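Condensed from the diff below (branch conditions paraphrased), the two
call sites are:

	if (user_mode(eregs)) {
		/* ... swapgs + CR3 switch ... */
		return sync_regs(eregs);			/* call site 1 */
	}

	if (/* faulted on IRET to user mode */) {
		/* ... swapgs + CR3 switch ... */
		return sync_regs(fixup_bad_iret(eregs));	/* call site 2 */
	}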
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/entry64.c | 69 +++++++++++++++++++++++++++++
arch/x86/entry/entry_64.S | 85 ++----------------------------------
arch/x86/include/asm/proto.h | 1 +
arch/x86/include/asm/traps.h | 1 +
arch/x86/kernel/traps.c | 2 -
5 files changed, 74 insertions(+), 84 deletions(-)
diff --git a/arch/x86/entry/entry64.c b/arch/x86/entry/entry64.c
index f7f23800cee4..bd047c329622 100644
--- a/arch/x86/entry/entry64.c
+++ b/arch/x86/entry/entry64.c
@@ -13,6 +13,8 @@
*/
#include <asm/traps.h>
+extern unsigned char asm_load_gs_index_gs_change[];
+
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static __always_inline void pti_switch_to_kernel_cr3(unsigned long user_cr3)
{
@@ -66,3 +68,70 @@ static __always_inline void user_entry_swapgs_and_fence(void)
native_swapgs();
fence_swapgs_user_entry();
}
+
+/*
+ * Put pt_regs onto the task stack and switch GS and CR3 if needed.
+ * The actual stack switch is done in entry_64.S.
+ *
+ * Be careful, it might be in the user CR3 and user GS base at the start
+ * of the function.
+ */
+asmlinkage __visible __entry_text
+struct pt_regs *error_entry(struct pt_regs *eregs)
+{
+ unsigned long iret_ip = (unsigned long)native_irq_return_iret;
+
+ if (user_mode(eregs)) {
+ /*
+ * We entered from user mode.
+ * Switch to kernel gsbase and CR3.
+ */
+ user_entry_swapgs_and_fence();
+ switch_to_kernel_cr3();
+
+ /* Put pt_regs onto the task stack. */
+ return sync_regs(eregs);
+ }
+
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
+ if ((eregs->ip == iret_ip) || (eregs->ip == (unsigned int)iret_ip)) {
+ eregs->ip = iret_ip; /* Fix truncated RIP */
+
+ /*
+ * We came from an IRET to user mode, so we have user
+ * gsbase and CR3. Switch to kernel gsbase and CR3:
+ */
+ user_entry_swapgs_and_fence();
+ switch_to_kernel_cr3();
+
+ /*
+ * Pretend that the exception came from user mode: set up
+ * pt_regs as if we faulted immediately after IRET and then
+ * put pt_regs onto the real task stack.
+ */
+ return sync_regs(fixup_bad_iret(eregs));
+ }
+
+ /*
+ * Hack: asm_load_gs_index_gs_change can fail with user gsbase.
+ * If this happens, fix up gsbase and proceed. We'll fix up the
+ * exception and land in asm_load_gs_index_gs_change's error
+ * handler with kernel gsbase.
+ */
+ if (eregs->ip == (unsigned long)asm_load_gs_index_gs_change)
+ native_swapgs();
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether
+ * it is a kernel or user gsbase.
+ */
+ fence_swapgs_kernel_entry();
+
+ /* Enter from kernel, don't move pt_regs */
+ return eregs;
+}
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4300ba49b5ee..f8322398fe1c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -348,7 +348,7 @@ SYM_CODE_END(push_and_clear_regs)
* own pvops for IRET and load_gs_index(). And it doesn't need to
* switch the CR3. So it can skip invoking error_entry().
*/
- ALTERNATIVE "call error_entry; movq %rax, %rsp", \
+ ALTERNATIVE "movq %rsp, %rdi; call error_entry; movq %rax, %rsp", \
"", X86_FEATURE_XENPV
ENCODE_FRAME_POINTER
@@ -784,7 +784,7 @@ _ASM_NOKPROBE(common_interrupt_return)
SYM_FUNC_START(asm_load_gs_index)
FRAME_BEGIN
swapgs
-.Lgs_change:
+SYM_INNER_LABEL(asm_load_gs_index_gs_change, SYM_L_GLOBAL)
ANNOTATE_NOENDBR // error_entry
movl %edi, %gs
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
@@ -805,7 +805,7 @@ SYM_FUNC_START(asm_load_gs_index)
movl %eax, %gs
jmp 2b
- _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
+ _ASM_EXTABLE(asm_load_gs_index_gs_change, .Lbad_gs)
SYM_FUNC_END(asm_load_gs_index)
EXPORT_SYMBOL(asm_load_gs_index)
@@ -1012,85 +1012,6 @@ SYM_CODE_START_LOCAL(paranoid_exit)
jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
-/*
- * Switch GS and CR3 if needed.
- */
-SYM_CODE_START_LOCAL(error_entry)
- UNWIND_HINT_FUNC
- testb $3, CS+8(%rsp)
- jz .Lerror_kernelspace
-
- /*
- * We entered from user mode or we're pretending to have entered
- * from user mode due to an IRET fault.
- */
- swapgs
- FENCE_SWAPGS_USER_ENTRY
- /* We have user CR3. Change to kernel CR3. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
-
- leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
-.Lerror_entry_from_usermode_after_swapgs:
- /* Put us onto the real thread stack. */
- call sync_regs
- RET
-
- /*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
-.Lerror_kernelspace:
- leaq native_irq_return_iret(%rip), %rcx
- cmpq %rcx, RIP+8(%rsp)
- je .Lerror_bad_iret
- movl %ecx, %eax /* zero extend */
- cmpq %rax, RIP+8(%rsp)
- je .Lbstep_iret
- cmpq $.Lgs_change, RIP+8(%rsp)
- jne .Lerror_entry_done_lfence
-
- /*
- * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
- * gsbase and proceed. We'll fix up the exception and land in
- * .Lgs_change's error handler with kernel gsbase.
- */
- swapgs
-
- /*
- * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
- * kernel or user gsbase.
- */
-.Lerror_entry_done_lfence:
- FENCE_SWAPGS_KERNEL_ENTRY
- leaq 8(%rsp), %rax /* return pt_regs pointer */
- RET
-
-.Lbstep_iret:
- /* Fix truncated RIP */
- movq %rcx, RIP+8(%rsp)
- /* fall through */
-
-.Lerror_bad_iret:
- /*
- * We came from an IRET to user mode, so we have user
- * gsbase and CR3. Switch to kernel gsbase and CR3:
- */
- swapgs
- FENCE_SWAPGS_USER_ENTRY
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
-
- /*
- * Pretend that the exception came from user mode: set up pt_regs
- * as if we faulted immediately after IRET.
- */
- leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
- call fixup_bad_iret
- mov %rax, %rdi
- jmp .Lerror_entry_from_usermode_after_swapgs
-SYM_CODE_END(error_entry)
-
SYM_CODE_START_LOCAL(error_return)
UNWIND_HINT_REGS
DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 12ef86b19910..199d27fbf903 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -15,6 +15,7 @@ void entry_SYSCALL_64(void);
void entry_SYSCALL_64_safe_stack(void);
void entry_SYSRETQ_unsafe_stack(void);
void entry_SYSRETQ_end(void);
+extern unsigned char native_irq_return_iret[];
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 47ecfff2c83d..2d00100d3e03 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -14,6 +14,7 @@
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
+asmlinkage __visible notrace struct pt_regs *error_entry(struct pt_regs *eregs);
void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d62b2cb85cea..f76a15f654c5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -436,8 +436,6 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
#endif
#ifdef CONFIG_X86_ESPFIX64
- extern unsigned char native_irq_return_iret[];
-
/*
* If IRET takes a non-IST fault on the espfix64 stack, then we
* end up promoting it to a doublefault. In that case, take
--
2.19.1.6.gb485710b
On 6/6/22 16:45, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> Add some C equivalent functions of the ASM macros and implement the whole
> error_entry() as C code.
Hi,
I did some testing of your patches (on top of mainline commit
34f4335c16a5) and I see these two KASAN reports very occasionally during
boot:
1)
Mountpoint-cache hash table entries: 2048 (order: 2, 16384 bytes, linear)
==================================================================
BUG: KASAN: wild-memory-access in rcu_nmi_enter+0x6e/0xf0
Read of size 4 at addr ff11000034e38b10 by task swapper/0/0
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.19.0-rc1-14003-ga9afe081e27d
#1787
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.13.0-1ubuntu1.1 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x57/0x7d
? rcu_nmi_enter+0x6e/0xf0
print_report.cold+0x188/0x667
? rcu_nmi_enter+0x6e/0xf0
kasan_report+0x8a/0x1b0
? rcu_nmi_enter+0x6e/0xf0
kasan_check_range+0x14d/0x1d0
rcu_nmi_enter+0x6e/0xf0
irqentry_enter+0x33/0x50
common_interrupt+0x15/0xc0
asm_common_interrupt+0x2a/0x40
RIP: 0010:identify_cpu+0x3db/0x19f0
Code: b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 d3 14
00 00 41 c7 84 24 80 00 00 00 00 00 00 00 31 c0 89 c1 0f a2 <41> 89 54
24 7c 4c 89 e2 41 89 44 24 1c 48 c1 ea 03 48 b8 00 00 00
RSP: 0000:ffffffff94807da0 EFLAGS: 00000246
RAX: 000000000000001b RBX: 00000000756e6547 RCX: 000000006c65746e
RDX: 0000000049656e69 RSI: 0000000000000000 RDI: ffffffff9509e6e0
RBP: ffffffff9509e675 R08: ffffffff9509e67c R09: 0000000000000000
R10: ffffffff9509e668 R11: fffffbfff2a13cce R12: ffffffff9509e660
R13: ffffffff9509e6e8 R14: ffffffff9509e678 R15: ffffffff9509e674
identify_boot_cpu+0xd/0xb5
check_bugs+0x82/0x15b5
? l1tf_cmdline+0x10c/0x10c
? do_raw_spin_unlock+0x4f/0x250
? _raw_spin_unlock+0x24/0x40
? poking_init+0x350/0x37f
? parse_direct_gbpages_off+0xd/0xd
? rcu_read_lock_bh_held+0xc0/0xc0
start_kernel+0x38c/0x3bb
secondary_startup_64_no_verify+0xe0/0xeb
</TASK>
==================================================================
Disabling lock debugging due to kernel taint
x86/cpu: User Mode Instruction Prevention (UMIP) activated
numa_add_cpu cpu 0 node 0: mask now 0
2)
x86/cpu: User Mode Instruction Prevention (UMIP) activated
numa_add_cpu cpu 0 node 0: mask now 0
Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0, 1GB 0
Spectre V1 : Vulnerable: __user pointer sanitization and usercopy
barriers only; no swapgs barriers
Spectre V2 : Mitigation: Enhanced IBRS
Spectre V2 : Spectre v2 / SpectreRSB mitigation: Filling RSB on context
switch
Spectre V2 : mitigation: Enabling conditional Indirect Branch Prediction
Barrier
Speculative Store Bypass: Mitigation: Speculative Store Bypass disabled
TAA: Mitigation: TSX disabled
==================================================================
BUG: KASAN: out-of-bounds in rcu_nmi_enter+0x6e/0xf0
Read of size 4 at addr ff11000034e38b10 by task swapper/0/0
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.19.0-rc1-14003-ga9afe081e27d
#1787
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.13.0-1ubuntu1.1 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x57/0x7d
print_report.cold+0x9c/0x667
? rcu_nmi_enter+0x6e/0xf0
kasan_report+0x8a/0x1b0
? rcu_nmi_enter+0x6e/0xf0
kasan_check_range+0x14d/0x1d0
rcu_nmi_enter+0x6e/0xf0
irqentry_enter+0x33/0x50
common_interrupt+0x15/0xc0
asm_common_interrupt+0x2a/0x40
RIP: 0010:text_poke_early+0x97/0x9e
Code: 8e 00 f8 fd 48 85 db 74 01 fb eb 05 0f 1f 00 eb 16 8c d0 50 54 48
83 04 24 08 9c 8c c8 50 68 13 ec 12 b1 48 cf eb 03 0f 01 e8 <5b> 5d 41
5c 41 5d c3 41 57 b8 ff ff 37 00 41 56 48 c1 e0 2a 41 55
RSP: 0000:ffffffffb0007c10 EFLAGS: 00000292
RAX: 0000000000000010 RBX: 0000000000000200 RCX: 1ffffffff61137c9
RDX: 0000000000000000 RSI: ffffffffaf69f720 RDI: ffffffffaf7f9ce0
RBP: ffffffffacc6902c R08: 0000000000000001 R09: 0000000000000001
R10: ffffffffacc69030 R11: fffffbfff598d206 R12: ffffffffb0007c80
R13: 0000000000000005 R14: ffffffffb0007c85 R15: ffffffffb0007c80
? kasan_check_range+0x1c/0x1d0
? kasan_check_range+0x20/0x1d0
? kasan_check_range+0x1c/0x1d0
apply_alternatives+0x79e/0x81d
? text_poke_early+0x9e/0x9e
? xas_clear_mark+0x1df/0x270
? apply_retpolines+0x4e4/0x535
? apply_alternatives+0x81d/0x81d
? delay_halt+0x31/0x60
? delay_halt+0x40/0x60
? delay_halt+0x36/0x60
? optimize_nops+0x225/0x225
alternative_instructions+0x43/0x11a
check_bugs+0x14fb/0x15b5
? l1tf_cmdline+0x10c/0x10c
? do_raw_spin_unlock+0x4f/0x250
? quirk_disable_msi.part.0+0x72/0x73
? poking_init+0x350/0x37f
? parse_direct_gbpages_off+0xd/0xd
? rcu_read_lock_bh_held+0xc0/0xc0
start_kernel+0x38c/0x3bb
secondary_startup_64_no_verify+0xe0/0xeb
</TASK>
The buggy address belongs to the physical page:
page:ffd4000000d38e00 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x34e38
flags: 0x100000000001000(reserved|node=0|zone=1)
raw: 0100000000001000 ffd4000000d38e08 ffd4000000d38e08 0000000000000000
raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ff11000034e38a00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
ff11000034e38a80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>ff11000034e38b00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
^
ff11000034e38b80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
ff11000034e38c00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================
Disabling lock debugging due to kernel taint
debug: unmapping init [mem 0xffffffffb12b8000-0xffffffffb12c2fff]
smpboot: CPU0: Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz (family:
0x6, model: 0x6a, stepping: 0x6)
I'll try to get more details (boot options, configs, full logs, etc.); I
just wanted to give a heads up in case anything sticks out.
Vegard
On 6/9/22 11:11, Vegard Nossum wrote:
> On 6/6/22 16:45, Lai Jiangshan wrote:
>> From: Lai Jiangshan <[email protected]>
>>
>> Add some C equivalent functions of the ASM macros and implement the whole
>> error_entry() as C code.
>
> Hi,
>
> I did some testing of your patches (on top of mainline commit
> 34f4335c16a5) and I see these two KASAN reports very occasionally during
> boot:
>
> 1)
>
> Mountpoint-cache hash table entries: 2048 (order: 2, 16384 bytes, linear)
> ==================================================================
> BUG: KASAN: wild-memory-access in rcu_nmi_enter+0x6e/0xf0
So this one I get without your patches as well. It's only about 1% of
boots, though. Let me try to bisect this and start a new thread.
> 2)
>
> BUG: KASAN: out-of-bounds in rcu_nmi_enter+0x6e/0xf0
> Read of size 4 at addr ff11000034e38b10 by task swapper/0/0
I haven't seen this without your patches, although it's the exact same
callsite so I assume it must be related to the first problem.
Vegard
On Mon, Jun 6, 2022 at 10:44 PM Lai Jiangshan <[email protected]> wrote:
>
> From: Lai Jiangshan <[email protected]>
>
> Add some C equivalent functions of the ASM macros and implement the whole
> error_entry() as C code.
>
> The patches are picked and re-made from the huge patchset
> https://lore.kernel.org/lkml/[email protected]/
> which converts a large chunk of ASM code to C code.
>
> The C version is generally more readable and easier to update and
> improve.
>
> This smaller patchset converts error_entry() only.
> The equivalent ASM macros are not removed because they are still used by
> the IST exceptions.
>
> No functional change intended and comments are also copied.
>
> The compiler generates very similar code for the C code and the original
> ASM code, except for minor differences.
>
> The compiler uses tail-call optimization for the call to sync_regs(): it
> emits "JMP sync_regs" while the ASM code uses "CALL+RET".
>
> The compiler generates "AND $0xe7,%ah" (3 bytes) for the code
> "cr3 = user_cr3 & ~PTI_USER_PGTABLE_AND_PCID_MASK" while the ASM code in
> SWITCH_TO_KERNEL_CR3() results in "AND $0xffffffffffffe7ff,%rax" (6 bytes).
>
> The compiler generates lengthier code for "cr3 |= X86_CR3_PCID_NOFLUSH"
> because it uses "MOVABS+OR" (13 bytes) rather than a single
> "BTS" (5 bytes).
>
> The code generated from ALTERNATIVE and from static_cpu_has() also
> differs, depending on which alternative instructions ALTERNATIVE uses.
>
> [V2]: https://lore.kernel.org/lkml/[email protected]/
> [V1]: https://lore.kernel.org/lkml/[email protected]/
>
> Changed from V2:
> Fix conflict in arch/x86/include/asm/proto.h in patch7
>
> Changed from V1:
> remove unneeded cleanup in patch2
>
> Changed from the old huge patchset:
> squash some patches
>
Hello, ALL,
Ping
Thanks,
Lai