2021-12-04 13:54:22

by Peter Zijlstra

Subject: [PATCH v2 6/6] x86: Add straight-line-speculation mitigation

Make use of an upcoming GCC feature to mitigate
straight-line-speculation for x86:

https://gcc.gnu.org/g:53a643f8568067d7700a9f2facc8ba39974973d3
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102952
https://bugs.llvm.org/show_bug.cgi?id=52323
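
As an illustration (a hand-written sketch, not actual compiler output):
with -mharden-sls=all the compiler emits a trapping instruction in the
fall-through path after every 'ret' and indirect 'jmp', so a core
speculating straight past the control transfer hits int3 instead of
executing whatever bytes happen to follow:

	example:			# hypothetical function epilogue
		...
		ret
		int3			# never reached architecturally; stops SLS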

It's build-tested on x86_64-allyesconfig using GCC-12 and GCC-11.

Maintenance overhead of this should be fairly low due to objtool
validation.

Size overhead of all these additional int3 instructions comes to:

   text    data     bss      dec     hex filename
22267751 6933356 2011368 31212475 1dc43bb defconfig-build/vmlinux
22804126 6933356 1470696 31208178 1dc32f2 defconfig-build/vmlinux.sls

Or roughly 2.4% additional text.
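(That is, 22804126 - 22267751 = 536375 extra bytes of text, and
536375 / 22267751 comes out to about 2.4%.)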

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/Kconfig | 12 ++++++++++++
arch/x86/Makefile | 4 ++++
arch/x86/include/asm/linkage.h | 10 ++++++++++
arch/x86/include/asm/static_call.h | 2 +-
arch/x86/kernel/ftrace.c | 2 +-
arch/x86/kernel/static_call.c | 5 +++--
arch/x86/lib/memmove_64.S | 2 +-
arch/x86/lib/retpoline.S | 2 +-
scripts/Makefile.build | 3 ++-
scripts/link-vmlinux.sh | 3 +++
10 files changed, 38 insertions(+), 7 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -472,6 +472,18 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.

+config CC_HAS_SLS
+ def_bool $(cc-option,-mharden-sls=all)
+
+config SLS
+ bool "Mitigate Straight-Line-Speculation"
+ depends on CC_HAS_SLS && X86_64
+ default n
+ help
+ Compile the kernel with straight-line-speculation options to guard
+ against straight line speculation. The kernel image might be slightly
+ larger.
+
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -179,6 +179,10 @@ ifdef CONFIG_RETPOLINE
endif
endif

+ifdef CONFIG_SLS
+ KBUILD_CFLAGS += -mharden-sls=all
+endif
+
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)

ifdef CONFIG_LTO_CLANG
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -18,9 +18,19 @@
#define __ALIGN_STR __stringify(__ALIGN)
#endif

+#ifdef CONFIG_SLS
+#define RET ret; int3
+#else
+#define RET ret
+#endif
+
#else /* __ASSEMBLY__ */

+#ifdef CONFIG_SLS
+#define ASM_RET "ret; int3\n\t"
+#else
#define ASM_RET "ret\n\t"
+#endif

#endif /* __ASSEMBLY__ */
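
To make the intended usage concrete (a hypothetical example, not part of
the patch): assembly code writes RET where it previously wrote a bare
ret, and C inline assembly uses ASM_RET, so both pick up the trailing
int3 only when CONFIG_SLS is enabled:

	SYM_FUNC_START(example_func)	# hypothetical .S function
		movq	%rdi, %rax
		RET			# 'ret; int3' under CONFIG_SLS, plain 'ret' otherwise
	SYM_FUNC_END(example_func)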

--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -36,7 +36,7 @@
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")

#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
- __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")


#define ARCH_ADD_TRAMP_KEY(name) \
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -303,7 +303,7 @@ union ftrace_op_code_union {
} __attribute__((packed));
};

-#define RET_SIZE 1
+#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS)

static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -17,6 +17,8 @@ enum insn_type {
*/
static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };

+static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
const void *emulate = NULL;
@@ -42,8 +44,7 @@ static void __ref __static_call_transfor
break;

case RET:
- code = text_gen_insn(RET_INSN_OPCODE, insn, func);
- size = RET_INSN_SIZE;
+ code = &retinsn;
break;
}
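
Decoded by hand (standard x86 opcodes, shown for reference): retinsn is
a one-byte ret followed by four int3 bytes, so it fills the entire
5-byte slot reserved at a static call site:

	c3		# ret  (RET_INSN_OPCODE)
	cc cc cc cc	# 4x int3 -- pads the patch slot and traps SLS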

--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -40,7 +40,7 @@ SYM_FUNC_START(__memmove)
/* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
- ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; RET", X86_FEATURE_ERMS
+ ALTERNATIVE "", __stringify(movq %rdx, %rcx; rep movsb; RET), X86_FEATURE_ERMS

/*
* movsq instruction have many startup latency
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -34,7 +34,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\re

ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
__stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD

.endm
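
Spelled out (a sketch of what the X86_FEATURE_RETPOLINE_AMD alternative
expands to after this change, using %rax as an example register):

	lfence			# speculation cannot proceed past this point
	jmp	*%rax		# the actual indirect branch
	int3			# traps straight-line speculation past the jmp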

--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -234,7 +234,8 @@ objtool_args = \
$(if $(CONFIG_GCOV_KERNEL)$(CONFIG_LTO_CLANG), --no-unreachable)\
$(if $(CONFIG_RETPOLINE), --retpoline) \
$(if $(CONFIG_X86_SMAP), --uaccess) \
- $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount)
+ $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount) \
+ $(if $(CONFIG_SLS), --sls)

cmd_objtool = $(if $(objtool-enabled), ; $(objtool) $(objtool_args) $@)
cmd_gen_objtooldep = $(if $(objtool-enabled), { echo ; echo '$@: $$(wildcard $(objtool))' ; } >> $(dot-target).cmd)
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -139,6 +139,9 @@ objtool_link()
if [ -n "${CONFIG_X86_SMAP}" ]; then
objtoolopt="${objtoolopt} --uaccess"
fi
+ if [ -n "${CONFIG_SLS}" ]; then
+ objtoolopt="${objtoolopt} --sls"
+ fi
info OBJTOOL ${1}
tools/objtool/objtool ${objtoolcmd} ${objtoolopt} ${1}
fi

Subject: [tip: x86/core] x86: Add straight-line-speculation mitigation

The following commit has been merged into the x86/core branch of tip:

Commit-ID: e463a09af2f0677b9485a7e8e4e70b396b2ffb6f
Gitweb: https://git.kernel.org/tip/e463a09af2f0677b9485a7e8e4e70b396b2ffb6f
Author: Peter Zijlstra <[email protected]>
AuthorDate: Sat, 04 Dec 2021 14:43:44 +01:00
Committer: Borislav Petkov <[email protected]>
CommitterDate: Thu, 09 Dec 2021 13:32:25 +01:00

x86: Add straight-line-speculation mitigation

Make use of an upcoming GCC feature to mitigate
straight-line-speculation for x86:

https://gcc.gnu.org/g:53a643f8568067d7700a9f2facc8ba39974973d3
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102952
https://bugs.llvm.org/show_bug.cgi?id=52323

It's build-tested on x86_64-allyesconfig using GCC-12 and GCC-11.

Maintenance overhead of this should be fairly low due to objtool
validation.

Size overhead of all these additional int3 instructions comes to:

   text    data     bss      dec     hex filename
22267751 6933356 2011368 31212475 1dc43bb defconfig-build/vmlinux
22804126 6933356 1470696 31208178 1dc32f2 defconfig-build/vmlinux.sls

Or roughly 2.4% additional text.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/Kconfig | 12 ++++++++++++
arch/x86/Makefile | 4 ++++
arch/x86/include/asm/linkage.h | 10 ++++++++++
arch/x86/include/asm/static_call.h | 2 +-
arch/x86/kernel/ftrace.c | 2 +-
arch/x86/kernel/static_call.c | 5 +++--
arch/x86/lib/memmove_64.S | 2 +-
arch/x86/lib/retpoline.S | 2 +-
scripts/Makefile.build | 3 ++-
scripts/link-vmlinux.sh | 3 +++
10 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7399327..dd13ba8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -472,6 +472,18 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.

+config CC_HAS_SLS
+ def_bool $(cc-option,-mharden-sls=all)
+
+config SLS
+ bool "Mitigate Straight-Line-Speculation"
+ depends on CC_HAS_SLS && X86_64
+ default n
+ help
+ Compile the kernel with straight-line-speculation options to guard
+ against straight line speculation. The kernel image might be slightly
+ larger.
+
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index c38b657..e84cdd4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -191,6 +191,10 @@ ifdef CONFIG_RETPOLINE
endif
endif

+ifdef CONFIG_SLS
+ KBUILD_CFLAGS += -mharden-sls=all
+endif
+
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)

ifdef CONFIG_LTO_CLANG
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index ebddec2..0309079 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -18,9 +18,19 @@
#define __ALIGN_STR __stringify(__ALIGN)
#endif

+#ifdef CONFIG_SLS
+#define RET ret; int3
+#else
+#define RET ret
+#endif
+
#else /* __ASSEMBLY__ */

+#ifdef CONFIG_SLS
+#define ASM_RET "ret; int3\n\t"
+#else
#define ASM_RET "ret\n\t"
+#endif

#endif /* __ASSEMBLY__ */

diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index 39ebe05..ed4f8bb 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -36,7 +36,7 @@
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")

#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
- __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")


#define ARCH_ADD_TRAMP_KEY(name) \
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c39f906..7cc540e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -303,7 +303,7 @@ union ftrace_op_code_union {
} __attribute__((packed));
};

-#define RET_SIZE 1
+#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS)

static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 9c407a3..531fb4c 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -17,6 +17,8 @@ enum insn_type {
*/
static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };

+static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
const void *emulate = NULL;
@@ -42,8 +44,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
break;

case RET:
- code = text_gen_insn(RET_INSN_OPCODE, insn, func);
- size = RET_INSN_SIZE;
+ code = &retinsn;
break;
}

diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index e84d649..50ea390 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -40,7 +40,7 @@ SYM_FUNC_START(__memmove)
/* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
- ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; RET", X86_FEATURE_ERMS
+ ALTERNATIVE "", __stringify(movq %rdx, %rcx; rep movsb; RET), X86_FEATURE_ERMS

/*
* movsq instruction have many startup latency
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index a842866..89b3fb2 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -34,7 +34,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)

ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
__stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD

.endm

diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 78656b5..a4b89b7 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -234,7 +234,8 @@ objtool_args = \
$(if $(CONFIG_GCOV_KERNEL)$(CONFIG_LTO_CLANG), --no-unreachable)\
$(if $(CONFIG_RETPOLINE), --retpoline) \
$(if $(CONFIG_X86_SMAP), --uaccess) \
- $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount)
+ $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount) \
+ $(if $(CONFIG_SLS), --sls)

cmd_objtool = $(if $(objtool-enabled), ; $(objtool) $(objtool_args) $@)
cmd_gen_objtooldep = $(if $(objtool-enabled), { echo ; echo '$@: $$(wildcard $(objtool))' ; } >> $(dot-target).cmd)
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 5cdd9bc..9716f28 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -139,6 +139,9 @@ objtool_link()
if [ -n "${CONFIG_X86_SMAP}" ]; then
objtoolopt="${objtoolopt} --uaccess"
fi
+ if [ -n "${CONFIG_SLS}" ]; then
+ objtoolopt="${objtoolopt} --sls"
+ fi
info OBJTOOL ${1}
tools/objtool/objtool ${objtoolcmd} ${objtoolopt} ${1}
fi

2022-07-19 15:02:10

by Maciej S. Szmigiero

Subject: Missing SLS int3 in JMP_NOSPEC? (Was: [PATCH v2 6/6] x86: Add straight-line-speculation mitigation)

On 4.12.2021 14:43, Peter Zijlstra wrote:
> Make use of an upcoming GCC feature to mitigate
> straight-line-speculation for x86:
>
> https://gcc.gnu.org/g:53a643f8568067d7700a9f2facc8ba39974973d3
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102952
> https://bugs.llvm.org/show_bug.cgi?id=52323
>
> It's build-tested on x86_64-allyesconfig using GCC-12 and GCC-11.
>
> Maintenance overhead of this should be fairly low due to objtool
> validation.
>
> Size overhead of all these additional int3 instructions comes to:
>
>    text    data     bss      dec     hex filename
> 22267751 6933356 2011368 31212475 1dc43bb defconfig-build/vmlinux
> 22804126 6933356 1470696 31208178 1dc32f2 defconfig-build/vmlinux.sls
>
> Or roughly 2.4% additional text.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
(..)
> --- a/arch/x86/lib/retpoline.S
> +++ b/arch/x86/lib/retpoline.S
> @@ -34,7 +34,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\re
>
> ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
> __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
> - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
> + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD
>
> .endm
>

Looking at this __x86_indirect_thunk_* change makes me wonder why there is
no similar int3 SLS protection in the X86_FEATURE_RETPOLINE_LFENCE case of
JMP_NOSPEC in arch/x86/include/asm/nospec-branch.h:
> .macro JMP_NOSPEC reg:req
> #ifdef CONFIG_RETPOLINE
> ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
> __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
> __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
> #else

JMP_NOSPEC users seem to have no explicit trailing int3 instructions
either.

Or am I missing something here?

Thanks,
Maciej

2022-07-19 21:24:57

by Peter Zijlstra

Subject: [RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC

On Tue, Jul 19, 2022 at 03:19:26PM +0200, Maciej S. Szmigiero wrote:
> On 4.12.2021 14:43, Peter Zijlstra wrote:
> > Make use of an upcoming GCC feature to mitigate
> > straight-line-speculation for x86:
> >
> > https://gcc.gnu.org/g:53a643f8568067d7700a9f2facc8ba39974973d3
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102952
> > https://bugs.llvm.org/show_bug.cgi?id=52323
> >
> > It's build-tested on x86_64-allyesconfig using GCC-12 and GCC-11.
> >
> > Maintenance overhead of this should be fairly low due to objtool
> > validation.
> >
> > Size overhead of all these additional int3 instructions comes to:
> >
> >    text    data     bss      dec     hex filename
> > 22267751 6933356 2011368 31212475 1dc43bb defconfig-build/vmlinux
> > 22804126 6933356 1470696 31208178 1dc32f2 defconfig-build/vmlinux.sls
> >
> > Or roughly 2.4% additional text.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > ---
> (..)
> > --- a/arch/x86/lib/retpoline.S
> > +++ b/arch/x86/lib/retpoline.S
> > @@ -34,7 +34,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\re
> > ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
> > __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
> > - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
> > + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD
> > .endm
>
> Looking at this __x86_indirect_thunk_* change makes me wonder why there is
> no similar int3 SLS protection in the X86_FEATURE_RETPOLINE_LFENCE case of
> JMP_NOSPEC in arch/x86/include/asm/nospec-branch.h:
> > .macro JMP_NOSPEC reg:req
> > #ifdef CONFIG_RETPOLINE
> > ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
> > __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
> > __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
> > #else
>
> JMP_NOSPEC users seem to have no explicit trailing int3 instructions
> either.
>
> Or am I missing something here?

Probably just forgot about those. I'm thinking we ought to do something
like this...

---
Subject: x86,nospec: Simplify {JMP,CALL}_NOSPEC

Have {JMP,CALL}_NOSPEC generate the same code GCC does for indirect
calls and rely on the objtool retpoline patching infrastructure.

There's no reason these should be alternatives while the vast bulk of
compiler generated retpolines are not.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/nospec-branch.h | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 10a3bfc1eb23..7bb319d2932c 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -93,6 +93,19 @@
#endif
.endm

+/*
+ * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
+ * to the retpoline thunk with a CS prefix when the register requires
+ * a RAX prefix byte to encode. Also see apply_alternatives().
+ */
+.macro __CS_PREFIX reg:req
+ .irp rs,r8,r9,r10,r11,r12,r13,r14,r15
+ .ifc \reg,\rs
+ .byte 0x2e
+ .endif
+ .endr
+.endm
+
/*
* JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
* indirect jmp/call which may be susceptible to the Spectre variant 2
@@ -100,19 +113,18 @@
*/
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
- ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
- __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
+ __CS_PREFIX \reg
+ jmp __x86_indirect_thunk_\reg
#else
jmp *%\reg
+ int3
#endif
.endm

.macro CALL_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
- ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
- __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE
+ __CS_PREFIX \reg
+ call __x86_indirect_thunk_\reg
#else
call *%\reg
#endif
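
Worked out byte by byte (illustrative encodings): the direct jmp/call to
the thunk is 5 bytes, while the lfence form that may later be patched in
needs 6 bytes for r8-r15 because the indirect branch then carries a REX
prefix; the 0x2e CS prefix pads the call site to match:

	jmp __x86_indirect_thunk_rax		# e9 xx xx xx xx     (5 bytes)
	lfence; jmp *%rax			# 0f ae e8 ff e0     (5 bytes)

	cs jmp __x86_indirect_thunk_r11		# 2e e9 xx xx xx xx  (6 bytes)
	lfence; jmp *%r11			# 0f ae e8 41 ff e3  (6 bytes)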

2022-07-19 21:39:13

by Peter Zijlstra

Subject: Re: [RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC

On Tue, Jul 19, 2022 at 11:23:07PM +0200, Peter Zijlstra wrote:
> Subject: x86,nospec: Simplify {JMP,CALL}_NOSPEC
>
> Have {JMP,CALL}_NOSPEC generate the same code GCC does for indirect
> calls and rely on the objtool retpoline patching infrastructure.
>
> There's no reason these should be alternatives while the vast bulk of
> compiler generated retpolines are not.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/x86/include/asm/nospec-branch.h | 24 ++++++++++++++++++------
> 1 file changed, 18 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> index 10a3bfc1eb23..7bb319d2932c 100644
> --- a/arch/x86/include/asm/nospec-branch.h
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -93,6 +93,19 @@
> #endif
> .endm
>
> +/*
> + * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
> + * to the retpoline thunk with a CS prefix when the register requires
> + * a RAX prefix byte to encode. Also see apply_alternatives().

Obviously I meant: apply_retpolines() ...

> + */
> +.macro __CS_PREFIX reg:req
> + .irp rs,r8,r9,r10,r11,r12,r13,r14,r15
> + .ifc \reg,\rs
> + .byte 0x2e
> + .endif
> + .endr
> +.endm
> +
> /*
> * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
> * indirect jmp/call which may be susceptible to the Spectre variant 2
> @@ -100,19 +113,18 @@
> */
> .macro JMP_NOSPEC reg:req
> #ifdef CONFIG_RETPOLINE
> - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
> - __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
> - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
> + __CS_PREFIX \reg
> + jmp __x86_indirect_thunk_\reg
> #else
> jmp *%\reg
> + int3
> #endif
> .endm
>
> .macro CALL_NOSPEC reg:req
> #ifdef CONFIG_RETPOLINE
> - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
> - __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
> - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE
> + __CS_PREFIX \reg
> + call __x86_indirect_thunk_\reg
> #else
> call *%\reg
> #endif

2022-07-20 00:04:46

by Maciej S. Szmigiero

Subject: Re: [RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC

On 19.07.2022 23:33, Peter Zijlstra wrote:
> On Tue, Jul 19, 2022 at 11:23:07PM +0200, Peter Zijlstra wrote:
>> Subject: x86,nospec: Simplify {JMP,CALL}_NOSPEC
>>
>> Have {JMP,CALL}_NOSPEC generate the same code GCC does for indirect
>> calls and rely on the objtool retpoline patching infrastructure.
>>
>> There's no reason these should be alternatives while the vast bulk of
>> compiler generated retpolines are not.
>>
>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>> ---
>> arch/x86/include/asm/nospec-branch.h | 24 ++++++++++++++++++------
>> 1 file changed, 18 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
>> index 10a3bfc1eb23..7bb319d2932c 100644
>> --- a/arch/x86/include/asm/nospec-branch.h
>> +++ b/arch/x86/include/asm/nospec-branch.h
>> @@ -93,6 +93,19 @@
>> #endif
>> .endm
>>
>> +/*
>> + * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
>> + * to the retpoline thunk with a CS prefix when the register requires
>> + * a RAX prefix byte to encode. Also see apply_alternatives().
>
> Obviously I meant: apply_retpolines() ...

Will apply_retpolines() actually patch in that trailing int3 in
the X86_FEATURE_RETPOLINE_LFENCE case?

Looking at its code it uses just ordinary NOPs as fill:
> /*
> * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
> */
> if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
> bytes[i++] = 0x0f;
> bytes[i++] = 0xae;
> bytes[i++] = 0xe8; /* LFENCE */
> }
>
> ret = emit_indirect(op, reg, bytes + i);
> if (ret < 0)
> return ret;
> i += ret;
>
> for (; i < insn->length;)
> bytes[i++] = BYTES_NOP1;

BYTES_NOP1 is 0x90.

>> + */
>> +.macro __CS_PREFIX reg:req
>> + .irp rs,r8,r9,r10,r11,r12,r13,r14,r15
>> + .ifc \reg,\rs
>> + .byte 0x2e
>> + .endif
>> + .endr
>> +.endm
>> +
>> /*
>> * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
>> * indirect jmp/call which may be susceptible to the Spectre variant 2
>> @@ -100,19 +113,18 @@
>> */
>> .macro JMP_NOSPEC reg:req
>> #ifdef CONFIG_RETPOLINE
>> - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
>> - __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
>> - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
>> + __CS_PREFIX \reg
>> + jmp __x86_indirect_thunk_\reg
>> #else
>> jmp *%\reg
>> + int3
>> #endif

Perhaps that int3 should be here to be common to both
"#ifdef CONFIG_RETPOLINE" branches?

>> .endm
>>

Thanks,
Maciej

2022-07-20 09:20:35

by Peter Zijlstra

Subject: Re: [RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC

On Wed, Jul 20, 2022 at 02:01:39AM +0200, Maciej S. Szmigiero wrote:
> > Obviously I meant: apply_retpolines() ...
>
> Will apply_retpolines() actually patch in that trailing int3 in
> the X86_FEATURE_RETPOLINE_LFENCE case?
>
> Looking at its code it uses just ordinary NOPs as fill:
> > /*
> > * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
> > */
> > if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
> > bytes[i++] = 0x0f;
> > bytes[i++] = 0xae;
> > bytes[i++] = 0xe8; /* LFENCE */
> > }
> >
> > ret = emit_indirect(op, reg, bytes + i);
> > if (ret < 0)
> > return ret;
> > i += ret;
> >
> > for (; i < insn->length;)
> > bytes[i++] = BYTES_NOP1;

There is no space for int3 in that case. You get 3 bytes for LFENCE and
{2,3} bytes for 'jmp *%reg', which fully consumes the {5,6} bytes
available.

There will be no nops added.
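
Concretely (illustrative encodings for one register):

	e9 xx xx xx xx		# original site: jmp __x86_indirect_thunk_rbx
	0f ae e8 ff e3		# patched: lfence; jmp *%rbx -- all 5 bytes used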

But this is what all regular retpolines get to look like.

The plan was, and that's still pending, to get the INT3 from the AMD BTC
mitigation, which adds INT3 after regular JMPs, but those compiler
patches still need to happen, I think.