2021-02-18 19:05:33

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines

Currently out retpolines consist of 2 symbols,
__x86_indirect_thunk_\reg, which is the compiler target, and
__x86_retpoline_\reg, which is the actual retpoline. Both are
consecutive in code and aligned such that for any one register they
both live in the same cacheline:

0000000000000000 <__x86_indirect_thunk_rax>:
0: ff e0 jmpq *%rax
2: 90 nop
3: 90 nop
4: 90 nop

0000000000000005 <__x86_retpoline_rax>:
5: e8 07 00 00 00 callq 11 <__x86_retpoline_rax+0xc>
a: f3 90 pause
c: 0f ae e8 lfence
f: eb f9 jmp a <__x86_retpoline_rax+0x5>
11: 48 89 04 24 mov %rax,(%rsp)
15: c3 retq
16: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:0x0(%rax,%rax,1)

The thunk is an alternative_2, where one option is a jmp to the
retpoline. Observe that we can fold the entire retpoline into the
alternative to simplify and consolidate unused bytes:

0000000000000000 <__x86_indirect_thunk_rax>:
0: ff e0 jmpq *%rax
2: 90 nop
3: 90 nop
4: 90 nop
5: 90 nop
6: 90 nop
7: 90 nop
8: 90 nop
9: 90 nop
a: 90 nop
b: 90 nop
c: 90 nop
d: 90 nop
e: 90 nop
f: 90 nop
10: 90 nop
11: 66 66 2e 0f 1f 84 00 00 00 00 00 data16 nopw %cs:0x0(%rax,%rax,1)
1c: 0f 1f 40 00 nopl 0x0(%rax)

Notice that since our longest alternative sequence is now:

0: e8 07 00 00 00 callq c <.altinstr_replacement+0xc>
5: f3 90 pause
7: 0f ae e8 lfence
a: eb f9 jmp 5 <.altinstr_replacement+0x5>
c: 48 89 04 24 mov %rax,(%rsp)
10: c3 retq

17 bytes, we have 15 bytes NOP at the end of our 32 byte slot.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/asm-prototypes.h | 7 -------
arch/x86/include/asm/nospec-branch.h | 6 +++---
arch/x86/lib/retpoline.S | 34 +++++++++++++++++-----------------
tools/objtool/check.c | 3 +--
4 files changed, 21 insertions(+), 29 deletions(-)

--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -22,15 +22,8 @@ extern void cmpxchg8b_emu(void);
#define DECL_INDIRECT_THUNK(reg) \
extern asmlinkage void __x86_indirect_thunk_ ## reg (void);

-#define DECL_RETPOLINE(reg) \
- extern asmlinkage void __x86_retpoline_ ## reg (void);
-
#undef GEN
#define GEN(reg) DECL_INDIRECT_THUNK(reg)
#include <asm/GEN-for-each-reg.h>

-#undef GEN
-#define GEN(reg) DECL_RETPOLINE(reg)
-#include <asm/GEN-for-each-reg.h>
-
#endif /* CONFIG_RETPOLINE */
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -81,7 +81,7 @@
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
- __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
+ __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
#else
jmp *%\reg
@@ -91,7 +91,7 @@
.macro CALL_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
- __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
+ __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD
#else
call *%\reg
@@ -129,7 +129,7 @@
ALTERNATIVE_2( \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
- "call __x86_retpoline_%V[thunk_target]\n", \
+ "call __x86_indirect_thunk_%V[thunk_target]\n", \
X86_FEATURE_RETPOLINE, \
"lfence;\n" \
ANNOTATE_RETPOLINE_SAFE \
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -10,27 +10,31 @@
#include <asm/unwind_hints.h>
#include <asm/frame.h>

-.macro THUNK reg
- .section .text.__x86.indirect_thunk
-
- .align 32
-SYM_FUNC_START(__x86_indirect_thunk_\reg)
- JMP_NOSPEC \reg
-SYM_FUNC_END(__x86_indirect_thunk_\reg)
-
-SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
+.macro RETPOLINE reg
ANNOTATE_INTRA_FUNCTION_CALL
- call .Ldo_rop_\@
+ call .Ldo_rop_\@
.Lspec_trap_\@:
UNWIND_HINT_EMPTY
pause
lfence
- jmp .Lspec_trap_\@
+ jmp .Lspec_trap_\@
.Ldo_rop_\@:
- mov %\reg, (%_ASM_SP)
+ mov %\reg, (%_ASM_SP)
UNWIND_HINT_FUNC
ret
-SYM_FUNC_END(__x86_retpoline_\reg)
+.endm
+
+.macro THUNK reg
+ .section .text.__x86.indirect_thunk
+
+ .align 32
+SYM_FUNC_START(__x86_indirect_thunk_\reg)
+
+ ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
+ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+
+SYM_FUNC_END(__x86_indirect_thunk_\reg)

.endm

@@ -48,7 +52,6 @@ SYM_FUNC_END(__x86_retpoline_\reg)

#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
-#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg)

#undef GEN
#define GEN(reg) THUNK reg
@@ -58,6 +61,3 @@ SYM_FUNC_END(__x86_retpoline_\reg)
#define GEN(reg) EXPORT_THUNK(reg)
#include <asm/GEN-for-each-reg.h>

-#undef GEN
-#define GEN(reg) EXPORT_RETPOLINE(reg)
-#include <asm/GEN-for-each-reg.h>
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -800,8 +800,7 @@ static int add_jump_destinations(struct
} else if (reloc->sym->type == STT_SECTION) {
dest_sec = reloc->sym->sec;
dest_off = arch_dest_reloc_offset(reloc->addend);
- } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21) ||
- !strncmp(reloc->sym->name, "__x86_retpoline_", 16)) {
+ } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21)) {
/*
* Retpoline jumps are really dynamic jumps in
* disguise, so convert them accordingly.



2021-02-22 11:38:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines

On Thu, Feb 18, 2021 at 05:59:39PM +0100, Peter Zijlstra wrote:
> Currently out retpolines consist of 2 symbols,
> __x86_indirect_thunk_\reg, which is the compiler target, and
> __x86_retpoline_\reg, which is the actual retpoline. Both are
> consecutive in code and aligned such that for any one register they
> both live in the same cacheline:
>
> 0000000000000000 <__x86_indirect_thunk_rax>:
> 0: ff e0 jmpq *%rax
> 2: 90 nop
> 3: 90 nop
> 4: 90 nop
>
> 0000000000000005 <__x86_retpoline_rax>:
> 5: e8 07 00 00 00 callq 11 <__x86_retpoline_rax+0xc>
> a: f3 90 pause
> c: 0f ae e8 lfence
> f: eb f9 jmp a <__x86_retpoline_rax+0x5>
> 11: 48 89 04 24 mov %rax,(%rsp)
> 15: c3 retq
> 16: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:0x0(%rax,%rax,1)
>
> The thunk is an alternative_2, where one option is a jmp to the
> retpoline.

So the reason I originally did that was because objtool could not deal
with alternatives with stack ops. But we've recently fixed that.