Rewrite retpoline thunk call sites to be indirect calls for
spectre_v2=off. This ensures spectre_v2=off is as near to a
RETPOLINE=n build as possible.
This replaces objtool writing alternative entries to ensure the same,
and achieves feature parity with that previous approach.
One noteworthy detail is that it relies on the thunks being laid out in
machine (register encoding) order, so the register index can be computed
from the thunk's address.
Specifically, this does not yet address the Jcc __x86_indirect_thunk_*
calls generated by clang; a future patch will add support for those.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/alternative.h | 1
arch/x86/kernel/alternative.c | 136 +++++++++++++++++++++++++++++++++++--
arch/x86/kernel/module.c | 9 ++
3 files changed, 141 insertions(+), 5 deletions(-)
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -75,6 +75,7 @@ extern int alternatives_patched;
extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+extern void apply_retpolines(s32 *start, s32 *end);
struct module;
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -29,6 +29,7 @@
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
+#include <asm/asm-prototypes.h>
int __read_mostly alternatives_patched;
@@ -113,6 +114,7 @@ static void __init_or_module add_nops(vo
}
}
+extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -221,7 +223,7 @@ static __always_inline int optimize_nops
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
-static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
+static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
struct insn insn;
int i = 0;
@@ -239,11 +241,11 @@ static void __init_or_module noinline op
* optimized.
*/
if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- i += optimize_nops_range(instr, a->instrlen, i);
+ i += optimize_nops_range(instr, len, i);
else
i += insn.length;
- if (i >= a->instrlen)
+ if (i >= len)
return;
}
}
@@ -331,10 +333,130 @@ void __init_or_module noinline apply_alt
text_poke_early(instr, insn_buff, insn_buff_sz);
next:
- optimize_nops(a, instr);
+ optimize_nops(instr, a->instrlen);
}
}
+#ifdef CONFIG_X86_64
+
+/*
+ * CALL/JMP *%\reg
+ */
+static int emit_indirect(int op, int reg, u8 *bytes)
+{
+ int i = 0;
+ u8 modrm;
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ modrm = 0x10; /* Reg = 2; CALL r/m */
+ break;
+
+ case JMP32_INSN_OPCODE:
+ modrm = 0x20; /* Reg = 4; JMP r/m */
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+
+ modrm |= 0xc0; /* Mod = 3 */
+ modrm += reg;
+
+ bytes[i++] = 0xff; /* opcode */
+ bytes[i++] = modrm;
+
+ return i;
+}
+
+/*
+ * Rewrite the compiler generated retpoline thunk calls.
+ *
+ * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
+ * indirect instructions, avoiding the extra indirection.
+ *
+ * For example, convert:
+ *
+ * CALL __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * CALL *%\reg
+ *
+ */
+static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
+{
+ void (*target)(void);
+ int reg, i = 0;
+
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
+ return -1;
+
+ target = addr + insn->length + insn->immediate.value;
+ reg = (target - &__x86_indirect_thunk_rax) /
+ (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
+
+ if (WARN_ON_ONCE(reg & ~0xf))
+ return -1;
+
+ i = emit_indirect(insn->opcode.bytes[0], reg, bytes);
+ if (i < 0)
+ return i;
+
+ for (; i < insn->length;)
+ bytes[i++] = BYTES_NOP1;
+
+ return i;
+}
+
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op1, op2;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op1 = insn.opcode.bytes[0];
+ op2 = insn.opcode.bytes[1];
+
+ switch (op1) {
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ len = patch_retpoline(addr, &insn, bytes);
+ if (len == insn.length) {
+ optimize_nops(bytes, len);
+ text_poke_early(addr, bytes, len);
+ }
+ }
+}
+
+#else /* CONFIG_X86_32 */
+
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+
+#endif /* CONFIG_X86_64 */
+
#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
@@ -643,6 +765,12 @@ void __init alternative_instructions(voi
apply_paravirt(__parainstructions, __parainstructions_end);
/*
+ * Rewrite the retpolines, must be done before alternatives since
+ * those can rewrite the retpoline thunks.
+ */
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+
+ /*
* Then patch alternatives, such that those paravirt calls that are in
* alternatives can be overwritten by their immediate fragments.
*/
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -251,7 +251,8 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL, *orc = NULL, *orc_ip = NULL;
+ *para = NULL, *orc = NULL, *orc_ip = NULL,
+ *retpolines = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -267,8 +268,14 @@ int module_finalize(const Elf_Ehdr *hdr,
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
orc_ip = s;
+ if (!strcmp(".retpoline_sites", secstrings + s->sh_name))
+ retpolines = s;
}
+ if (retpolines) {
+ void *rseg = (void *)retpolines->sh_addr;
+ apply_retpolines(rseg, rseg + retpolines->sh_size);
+ }
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
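For readers who want to see the encoding in isolation: below is a minimal
standalone userspace sketch (my illustration, not part of the patch) that
mirrors emit_indirect() and prints the bytes it would emit. "CALL *%reg" is
opcode 0xff with ModRM reg field /2, "JMP *%reg" is 0xff /4, mod=3 selects
register-direct operands, and r8-r15 need a REX.B (0x41) prefix:

	#include <stdio.h>

	#define CALL_INSN_OPCODE	0xe8
	#define JMP32_INSN_OPCODE	0xe9

	static int emit_indirect(int op, int reg, unsigned char *bytes)
	{
		unsigned char modrm;
		int i = 0;

		switch (op) {
		case CALL_INSN_OPCODE:
			modrm = 0x10;	/* Reg = 2; CALL r/m */
			break;
		case JMP32_INSN_OPCODE:
			modrm = 0x20;	/* Reg = 4; JMP r/m */
			break;
		default:
			return -1;
		}

		if (reg >= 8) {
			bytes[i++] = 0x41;	/* REX.B prefix */
			reg -= 8;
		}

		modrm |= 0xc0;	/* Mod = 3: register direct */
		modrm += reg;

		bytes[i++] = 0xff;	/* opcode */
		bytes[i++] = modrm;

		return i;
	}

	int main(void)
	{
		static const char *regs[16] = {
			"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
			"r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
		};
		unsigned char bytes[3];
		int reg, len, j;

		/* reg 4 (%rsp) is the case Andrew asks to BUG_ON below. */
		for (reg = 0; reg < 16; reg++) {
			len = emit_indirect(CALL_INSN_OPCODE, reg, bytes);
			printf("call *%%%s:\t", regs[reg]);
			for (j = 0; j < len; j++)
				printf("%02x ", bytes[j]);
			printf("\n");
		}
		return 0;
	}

The "41 ff d3" this prints for CALL *%r11 shows up verbatim in the patching
debug output later in the thread.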
On Wed, Oct 13, 2021 at 03:38:27PM +0100, Andrew Cooper wrote:
> On 13/10/2021 13:22, Peter Zijlstra wrote:
> > +/*
> > + * Rewrite the compiler generated retpoline thunk calls.
> > + *
> > + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
> > + * indirect instructions, avoiding the extra indirection.
> > + *
> > + * For example, convert:
> > + *
> > + * CALL __x86_indirect_thunk_\reg
> > + *
> > + * into:
> > + *
> > + * CALL *%\reg
> > + *
> > + */
> > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > +{
> > + void (*target)(void);
> > + int reg, i = 0;
> > +
> > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > + return -1;
> > +
> > + target = addr + insn->length + insn->immediate.value;
> > + reg = (target - &__x86_indirect_thunk_rax) /
> > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
>
> This is equal measures beautiful and terrifying.
Thanks! :-)
> Something around here really wants to BUG_ON(reg == 4), because
> literally nothing good can come from selecting %rsp.
Ack, I had to add rsp to get the offsets right, but indeed, if anything
ever selects that we're in trouble.
> Also, it might be a good idea (previous patch perhaps) to have some
> linker assertions to confirm that the symbols are laid out safely to do
> this calculation.
I was hoping that since all this is in .S it would be immune from crazy
things like a compiler and do as told. But I suppose crazy stuff like
LTO (or worse, BOLT) can totally wreck this still (then BOLT won't care
about linker script assertions either).
I'll see if I can come up with something.
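A linker-script assertion along the lines Andrew suggests might look like
the below (a sketch only; the 0x20 spacing assumption matches the .align 32
thunk layout shown in the symtab dump later in this thread, and GNU ld's
ASSERT() is evaluated at link time):

	/* hypothetical addition to arch/x86/kernel/vmlinux.lds.S */
	ASSERT(__x86_indirect_thunk_rcx - __x86_indirect_thunk_rax == 0x20,
	       "retpoline thunks not evenly spaced")
	ASSERT(__x86_indirect_thunk_r15 - __x86_indirect_thunk_rax == 15 * 0x20,
	       "retpoline thunks not in machine order")

As noted above, this guards against a linker reordering things, but not
against post-link rewriters like BOLT.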
On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> +{
> + void (*target)(void);
> + int reg, i = 0;
> +
> + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> + return -1;
Better to do this check further up the call stack in apply_retpolines()
before looping through all the call sites?
> +
> + target = addr + insn->length + insn->immediate.value;
> + reg = (target - &__x86_indirect_thunk_rax) /
> + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
> +
> + if (WARN_ON_ONCE(reg & ~0xf))
> + return -1;
It would be more robust and less magical to just have a basic lookup
table array which converts a thunk address to a reg. Then you can just
avoid all the safety checks because it's no longer insane ;-)
--
Josh
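For reference, a minimal sketch of the lookup table Josh describes, reusing
the GEN-for-each-reg.h include pattern that appears later in this thread and
assuming the machine-order register list from earlier in this series (the
helper name is mine, purely illustrative):

	static const void *thunks[] = {
	#undef GEN
	#define GEN(reg) &__x86_indirect_thunk_ ## reg,
	#include <asm/GEN-for-each-reg.h>
	};

	/* hypothetical helper: map a thunk address back to its register */
	static int thunk_to_reg(void *target)
	{
		int reg;

		for (reg = 0; reg < ARRAY_SIZE(thunks); reg++) {
			if (target == thunks[reg])
				return reg;
		}

		return -1;	/* not a retpoline thunk; bail */
	}

The linear scan trades the pointer-arithmetic trick for an explicit address
match, so no layout assumptions are needed.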
On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> /*
> + * Rewrite the retpolines, must be done before alternatives since
> + * those can rewrite the retpoline thunks.
> + */
Why exactly is that a problem? This code doesn't read the thunks.
BTW, CALL_NOSPEC results in a retpoline site in .altinstr_replacement:
Relocation section [40] '.rela.retpoline_sites' for section [39] '.retpoline_sites' at offset 0x8d28 contains 1 entry:
Offset Type Value Addend Name
000000000000000000 X86_64_PC32 000000000000000000 +10 .altinstr_replacement
Which I assume we don't want.
--
Josh
On Wed, Oct 13, 2021 at 01:52:59PM -0700, Josh Poimboeuf wrote:
> On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > /*
> > + * Rewrite the retpolines, must be done before alternatives since
> > + * those can rewrite the retpoline thunks.
> > + */
>
> Why exactly is that a problem? This code doesn't read the thunks.
The below problem :-) I didn't include it in the series, but I'm
thinking that's where I want to go eventually.
---
Subject: x86,retpoline: Poison retpoline thunks for !X86_FEATURE_RETPOLINE
From: Peter Zijlstra <[email protected]>
Date: Tue Oct 12 10:30:56 CEST 2021
Now that objtool will out-of-line all retpoline thunk calls for
!RETPOLINE, poison them.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/lib/retpoline.S | 10 ++++++++++
1 file changed, 10 insertions(+)
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -32,9 +32,19 @@
SYM_FUNC_START(__x86_indirect_thunk_\reg)
+#ifdef CONFIG_STACK_VALIDATION
+/*
+ * When objtool runs, there should not be any __x86_indirect_thunk_* calls
+ * left after alternatives, ensure this by patching it to UD2.
+ */
+ ALTERNATIVE_2 __stringify(RETPOLINE \reg), \
+ __stringify(ud2), ALT_NOT(X86_FEATURE_RETPOLINE), \
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+#else
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
__stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+#endif
SYM_FUNC_END(__x86_indirect_thunk_\reg)
On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> +#ifdef CONFIG_X86_64
> +
> +/*
> + * CALL/JMP *%\reg
> + */
> +static int emit_indirect(int op, int reg, u8 *bytes)
X86_64 is already equivalent to STACK_VALIDATION these days, but might
as well clarify here where the retpoline_sites dependency comes from by
changing this to '#ifdef CONFIG_STACK_VALIDATION'.
--
Josh
On Wed, Oct 13, 2021 at 01:39:27PM -0700, Josh Poimboeuf wrote:
> On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > +{
> > + void (*target)(void);
> > + int reg, i = 0;
> > +
> > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > + return -1;
>
> Better to do this check further up the call stack in apply_retpolines()
> before looping through all the call sites?
In fact, I've pushed it further down, in order to always validate the
absence of rsp.
> > +
> > + target = addr + insn->length + insn->immediate.value;
> > + reg = (target - &__x86_indirect_thunk_rax) /
> > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
> > +
> > + if (WARN_ON_ONCE(reg & ~0xf))
> > + return -1;
>
> It would be more robust and less magical to just have a basic lookup
> table array which converts a thunk address to a reg. Then you can just
> avoid all the safety checks because it's no longer insane ;-)
Andrew suggested the reverse lookup to validate. That should give the
same robustness but lacks the linear lookup.
---
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -392,6 +392,12 @@ static int emit_indirect(int op, int reg
*/
static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
{
+ static const void *reg_to_thunk[] = {
+#undef GEN
+#define GEN(reg) &__x86_indirect_thunk_ ## reg,
+#include <asm/GEN-for-each-reg.h>
+ };
+
void (*target)(void);
int reg, i = 0;
@@ -402,6 +408,8 @@ static int patch_retpoline(void *addr, s
if (WARN_ON_ONCE(reg & ~0xf))
return -1;
+ BUG_ON(target != reg_to_thunk[reg]);
+
/*
* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble.
*/
On Wed, Oct 13, 2021 at 02:11:18PM -0700, Josh Poimboeuf wrote:
> On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > +#ifdef CONFIG_X86_64
> > +
> > +/*
> > + * CALL/JMP *%\reg
> > + */
> > +static int emit_indirect(int op, int reg, u8 *bytes)
>
> X86_64 is already equivalent to STACK_VALIDATION these days, but might
> as well clarify here where the retpoline_sites dependency comes from by
> changing this to '#ifdef CONFIG_STACK_VALIDATION'.
Yeah, I was contemplating having x86_64 unconditionally select that.
Maybe we should.
Also, I think I've proposed it before, but what about:
s/STACK_VALIDATION/OBJTOOL/
on all that?
On Wed, Oct 13, 2021 at 11:20:02PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 13, 2021 at 01:39:27PM -0700, Josh Poimboeuf wrote:
> > On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > > +{
> > > + void (*target)(void);
> > > + int reg, i = 0;
> > > +
> > > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > > + return -1;
> >
> > Better to do this check further up the call stack in apply_retpolines()
> > before looping through all the call sites?
>
> In fact, I've pushed it further down, in order to always validate the
> absence of rsp.
>
> > > +
> > > + target = addr + insn->length + insn->immediate.value;
> > > + reg = (target - &__x86_indirect_thunk_rax) /
> > > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
> > > +
> > > + if (WARN_ON_ONCE(reg & ~0xf))
> > > + return -1;
> >
> > It would be more robust and less magical to just have a basic lookup
> > table array which converts a thunk address to a reg. Then you can just
> > avoid all the safety checks because it's no longer insane ;-)
>
> Andrew suggested the reverse lookup to validate. That should give the
> same robustness but lacks the linear lookup.
So you've got a WARN_ON_ONCE, a BUG_ON, and a too-deep feature check,
all in the name of supporting this scheme. ok...
If performance of the linear lookup were a real concern then you could
just put rax and r11 at the beginning of the array.
--
Josh
On Wed, Oct 13, 2021 at 02:49:10PM -0700, Josh Poimboeuf wrote:
> On Wed, Oct 13, 2021 at 11:20:02PM +0200, Peter Zijlstra wrote:
> > On Wed, Oct 13, 2021 at 01:39:27PM -0700, Josh Poimboeuf wrote:
> > > On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > > > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > > > +{
> > > > + void (*target)(void);
> > > > + int reg, i = 0;
> > > > +
> > > > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > > > + return -1;
> > >
> > > Better to do this check further up the call stack in apply_retpolines()
> > > before looping through all the call sites?
> >
> > In fact, I've pushed it further down, in order to always validate the
> > absence of rsp.
> >
> > > > +
> > > > + target = addr + insn->length + insn->immediate.value;
> > > > + reg = (target - &__x86_indirect_thunk_rax) /
> > > > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
> > > > +
> > > > + if (WARN_ON_ONCE(reg & ~0xf))
> > > > + return -1;
> > >
> > > It would be more robust and less magical to just have a basic lookup
> > > table array which converts a thunk address to a reg. Then you can just
> > > avoid all the safety checks because it's no longer insane ;-)
> >
> > Andrew suggested the reverse lookup to validate. That should give the
> > same robustness but lacks the linear lookup.
>
> So you've got a WARN_ON_ONCE, a BUG_ON, and a too-deep feature check,
> all in the name of supporting this scheme. ok...
Not to mention the extra silly rsp thunks...
> If performance of the linear lookup were a real concern then you could
> just put rax and r11 at the beginning of the array.
--
Josh
On Wed, Oct 13, 2021 at 11:43:43PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 13, 2021 at 02:11:18PM -0700, Josh Poimboeuf wrote:
> > On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > > +#ifdef CONFIG_X86_64
> > > +
> > > +/*
> > > + * CALL/JMP *%\reg
> > > + */
> > > +static int emit_indirect(int op, int reg, u8 *bytes)
> >
> > X86_64 is already equivalent to STACK_VALIDATION these days, but might
> > as well clarify here where the retpoline_sites dependency comes from by
> > changing this to '#ifdef CONFIG_STACK_VALIDATION'.
>
> Yeah, I was contemplating having x86_64 unconditionally select that.
> Maybe we should.
As far as I can tell, it already does that:
select HAVE_STACK_VALIDATION if X86_64
select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
> Also, I think I've proposed it before, but what about:
>
> s/STACK_VALIDATION/OBJTOOL/
>
> on all that?
How about keeping STACK_VALIDATION, but then having it depend on
OBJTOOL, in case anybody cares to extricate the two at some point. Some
objtool features (like this one) don't rely on the full validation.
--
Josh
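A sketch of the Kconfig shape Josh describes (hypothetical; neither symbol
was split this way at the time):

	config OBJTOOL
		bool

	config STACK_VALIDATION
		bool
		depends on OBJTOOL

Features that only need objtool's rewriting machinery (like .retpoline_sites
generation) could then hang off OBJTOOL directly, leaving STACK_VALIDATION
to mean the full stack-sanity checking.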
On Wed, Oct 13, 2021 at 02:49:07PM -0700, Josh Poimboeuf wrote:
> So you've got a WARN_ON_ONCE, a BUG_ON, and a too-deep feature check,
> all in the name of supporting this scheme. ok...
Mostly because I'm way too lazy to type out that table. With the thunks
being in .S, I really don't see that getting messed up; they're even in
their own special section too.
> If performance of the linear lookup were a real concern then you could
> just put rax and r11 at the beginning of the array.
That would mean the table would have to be { __thunk, reg_idx }, which
is even more yuck.
On Wed, Oct 13, 2021 at 03:05:20PM -0700, Josh Poimboeuf wrote:
> On Wed, Oct 13, 2021 at 11:43:43PM +0200, Peter Zijlstra wrote:
> > On Wed, Oct 13, 2021 at 02:11:18PM -0700, Josh Poimboeuf wrote:
> > > On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > > > +#ifdef CONFIG_X86_64
> > > > +
> > > > +/*
> > > > + * CALL/JMP *%\reg
> > > > + */
> > > > +static int emit_indirect(int op, int reg, u8 *bytes)
> > >
> > > X86_64 is already equivalent to STACK_VALIDATION these days, but might
> > > as well clarify here where the retpoline_sites dependency comes from by
> > > changing this to '#ifdef CONFIG_STACK_VALIDATION'.
> >
> > Yeah, I was contemplating having x86_64 unconditionally select that.
> > Maybe we should.
>
> As far as I can tell, it already does that:
>
> select HAVE_STACK_VALIDATION if X86_64
> select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
> select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
Oh right, I thought there was still a possible hole in there, but I
guess that's pretty solid. I suppose we should just remove the && ...
from the last line.
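With the && ... dropped as Peter suggests, the select lines would simply
read (a sketch of the same Kconfig fragment):

	select HAVE_STACK_VALIDATION		if X86_64
	select HAVE_STATIC_CALL_INLINE		if HAVE_STACK_VALIDATION
	select STACK_VALIDATION			if HAVE_STACK_VALIDATION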
On Wed, Oct 13, 2021 at 05:12:13PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 13, 2021 at 03:38:27PM +0100, Andrew Cooper wrote:
> > On 13/10/2021 13:22, Peter Zijlstra wrote:
> > > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > > +{
> > > + void (*target)(void);
> > > + int reg, i = 0;
> > > +
> > > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > > + return -1;
> > > +
> > > + target = addr + insn->length + insn->immediate.value;
> > > + reg = (target - &__x86_indirect_thunk_rax) /
> > > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
> >
> > This is equal measures beautiful and terrifying.
>
> Thanks! :-)
Would something like this appease people? If the toolchain can mess this
up everything is broken.
That makes the symtab look like:
(and arguably, that array symbol could be local)
...
35: 0000000000000000 512 NOTYPE GLOBAL DEFAULT 4 __x86_indirect_thunk_array
36: 0000000000000000 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_rax
37: 0000000000000020 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_rcx
38: 0000000000000040 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_rdx
39: 0000000000000060 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_rbx
40: 0000000000000080 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_rsp
41: 00000000000000a0 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_rbp
42: 00000000000000c0 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_rsi
43: 00000000000000e0 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_rdi
44: 0000000000000100 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_r8
45: 0000000000000120 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_r9
46: 0000000000000140 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_r10
47: 0000000000000160 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_r11
48: 0000000000000180 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_r12
49: 00000000000001a0 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_r13
50: 00000000000001c0 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_r14
51: 00000000000001e0 17 FUNC GLOBAL DEFAULT 4 __x86_indirect_thunk_r15
---
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -30,7 +30,7 @@
.align 32
-SYM_FUNC_START(__x86_indirect_thunk_\reg)
+SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
__stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
@@ -55,10 +55,16 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg)
#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
+ .align 32
+SYM_CODE_START(__x86_indirect_thunk_array)
+
#define GEN(reg) THUNK reg
#include <asm/GEN-for-each-reg.h>
#undef GEN
+ .align 32
+SYM_CODE_END(__x86_indirect_thunk_array)
+
#define GEN(reg) EXPORT_THUNK(reg)
#include <asm/GEN-for-each-reg.h>
#undef GEN
On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> +{
> + void (*target)(void);
> + int reg, i = 0;
> +
> + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> + return -1;
> +
> + target = addr + insn->length + insn->immediate.value;
> + reg = (target - &__x86_indirect_thunk_rax) /
> + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
I guess you should compute those values once so that it doesn't have to
do them for each function invocation. And it does them here when I look
at the asm it generates.
> +
> + if (WARN_ON_ONCE(reg & ~0xf))
> + return -1;
Sanity-checking the alignment of those thunks?
> +
> + i = emit_indirect(insn->opcode.bytes[0], reg, bytes);
> + if (i < 0)
> + return i;
> +
> + for (; i < insn->length;)
> + bytes[i++] = BYTES_NOP1;
Why not:
nop_len = insn->length - i;
if (nop_len) {
memcpy(&bytes[i], x86_nops[nop_len], nop_len);
i += nop_len;
}
and then you save yourself the optimize_nops() call because it'll take
the right-sized NOP directly.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Fri, Oct 15, 2021 at 04:24:08PM +0200, Borislav Petkov wrote:
> On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > +{
> > + void (*target)(void);
> > + int reg, i = 0;
> > +
> > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > + return -1;
> > +
> > + target = addr + insn->length + insn->immediate.value;
> > + reg = (target - &__x86_indirect_thunk_rax) /
> > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
>
> I guess you should compute those values once so that it doesn't have to
> do them for each function invocation. And it does them here when I look
> at the asm it generates.
Takes away the simplicity of the thing. It can't know these values at
compile time (due to external symbols etc.), although I suppose LTO
might be able to fix that.
Other than that, the above is the trivial form of reverse indexing an
array.
> > +
> > + if (WARN_ON_ONCE(reg & ~0xf))
> > + return -1;
>
> Sanity-checking the alignment of those thunks?
Nah, the target address of the instruction; if that's not a retpoline
thunk (for whatever raisin) then the computation will not result in a
valid reg and we should bail.
> > +
> > + i = emit_indirect(insn->opcode.bytes[0], reg, bytes);
> > + if (i < 0)
> > + return i;
> > +
> > + for (; i < insn->length;)
> > + bytes[i++] = BYTES_NOP1;
>
> Why not:
>
> nop_len = insn->length - i;
> if (nop_len) {
> memcpy(&bytes[i], x86_nops[nop_len], nop_len);
> i += nop_len;
> }
>
> and then you save yourself the optimize_nops() call because it'll take
> the right-sized NOP directly.
That's not immediately safe; if for some reason or other the original
instruction is 15 bytes long, and we generated 2 bytes, then we need 13
nop bytes, and the above will then do an out-of-bounds array access (due to
the nops array only doing 8-byte nops at max).
I wanted this code to be simple and obvious.
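For completeness, a bounds-safe variant of Boris's suggestion would chunk
the copy at ASM_NOP_MAX, which is essentially what the add_nops() helper
visible in the diff context above already does (a sketch, assuming the
kernel's x86_nops[] / ASM_NOP_MAX definitions):

	int nop_len = insn->length - i;

	while (nop_len > 0) {
		int chunk = min(nop_len, ASM_NOP_MAX);

		memcpy(&bytes[i], x86_nops[chunk], chunk);
		i += chunk;
		nop_len -= chunk;
	}

That saves the optimize_nops() pass, at the cost of exactly the kind of
extra logic Peter says he wants to avoid here.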
From: Peter Zijlstra <[email protected]>
Date: Fri, 15 Oct 2021 18:56:35 +0200
Hi,
Gave it a spin with Clang/LLVM, and
> On Fri, Oct 15, 2021 at 04:24:08PM +0200, Borislav Petkov wrote:
> > On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > > +{
> > > + void (*target)(void);
> > > + int reg, i = 0;
> > > +
> > > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > > + return -1;
> > > +
> > > + target = addr + insn->length + insn->immediate.value;
> > > + reg = (target - &__x86_indirect_thunk_rax) /
> > > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
this triggers
> > I guess you should compute those values once so that it doesn't have to
> > do them for each function invocation. And it does them here when I look
> > at the asm it generates.
>
> Takes away the simplicity of the thing. It can't know these values at
> compile time (due to external symbols etc..) although I suppose LTO
> might be able to fix that.
>
> Other than that, the above is the trivial form of reverse indexing an
> array.
>
> > > +
> > > + if (WARN_ON_ONCE(reg & ~0xf))
> > > + return -1;
this:
WARN in patch_retpoline:408: addr pcibios_scan_specific_bus+0x196/0x200, op 0xe8, reg 0xb88ca
WARN in patch_retpoline:408: addr xen_pv_teardown_msi_irqs+0x8d/0x120, op 0xe8, reg 0xb88ca
WARN in patch_retpoline:408: addr __mptcp_sockopt_sync+0x7e/0x200, op 0xe8, reg 0xb88ca
[...]
(thousands of them, but op == 0xe8 && reg == 0xb88ca are always the same)
I know this reg calculation is about to be replaced, but anyway ;)
> > Sanity-checking the alignment of those thunks?
>
> Nah, the target address of the instruction; if that's not a retpoline
> thunk (for whatever raisin) then the computation will not result in a
> valid reg and we should bail.
>
> > > +
> > > + i = emit_indirect(insn->opcode.bytes[0], reg, bytes);
> > > + if (i < 0)
> > > + return i;
> > > +
> > > + for (; i < insn->length;)
> > > + bytes[i++] = BYTES_NOP1;
> >
> > Why not:
> >
> > nop_len = insn->length - i;
> > if (nop_len) {
> > memcpy(&bytes[i], x86_nops[nop_len], nop_len);
> > i += nop_len;
> > }
> >
> > and then you save yourself the optimize_nops() call because it'll take
> > the right-sized NOP directly.
>
> That's not immediately safe; if for some reason or other the original
> instruction is 15 bytes long, and we generated 2 bytes, then we need 13
> nop bytes, the above will then do an out-of-bound array access (due to
> the nops array only doing 8 byte nops at max).
>
> I wanted this code to be simple and obvious.
Thanks,
Al
From: Alexander Lobakin <[email protected]>
Date: Mon, 18 Oct 2021 23:06:35 +0000
Sorry for double posting, should've included this from the start.
> Hi,
>
> Gave it a spin with Clang/LLVM, and
>
> > On Fri, Oct 15, 2021 at 04:24:08PM +0200, Borislav Petkov wrote:
> > > On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > > > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > > > +{
> > > > + void (*target)(void);
> > > > + int reg, i = 0;
> > > > +
> > > > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > > > + return -1;
> > > > +
> > > > + target = addr + insn->length + insn->immediate.value;
> > > > + reg = (target - &__x86_indirect_thunk_rax) /
> > > > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
>
> this triggers
>
> > > I guess you should compute those values once so that it doesn't have to
> > > do them for each function invocation. And it does them here when I look
> > > at the asm it generates.
> >
> > Takes away the simplicity of the thing. It can't know these values at
> > compile time (due to external symbols etc..) although I suppose LTO
> > might be able to fix that.
> >
> > Other than that, the above is the trivial form of reverse indexing an
> > array.
> >
> > > > +
> > > > + if (WARN_ON_ONCE(reg & ~0xf))
> > > > + return -1;
>
> this:
>
> WARN in patch_retpoline:408: addr pcibios_scan_specific_bus+0x196/0x200, op 0xe8, reg 0xb88ca
> WARN in patch_retpoline:408: addr xen_pv_teardown_msi_irqs+0x8d/0x120, op 0xe8, reg 0xb88ca
> WARN in patch_retpoline:408: addr __mptcp_sockopt_sync+0x7e/0x200, op 0xe8, reg 0xb88ca
> [...]
> (thousands of them, but op == 0xe8 && reg == 0xb88ca are always the same)
SMP alternatives: WARN in patch_retpoline:408: addr __strp_unpause+0x62/0x1b0/0xffffffff92d20a12, op 0xe8, reg 0xb88ca
SMP alternatives: insn->length: 5, insn->immediate.value: 0xffae0989
SMP alternatives: target: 0xffffffff928013a0/__x86_indirect_thunk_r11+0x0/0x20
SMP alternatives: rax: 0xffffffff9223cd50, target - rax: 0x5c4650
SMP alternatives: rcx - rax: 0x8
Imm value and addr are different each time; the rest are the same.
target is correct and even %pS works on it, but this distance
between the r11 and rax thunks (0x5c4650) doesn't look right, and
neither does rcx - rax being 0x8. Thunks are 0x11 bytes + alignment, so
the spacing should be 0x20, and it is, according to vmlinux.map. Weird.
The ampersands (&s), maybe?
> I know this reg calculation is about to be replaced, but anyway ;)
>
> > > Sanity-checking the alignment of those thunks?
> >
> > Nah, the target address of the instruction; if that's not a retpoline
> > thunk (for whatever raisin) then the computation will not result in a
> > valid reg and we should bail.
> >
> > > > +
> > > > + i = emit_indirect(insn->opcode.bytes[0], reg, bytes);
> > > > + if (i < 0)
> > > > + return i;
> > > > +
> > > > + for (; i < insn->length;)
> > > > + bytes[i++] = BYTES_NOP1;
> > >
> > > Why not:
> > >
> > > nop_len = insn->length - i;
> > > if (nop_len) {
> > > memcpy(&bytes[i], x86_nops[nop_len], nop_len);
> > > i += nop_len;
> > > }
> > >
> > > and then you save yourself the optimize_nops() call because it'll take
> > > the right-sized NOP directly.
> >
> > That's not immediately safe; if for some reason or other the original
> > instruction is 15 bytes long, and we generated 2 bytes, then we need 13
> > nop bytes, the above will then do an out-of-bound array access (due to
> > the nops array only doing 8 byte nops at max).
> >
> > I wanted this code to be simple and obvious.
>
> Thanks,
> Al
Thanks,
Al
On Mon, Oct 18, 2021 at 11:06:35PM +0000, Alexander Lobakin wrote:
> From: Peter Zijlstra <[email protected]>
> Date: Fri, 15 Oct 2021 18:56:35 +0200
>
> Hi,
>
> Gave it a spin with Clang/LLVM, and
Just your normal clang build, not some fancy LTO? e.g. make CC=clang.
> > > > + target = addr + insn->length + insn->immediate.value;
> > > > + reg = (target - &__x86_indirect_thunk_rax) /
> > > > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
>
> this triggers
>
> > > > +
> > > > + if (WARN_ON_ONCE(reg & ~0xf))
> > > > + return -1;
>
> this:
>
> WARN in patch_retpoline:408: addr pcibios_scan_specific_bus+0x196/0x200, op 0xe8, reg 0xb88ca
> WARN in patch_retpoline:408: addr xen_pv_teardown_msi_irqs+0x8d/0x120, op 0xe8, reg 0xb88ca
> WARN in patch_retpoline:408: addr __mptcp_sockopt_sync+0x7e/0x200, op 0xe8, reg 0xb88ca
> [...]
> (thousands of them, but op == 0xe8 && reg == 0xb88ca are always the same)
>
> I know this reg calculation is about to be replaced, but anyway ;)
Well, I was sorta hoping to keep that with something like:
https://lkml.kernel.org/r/[email protected]
Anyway, let me try and reproduce.
From: Alexander Lobakin <[email protected]>
Date: Tue, 19 Oct 2021 00:25:30 +0000
> From: Alexander Lobakin <[email protected]>
> Date: Mon, 18 Oct 2021 23:06:35 +0000
>
> Sorry for double posting, should've included this from the start.
>
> > Hi,
> >
> > Gave it a spin with Clang/LLVM, and
> >
> > > On Fri, Oct 15, 2021 at 04:24:08PM +0200, Borislav Petkov wrote:
> > > > On Wed, Oct 13, 2021 at 02:22:21PM +0200, Peter Zijlstra wrote:
> > > > > +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
> > > > > +{
> > > > > + void (*target)(void);
> > > > > + int reg, i = 0;
> > > > > +
> > > > > + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
> > > > > + return -1;
> > > > > +
> > > > > + target = addr + insn->length + insn->immediate.value;
> > > > > + reg = (target - &__x86_indirect_thunk_rax) /
> > > > > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
> >
> > this triggers
> >
> > > > I guess you should compute those values once so that it doesn't have to
> > > > do them for each function invocation. And it does them here when I look
> > > > at the asm it generates.
> > >
> > > Takes away the simplicity of the thing. It can't know these values at
> > > compile time (due to external symbols etc..) although I suppose LTO
> > > might be able to fix that.
> > >
> > > Other than that, the above is the trivial form of reverse indexing an
> > > array.
> > >
> > > > > +
> > > > > + if (WARN_ON_ONCE(reg & ~0xf))
> > > > > + return -1;
> >
> > this:
> >
> > WARN in patch_retpoline:408: addr pcibios_scan_specific_bus+0x196/0x200, op 0xe8, reg 0xb88ca
> > WARN in patch_retpoline:408: addr xen_pv_teardown_msi_irqs+0x8d/0x120, op 0xe8, reg 0xb88ca
> > WARN in patch_retpoline:408: addr __mptcp_sockopt_sync+0x7e/0x200, op 0xe8, reg 0xb88ca
> > [...]
> > (thousands of them, but op == 0xe8 && reg == 0xb88ca are always the same)
>
> SMP alternatives: WARN in patch_retpoline:408: addr __strp_unpause+0x62/0x1b0/0xffffffff92d20a12, op 0xe8, reg 0xb88ca
> SMP alternatives: insn->length: 5, insn->immediate.value: 0xffae0989
> SMP alternatives: target: 0xffffffff928013a0/__x86_indirect_thunk_r11+0x0/0x20
> SMP alternatives: rax: 0xffffffff9223cd50, target - rax: 0x5c4650
> SMP alternatives: rcx - rax: 0x8
>
> Imm value and addr are different each time, the rest are the same.
> target is correct and even %pS works on it, but this distance
> between r11 and rax thunks (0x5c4650) doesn't look fine, as well as
> rcx - rax being 0x8. Thunks are 0x11 sized + alignment, should be
> 0x20, and it is, according to vmlinux.map. Weird. Amps/&s?
Oh okay, it's because of ClangCFI:
SMP alternatives: You were looking for __typeid__ZTSFvvE_global_addr+0x370/0x1410 at 0xffffffffa523cd60,>
SMP alternatives: rax is __typeid__ZTSFvvE_global_addr+0x360/0x1410 at 0xffffffffa523cd50
Sorry for the confusion; seems like it's a side effect of using it on
Clang 12 while the original series supports only 13+. I'll double
check and let you know if I find something.
[ snip ]
Thanks,
Al
On Tue, Oct 19, 2021 at 11:40:56AM +0200, Peter Zijlstra wrote:
> On Mon, Oct 18, 2021 at 11:06:35PM +0000, Alexander Lobakin wrote:
> > WARN in patch_retpoline:408: addr pcibios_scan_specific_bus+0x196/0x200, op 0xe8, reg 0xb88ca
> > WARN in patch_retpoline:408: addr xen_pv_teardown_msi_irqs+0x8d/0x120, op 0xe8, reg 0xb88ca
> > WARN in patch_retpoline:408: addr __mptcp_sockopt_sync+0x7e/0x200, op 0xe8, reg 0xb88ca
> > [...]
> > (thousands of them, but op == 0xe8 && reg == 0xb88ca are always the same)
> >
> > I know this reg calculation is about to be replaced, but anyway ;)
>
> Well, I was sorta hoping to keep that with something like:
>
> https://lkml.kernel.org/r/[email protected]
>
> Anyway, let me try and reproduce.
Using: make O=defconfig CC=clang-13 -j80 -s
(the defconfig is actually defconfig+kvm_guest.config,
and clang-13 from debian/testing)
on https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=objtool/core
It seems to 'just' work:
spectre_v2=retpoline,amd:
gets me two very sad and lonely replacements:
[] SMP alternatives: retpoline at: 0xffffffff82e764b4 (ffffffff82e764b4) len: 5 to: __x86_indirect_thunk_rbx+0x0/0x20
[] SMP alternatives: ffffffff82e764b4: orig: e8 07 d3 18 ff
[] SMP alternatives: ffffffff82e764b4: repl: 0f ae e8 ff d3
[] SMP alternatives: retpoline at: 0xffffffff82e76f39 (ffffffff82e76f39) len: 5 to: __x86_indirect_thunk_rdi+0x0/0x20
[] SMP alternatives: ffffffff82e76f39: orig: e8 02 c9 18 ff
[] SMP alternatives: ffffffff82e76f39: repl: 0f ae e8 ff d7
The rest is R11 and therefore doesn't actually fit.
For spectre_v2=off everything gets replaced, and that also seems happy.
[] SMP alternatives: retpoline at: pm_check_save_msr+0x24/0x30 (ffffffff81d2ef24) len: 5 to: __x86_indirect_thunk_r11+0x0/0x20
[] SMP alternatives: ffffffff82603eb0: [3:5) optimized NOPs: 41 ff d3 66 90
[] SMP alternatives: ffffffff81d2ef24: orig: e8 97 49 2d 00
[] SMP alternatives: ffffffff81d2ef24: repl: 41 ff d3 66 90
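For readers decoding the dmesg bytes above (my annotation, not part of the
original report):

	e8 xx xx xx xx   original CALL rel32 to the thunk
	0f ae e8         LFENCE
	ff d3 / ff d7    CALL *%rbx / CALL *%rdi
	41 ff d3         CALL *%r11 (REX.B prefix for r8-r15)
	66 90            2-byte NOP padding

So the retpoline,amd replacement is LFENCE + CALL *%reg, which is 5 bytes
only for the REX-less registers; an r8-r15 register needs 6 bytes and no
longer fits the 5-byte call site, hence the R11 remark above. For
spectre_v2=off there is no LFENCE, so CALL *%r11 plus a 2-byte NOP
(41 ff d3 66 90) fits fine.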
+ Sami
(Sami, for context:
https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=objtool/core
which contains the following code:
+ void (*target)(void);
+ int reg, i = 0;
+
+ target = addr + insn->length + insn->immediate.value;
+ reg = (target - &__x86_indirect_thunk_rax) /
+ (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
+
+ if (WARN_ON_ONCE(reg & ~0xf))
+ return -1;
which blows up something fierce on clang-cfi)
On Tue, Oct 19, 2021 at 09:47:26AM +0000, Alexander Lobakin wrote:
> Oh okay, it's because of ClangCFI:
>
> SMP alternatives: You were looking for __typeid__ZTSFvvE_global_addr+0x370/0x1410 at 0xffffffffa523cd60,>
> SMP alternatives: rax is __typeid__ZTSFvvE_global_addr+0x360/0x1410 at 0xffffffffa523cd50
>
> Sorry for the confusion; seems like it's a side effect of using it on
> Clang 12 while the original series supports only 13+. I'll double
> check and let you know if I find something.
I'm thinking CFI will totally screw this up regardless, seeing how a
function pointer is taken, and the CFI magicks will turn that into one
of those weird trampolines instead of the actual symbol.
The compiler could of course deduce that these addresses are never
called and don't escape the function, and therefore doesn't need to do
the CFI transformation on them, but I'm guessing it isn't quite that
clever.
Also doing CFI on retpoline thunks seems 'weird'; they have a very
particular calling convention, explicitly very much not the standard C
one. Can't we mark them using asmlinkage or something to tell the
compiler to politely 'bugger off' or somesuch ;-)
On Wed, Oct 13, 2021 at 01:52:59PM -0700, Josh Poimboeuf wrote:
> BTW, CALL_NOSPEC results in a retpoline site in .altinstr_replacement:
>
> Relocation section [40] '.rela.retpoline_sites' for section [39] '.retpoline_sites' at offset 0x8d28 contains 1 entry:
> Offset Type Value Addend Name
> 000000000000000000 X86_64_PC32 000000000000000000 +10 .altinstr_replacement
>
> Which I assume we don't want.
(I missed this initially, and just independently rediscovered it)
In principle this problem affects static_call_list, the __sanitizer_cov_
and __fentry__ and now retpoline_sites.
Granted, it seems really unlikely to find __fentry__ or __sanitizer_cov_
references in alternatives, but it should be trivial to manually create
one.
I'm thinking we want to exclude all those when found in
.altinstr_replacement, right? It just doesn't make sense to rewrite
replacement text.
How is something like the below? (I'm not completely happy with it, but
I couldn't think of something better either).
---
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1039,14 +1039,39 @@ static void remove_insn_ops(struct instr
}
}
+#define DEST_RETPOLINE ((void *)-1L)
+
static void add_call_dest(struct objtool_file *file, struct instruction *insn,
struct symbol *dest, bool sibling)
{
struct reloc *reloc = insn_reloc(file, insn);
- insn->call_dest = dest;
- if (!dest)
+ if (dest != DEST_RETPOLINE) {
+ insn->call_dest = dest;
+ if (!dest)
+ return;
+ }
+
+ /*
+ * Whatever stack impact regular CALLs have, should be undone
+ * by the RETURN of the called function.
+ *
+ * Annotated intra-function calls retain the stack_ops but
+ * are converted to JUMP, see read_intra_function_calls().
+ */
+ remove_insn_ops(insn);
+
+ /*
+ * Whatever we do, do not rewrite replacement text.
+ */
+ if (!strcmp(insn->sec->name, ".altinstr_replacement"))
+ return;
+
+ if (dest == DEST_RETPOLINE) {
+ list_add_tail(&insn->call_node,
+ &file->retpoline_call_list);
return;
+ }
if (insn->call_dest->static_call_tramp) {
list_add_tail(&insn->call_node,
@@ -1091,15 +1116,6 @@ static void add_call_dest(struct objtool
list_add_tail(&insn->mcount_loc_node,
&file->mcount_loc_list);
}
-
- /*
- * Whatever stack impact regular CALLs have, should be undone
- * by the RETURN of the called function.
- *
- * Annotated intra-function calls retain the stack_ops but
- * are converted to JUMP, see read_intra_function_calls().
- */
- remove_insn_ops(insn);
}
/*
@@ -1133,10 +1149,9 @@ static int add_jump_destinations(struct
else
insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL;
- list_add_tail(&insn->call_node,
- &file->retpoline_call_list);
-
insn->retpoline_safe = true;
+
+ add_call_dest(file, insn, DEST_RETPOLINE, true);
continue;
} else if (insn->func) {
/* internal or external sibling call (with reloc) */
@@ -1272,12 +1287,7 @@ static int add_call_destinations(struct
insn->type = INSN_CALL_DYNAMIC;
insn->retpoline_safe = true;
- list_add_tail(&insn->call_node,
- &file->retpoline_call_list);
-
- remove_insn_ops(insn);
- continue;
-
+ add_call_dest(file, insn, DEST_RETPOLINE, false);
} else
add_call_dest(file, insn, reloc->sym, false);
}
On Tue, Oct 19, 2021 at 3:17 AM Peter Zijlstra <[email protected]> wrote:
>
>
> + Sami
>
> (Sami, for context:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=objtool/core
>
> which contains the following code:
>
> + void (*target)(void);
> + int reg, i = 0;
> +
> + target = addr + insn->length + insn->immediate.value;
> + reg = (target - &__x86_indirect_thunk_rax) /
> + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
> +
> + if (WARN_ON_ONCE(reg & ~0xf))
> + return -1;
>
> which blows up something fierce on clang-cfi)
>
> On Tue, Oct 19, 2021 at 09:47:26AM +0000, Alexander Lobakin wrote:
>
> > Oh okay, it's because of ClangCFI:
> >
> > SMP alternatives: You were looking for __typeid__ZTSFvvE_global_addr+0x370/0x1410 at 0xffffffffa523cd60,>
> > SMP alternatives: rax is __typeid__ZTSFvvE_global_addr+0x360/0x1410 at 0xffffffffa523cd50
> >
> > Sorry for the confusion; seems like it's a side effect of using it on
> > Clang 12 while the original series supports only 13+. I'll double
> > check and let you know if I find something.
>
> I'm thinking CFI will totally screw this up regardless, seeing how a
> function pointer is taken, and the CFI magicks will turn that into one
> of those weird trampolines instead of the actual symbol.
>
> The compiler could of course deduce that these addresses are never
> called and don't escape the function, and therefore doesn't need to do
> the CFI transformation on them, but I'm guessing it isn't quite that
> clever.
Yes, it's unfortunately not that clever.
> Also doing CFI on retpoline thunks seems 'weird'; they have a very
> particular calling convention, explicitly very much not the standard C
> one. Can't we mark them using asmlinkage or something to tell the
> compiler to politely 'bugger off' or somesuch ;-)
I confirmed that using an opaque type for the thunk declaration fixes
this issue with CFI. It also makes it obvious that these are not
callable from C code.
Sami
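A sketch of the opaque-type idea Sami confirms (my guess at the shape, not
his actual patch): declare the thunks as objects of an incomplete type, so
the compiler cannot form a CFI-instrumented function pointer to them, and
do the index arithmetic on byte addresses:

	/* hypothetical replacement for the function-typed declarations */
	struct retpoline_thunk;	/* deliberately incomplete: address-only */

	extern struct retpoline_thunk __x86_indirect_thunk_rax;
	extern struct retpoline_thunk __x86_indirect_thunk_rcx;

	/* patch_retpoline() then computes on byte addresses: */
	void *target = addr + insn->length + insn->immediate.value;
	int reg = (target - (void *)&__x86_indirect_thunk_rax) /
		  ((void *)&__x86_indirect_thunk_rcx -
		   (void *)&__x86_indirect_thunk_rax);

Since the type can never be called or dereferenced from C, there is nothing
for CFI to instrument.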
On Tue, Oct 19, 2021 at 01:37:09PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 13, 2021 at 01:52:59PM -0700, Josh Poimboeuf wrote:
>
> > BTW, CALL_NOSPEC results in a retpoline site in .altinstr_replacement:
> >
> > Relocation section [40] '.rela.retpoline_sites' for section [39] '.retpoline_sites' at offset 0x8d28 contains 1 entry:
> > Offset Type Value Addend Name
> > 000000000000000000 X86_64_PC32 000000000000000000 +10 .altinstr_replacement
> >
> > Which I assume we don't want.
>
> (I missed this initially, and just independently rediscovered it)
>
> In principle this problem affects static_call_list, the __sanitizer_cov_
> and __fentry__ and now retpoline_sites.
>
> Granted, it seems really unlikely to find __fentry__ or __sanitizer_cov_
> references in alternatives, but it should be trivial to manually create
> one.
>
> I'm thinking we want to exclude all those when found in
> .altinstr_replacement, right? It just doesn't make sense to rewrite
> replacement text.
Right.
(Someday, if it made sense to do so, objtool could put the annotation at
the original replaced instruction. Then the kernel self-patching code
could run after alternatives patching and could then decide whether the
annotation is relevant or not. But right now I can't think of any
scenario where that would be remotely sane.)
> How is something like the below? (I'm not completely happy with it, but
> I couldn't think of something better either).
How about something like this?
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 7c865a10372a..90d51c294034 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -993,18 +993,31 @@ static void remove_insn_ops(struct instruction *insn)
}
}
-static void add_call_dest(struct objtool_file *file, struct instruction *insn,
- struct symbol *dest, bool sibling)
+static void annotate_call_site(struct objtool_file *file,
+ struct instruction *insn, bool sibling)
{
struct reloc *reloc = insn_reloc(file, insn);
- insn->call_dest = dest;
- if (!dest)
+ if (!insn->call_dest)
+ return;
+
+ /*
+ * Alternative replacement code is just template code which is
+ * sometimes copied to the original instruction. For now, don't
+ * annotate it. (In the future we might consider annotating the
+ * original instruction if/when it ever makes sense to do so.)
+ */
+ if (!strcmp(insn->sec->name, ".altinstr_replacement"))
+ return;
+
+ if (insn->call_dest->retpoline) {
+ list_add_tail(&insn->call_node, &file->retpoline_call_list);
return;
+ }
if (insn->call_dest->static_call_tramp) {
- list_add_tail(&insn->call_node,
- &file->static_call_list);
+ list_add_tail(&insn->call_node, &file->static_call_list);
+ return;
}
/*
@@ -1025,6 +1038,7 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn,
: arch_nop_insn(insn->len));
insn->type = sibling ? INSN_RETURN : INSN_NOP;
+ return;
}
if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) {
@@ -1042,9 +1056,17 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn,
insn->type = INSN_NOP;
- list_add_tail(&insn->mcount_loc_node,
- &file->mcount_loc_list);
+ list_add_tail(&insn->mcount_loc_node, &file->mcount_loc_list);
+ return;
}
+}
+
+static void add_call_dest(struct objtool_file *file, struct instruction *insn,
+ struct symbol *dest, bool sibling)
+{
+ insn->call_dest = dest;
+
+ annotate_call_site(file, insn, sibling);
/*
* Whatever stack impact regular CALLs have, should be undone
@@ -1053,7 +1075,9 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn,
* Annotated intra-function calls retain the stack_ops but
* are converted to JUMP, see read_intra_function_calls().
*/
- remove_insn_ops(insn);
+ if (dest)
+ remove_insn_ops(insn);
+
}
/*
@@ -1077,7 +1101,7 @@ static int add_jump_destinations(struct objtool_file *file)
} else if (reloc->sym->type == STT_SECTION) {
dest_sec = reloc->sym->sec;
dest_off = arch_dest_reloc_offset(reloc->addend);
- } else if (arch_is_retpoline(reloc->sym)) {
+ } else if (reloc->sym->retpoline) {
/*
* Retpoline jumps are really dynamic jumps in
* disguise, so convert them accordingly.
@@ -1087,9 +1111,7 @@ static int add_jump_destinations(struct objtool_file *file)
else
insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL;
- list_add_tail(&insn->call_node,
- &file->retpoline_call_list);
-
+ add_call_dest(file, insn, reloc->sym, true);
insn->retpoline_safe = true;
continue;
} else if (insn->func) {
@@ -1218,20 +1240,14 @@ static int add_call_destinations(struct objtool_file *file)
add_call_dest(file, insn, dest, false);
- } else if (arch_is_retpoline(reloc->sym)) {
+ } else if (reloc->sym->retpoline) {
/*
* Retpoline calls are really dynamic calls in
* disguise, so convert them accordingly.
*/
insn->type = INSN_CALL_DYNAMIC;
+ add_call_dest(file, insn, reloc->sym, false);
insn->retpoline_safe = true;
-
- list_add_tail(&insn->call_node,
- &file->retpoline_call_list);
-
- remove_insn_ops(insn);
- continue;
-
} else
add_call_dest(file, insn, reloc->sym, false);
}
@@ -1916,8 +1932,25 @@ static int read_static_call_tramps(struct objtool_file *file)
list_for_each_entry(func, &sec->symbol_list, list) {
if (func->bind == STB_GLOBAL &&
!strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR,
- strlen(STATIC_CALL_TRAMP_PREFIX_STR)))
+ strlen(STATIC_CALL_TRAMP_PREFIX_STR))) {
func->static_call_tramp = true;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int read_retpoline_thunks(struct objtool_file *file)
+{
+ struct section *sec;
+ struct symbol *func;
+
+ for_each_sec(file, sec) {
+ list_for_each_entry(func, &sec->symbol_list, list) {
+ if (func->bind == STB_GLOBAL && arch_is_retpoline(func)) {
+ func->retpoline = true;
+ }
}
}
@@ -1980,13 +2013,16 @@ static int decode_sections(struct objtool_file *file)
if (ret)
return ret;
- /*
- * Must be before add_{jump_call}_destination.
- */
+ /* Must be before add_{jump_call}_destination. */
ret = read_static_call_tramps(file);
if (ret)
return ret;
+ /* Must be before add_{jump_call}_destination. */
+ ret = read_retpoline_thunks(file);
+ if (ret)
+ return ret;
+
/*
* Must be before add_special_section_alts() as that depends on
* jump_dest being set.
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index 9bdc7c757bf8..b773366dfb52 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -56,6 +56,7 @@ struct symbol {
struct symbol *pfunc, *cfunc, *alias;
bool uaccess_safe;
bool static_call_tramp;
+ bool retpoline;
struct list_head pv_target;
};
On Tue, Oct 19, 2021 at 09:47:02AM -0700, Josh Poimboeuf wrote:
> On Tue, Oct 19, 2021 at 01:37:09PM +0200, Peter Zijlstra wrote:
> > On Wed, Oct 13, 2021 at 01:52:59PM -0700, Josh Poimboeuf wrote:
> >
> > > BTW, CALL_NOSPEC results in a retpoline site in .altinstr_replacement:
> > >
> > > Relocation section [40] '.rela.retpoline_sites' for section [39] '.retpoline_sites' at offset 0x8d28 contains 1 entry:
> > > Offset Type Value Addend Name
> > > 000000000000000000 X86_64_PC32 000000000000000000 +10 .altinstr_replacement
> > >
> > > Which I assume we don't want.
> >
> > (I missed this initially, and just independently rediscovered it)
> >
> > In principle this problem affects static_call_list, the __sanitizer_cov_
> > and __fentry__ and now retpoline_sites.
> >
> > Granted, it seems really unlikely to find __fentry__ or __sanitizer_cov_
> > references in alternatives, but it should be trivial to manually create
> > one.
> >
> > I'm thinking we want to exclude all those when found in
> > .altinstr_replacement, right? It just doesn't make sense to rewrite
> > replacement text.
>
> Right.
>
> (Someday, if it made sense to do so, objtool could put the annotation at
> the original replaced instruction. Then the kernel self-patching code
> could run after alternatives patching and could then decide whether the
> annotation is relevant or not. But right now I can't think of any
> scenario where that would be remotely sane.)
>
> > How is something like the below? (I'm not completely happy with it, but
> > I couldn't think of something better either).
>
> How about something like this?
Slightly improved version:
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 7c865a10372a..e0892632ef4a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -993,18 +993,28 @@ static void remove_insn_ops(struct instruction *insn)
}
}
-static void add_call_dest(struct objtool_file *file, struct instruction *insn,
- struct symbol *dest, bool sibling)
+static void annotate_call_site(struct objtool_file *file,
+ struct instruction *insn, bool sibling)
{
struct reloc *reloc = insn_reloc(file, insn);
- insn->call_dest = dest;
- if (!dest)
+ /*
+ * Alternative replacement code is just template code which is
+ * sometimes copied to the original instruction. For now, don't
+ * annotate it. (In the future we might consider annotating the
+ * original instruction if/when it ever makes sense to do so.)
+ */
+ if (!strcmp(insn->sec->name, ".altinstr_replacement"))
return;
+ if (insn->call_dest->retpoline) {
+ list_add_tail(&insn->call_node, &file->retpoline_call_list);
+ return;
+ }
+
if (insn->call_dest->static_call_tramp) {
- list_add_tail(&insn->call_node,
- &file->static_call_list);
+ list_add_tail(&insn->call_node, &file->static_call_list);
+ return;
}
/*
@@ -1025,6 +1035,7 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn,
: arch_nop_insn(insn->len));
insn->type = sibling ? INSN_RETURN : INSN_NOP;
+ return;
}
if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) {
@@ -1042,9 +1053,19 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn,
insn->type = INSN_NOP;
- list_add_tail(&insn->mcount_loc_node,
- &file->mcount_loc_list);
+ list_add_tail(&insn->mcount_loc_node, &file->mcount_loc_list);
+ return;
}
+}
+
+static void add_call_dest(struct objtool_file *file, struct instruction *insn,
+ struct symbol *dest, bool sibling)
+{
+ insn->call_dest = dest;
+ if (!dest)
+ return;
+
+ annotate_call_site(file, insn, sibling);
/*
* Whatever stack impact regular CALLs have, should be undone
@@ -1054,6 +1075,7 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn,
* are converted to JUMP, see read_intra_function_calls().
*/
remove_insn_ops(insn);
+
}
/*
@@ -1077,7 +1099,7 @@ static int add_jump_destinations(struct objtool_file *file)
} else if (reloc->sym->type == STT_SECTION) {
dest_sec = reloc->sym->sec;
dest_off = arch_dest_reloc_offset(reloc->addend);
- } else if (arch_is_retpoline(reloc->sym)) {
+ } else if (reloc->sym->retpoline) {
/*
* Retpoline jumps are really dynamic jumps in
* disguise, so convert them accordingly.
@@ -1087,9 +1109,7 @@ static int add_jump_destinations(struct objtool_file *file)
else
insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL;
- list_add_tail(&insn->call_node,
- &file->retpoline_call_list);
-
+ add_call_dest(file, insn, reloc->sym, true);
insn->retpoline_safe = true;
continue;
} else if (insn->func) {
@@ -1218,20 +1238,14 @@ static int add_call_destinations(struct objtool_file *file)
add_call_dest(file, insn, dest, false);
- } else if (arch_is_retpoline(reloc->sym)) {
+ } else if (reloc->sym->retpoline) {
/*
* Retpoline calls are really dynamic calls in
* disguise, so convert them accordingly.
*/
insn->type = INSN_CALL_DYNAMIC;
+ add_call_dest(file, insn, reloc->sym, false);
insn->retpoline_safe = true;
-
- list_add_tail(&insn->call_node,
- &file->retpoline_call_list);
-
- remove_insn_ops(insn);
- continue;
-
} else
add_call_dest(file, insn, reloc->sym, false);
}
@@ -1916,8 +1930,25 @@ static int read_static_call_tramps(struct objtool_file *file)
list_for_each_entry(func, &sec->symbol_list, list) {
if (func->bind == STB_GLOBAL &&
!strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR,
- strlen(STATIC_CALL_TRAMP_PREFIX_STR)))
+ strlen(STATIC_CALL_TRAMP_PREFIX_STR))) {
func->static_call_tramp = true;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int read_retpoline_thunks(struct objtool_file *file)
+{
+ struct section *sec;
+ struct symbol *func;
+
+ for_each_sec(file, sec) {
+ list_for_each_entry(func, &sec->symbol_list, list) {
+ if (func->bind == STB_GLOBAL && arch_is_retpoline(func)) {
+ func->retpoline = true;
+ }
}
}
@@ -1980,13 +2011,16 @@ static int decode_sections(struct objtool_file *file)
if (ret)
return ret;
- /*
- * Must be before add_{jump_call}_destination.
- */
+ /* Must be before add_{jump_call}_destination. */
ret = read_static_call_tramps(file);
if (ret)
return ret;
+ /* Must be before add_{jump_call}_destination. */
+ ret = read_retpoline_thunks(file);
+ if (ret)
+ return ret;
+
/*
* Must be before add_special_section_alts() as that depends on
* jump_dest being set.
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index 9bdc7c757bf8..b773366dfb52 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -56,6 +56,7 @@ struct symbol {
struct symbol *pfunc, *cfunc, *alias;
bool uaccess_safe;
bool static_call_tramp;
+ bool retpoline;
struct list_head pv_target;
};
From: Sami Tolvanen <[email protected]>
Date: Tue, 19 Oct 2021 08:37:14 -0700
> On Tue, Oct 19, 2021 at 3:17 AM Peter Zijlstra <[email protected]> wrote:
> >
> >
> > + Sami
> >
> > (Sami, for context:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=objtool/core
> >
> > which contains the following code:
> >
> > + void (*target)(void);
> > + int reg, i = 0;
> > +
> > + target = addr + insn->length + insn->immediate.value;
> > + reg = (target - &__x86_indirect_thunk_rax) /
> > + (&__x86_indirect_thunk_rcx - &__x86_indirect_thunk_rax);
> > +
> > + if (WARN_ON_ONCE(reg & ~0xf))
> > + return -1;
> >
> > which blows up something fierce on clang-cfi)
> >
> > On Tue, Oct 19, 2021 at 09:47:26AM +0000, Alexander Lobakin wrote:
> >
> > > Oh okay, it's because of ClangCFI:
> > >
> > > SMP alternatives: You were looking for __typeid__ZTSFvvE_global_addr+0x370/0x1410 at 0xffffffffa523cd60,>
> > > SMP alternatives: rax is __typeid__ZTSFvvE_global_addr+0x360/0x1410 at 0xffffffffa523cd50
> > >
> > > Sorry for the confusion; seems like it's a side effect of using it on
> > > Clang 12 while the original series supports only 13+. I'll double
> > > check and let you know if I find something.
> >
> > I'm thinking CFI will totally screw this up regardless, seeing how a
> > function pointer is taken, and the CFI magicks will turn that into one
> > of those weird trampolines instead of the actual symbol.
> >
> > The compiler could of course deduce that these addresses are never
> > called and don't escape the function, and therefore doesn't need to do
> > the CFI transformation on them, but I'm guessing it isn't quite that
> > clever.
>
> Yes, it's unfortunately not that clever.
>
> > Also doing CFI on retpoline thunks seems 'weird'; they have a very
> > particular calling convention, explicitly very much not the standard C
> > one. Can't we mark them using asmlinkage or something to tell the
> > compiler to politely 'bugger off' or somesuch ;-)
>
> I confirmed that using an opaque type for the thunk declaration fixes
> this issue with CFI. It also makes it obvious that these are not
> callable from C code.
Oh, glad we caught this then, much thanks y'all!
> Sami
Thanks,
Al
On Tue, Oct 19, 2021 at 09:49:13AM -0700, Josh Poimboeuf wrote:
> @@ -1916,8 +1930,25 @@ static int read_static_call_tramps(struct objtool_file *file)
> list_for_each_entry(func, &sec->symbol_list, list) {
> if (func->bind == STB_GLOBAL &&
> !strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR,
> - strlen(STATIC_CALL_TRAMP_PREFIX_STR)))
> + strlen(STATIC_CALL_TRAMP_PREFIX_STR))) {
> func->static_call_tramp = true;
> + }
> + }
> + }
> +
> + return 0;
> +}
> +
> +static int read_retpoline_thunks(struct objtool_file *file)
> +{
> + struct section *sec;
> + struct symbol *func;
> +
> + for_each_sec(file, sec) {
> + list_for_each_entry(func, &sec->symbol_list, list) {
> + if (func->bind == STB_GLOBAL && arch_is_retpoline(func)) {
> + func->retpoline = true;
> + }
> }
> }
I've folded these two identical loops over all symbols into a single
classify_symbols() function.
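A sketch of the folded helper, merging the two loops quoted above (the
exact shape is my reconstruction):

	static int classify_symbols(struct objtool_file *file)
	{
		struct section *sec;
		struct symbol *func;

		for_each_sec(file, sec) {
			list_for_each_entry(func, &sec->symbol_list, list) {
				if (func->bind != STB_GLOBAL)
					continue;

				if (!strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR,
					     strlen(STATIC_CALL_TRAMP_PREFIX_STR)))
					func->static_call_tramp = true;

				if (arch_is_retpoline(func))
					func->retpoline = true;
			}
		}

		return 0;
	}

One walk over the symbol table then replaces both read_static_call_tramps()
and read_retpoline_thunks() in decode_sections().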
On Tue, Oct 19, 2021 at 09:49:13AM -0700, Josh Poimboeuf wrote:
> @@ -1087,9 +1109,7 @@ static int add_jump_destinations(struct objtool_file *file)
> else
> insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL;
>
> - list_add_tail(&insn->call_node,
> - &file->retpoline_call_list);
> -
> + add_call_dest(file, insn, reloc->sym, true);
> insn->retpoline_safe = true;
> continue;
> } else if (insn->func) {
> @@ -1218,20 +1238,14 @@ static int add_call_destinations(struct objtool_file *file)
>
> add_call_dest(file, insn, dest, false);
>
> - } else if (arch_is_retpoline(reloc->sym)) {
> + } else if (reloc->sym->retpoline) {
> /*
> * Retpoline calls are really dynamic calls in
> * disguise, so convert them accordingly.
> */
> insn->type = INSN_CALL_DYNAMIC;
> + add_call_dest(file, insn, reloc->sym, false);
> insn->retpoline_safe = true;
> -
> - list_add_tail(&insn->call_node,
> - &file->retpoline_call_list);
> -
> - remove_insn_ops(insn);
> - continue;
> -
> } else
> add_call_dest(file, insn, reloc->sym, false);
> }
So the problem with add_call_dest() like this is that the instructions
now get to have ->call_dest set, which is really strange for
INSN_CALL_DYNAMIC etc. At the very least it makes call_dest_name()
misbehave.
I've added add_retpoline_call() that also does the ->type frobbing and
->retpoline_safe marking instead.
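To close the thread out, a sketch of what such an add_retpoline_call()
could look like, combining the ->type conversion and ->retpoline_safe
marking described above with the .altinstr_replacement exclusion from
annotate_call_site() (my reconstruction, not the final patch):

	static void add_retpoline_call(struct objtool_file *file,
				       struct instruction *insn)
	{
		/*
		 * Retpoline calls/jumps are really dynamic calls/jumps in
		 * disguise, so convert them accordingly.
		 */
		switch (insn->type) {
		case INSN_CALL:
			insn->type = INSN_CALL_DYNAMIC;
			break;
		case INSN_JUMP_UNCONDITIONAL:
			insn->type = INSN_JUMP_DYNAMIC;
			break;
		case INSN_JUMP_CONDITIONAL:
			insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL;
			break;
		default:
			return;
		}

		insn->retpoline_safe = true;

		/* Stack impact is undone by the eventual return. */
		remove_insn_ops(insn);

		/* Never rewrite alternative replacement text. */
		if (!strcmp(insn->sec->name, ".altinstr_replacement"))
			return;

		list_add_tail(&insn->call_node, &file->retpoline_call_list);
	}

This keeps ->call_dest unset for the dynamic case, so call_dest_name()
stays well behaved.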