A little while ago someone (Kirill) ran into the whole 'alternatives don't
do relocations nonsense' again and I got annoyed enough to actually look
at the code.
Since the whole alternative machinery already fully decodes the
instructions it is simple enough to adjust immediates and displacement
when needed. Specifically, the immediates for IP modifying instructions
(JMP, CALL, Jcc) and the displacement for RIP-relative instructions.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/kernel/alternative.c | 248 +++++++++++++++++++++++++--------------
tools/objtool/arch/x86/special.c | 8 -
2 files changed, 161 insertions(+), 95 deletions(-)
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -123,71 +123,6 @@ extern s32 __smp_locks[], __smp_locks_en
void text_poke_early(void *addr, const void *opcode, size_t len);
/*
- * Are we looking at a near JMP with a 1 or 4-byte displacement.
- */
-static inline bool is_jmp(const u8 opcode)
-{
- return opcode == 0xeb || opcode == 0xe9;
-}
-
-static void __init_or_module
-recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
-{
- u8 *next_rip, *tgt_rip;
- s32 n_dspl, o_dspl;
- int repl_len;
-
- if (a->replacementlen != 5)
- return;
-
- o_dspl = *(s32 *)(insn_buff + 1);
-
- /* next_rip of the replacement JMP */
- next_rip = repl_insn + a->replacementlen;
- /* target rip of the replacement JMP */
- tgt_rip = next_rip + o_dspl;
- n_dspl = tgt_rip - orig_insn;
-
- DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
-
- if (tgt_rip - orig_insn >= 0) {
- if (n_dspl - 2 <= 127)
- goto two_byte_jmp;
- else
- goto five_byte_jmp;
- /* negative offset */
- } else {
- if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
- goto two_byte_jmp;
- else
- goto five_byte_jmp;
- }
-
-two_byte_jmp:
- n_dspl -= 2;
-
- insn_buff[0] = 0xeb;
- insn_buff[1] = (s8)n_dspl;
- add_nops(insn_buff + 2, 3);
-
- repl_len = 2;
- goto done;
-
-five_byte_jmp:
- n_dspl -= 5;
-
- insn_buff[0] = 0xe9;
- *(s32 *)&insn_buff[1] = n_dspl;
-
- repl_len = 5;
-
-done:
-
- DPRINTK("final displ: 0x%08x, JMP 0x%lx",
- n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
-}
-
-/*
* optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
*
* @instr: instruction byte stream
@@ -254,6 +189,127 @@ static void __init_or_module noinline op
}
/*
+ * What we start with is:
+ *
+ * src_imm = target - src_next_ip (1)
+ *
+ * what we want is:
+ *
+ * dst_imm = target - dst_next_ip (2)
+ *
+ * so what we do is rework (1) as an expression for target like:
+ *
+ * target = src_imm + src_next_ip (1a)
+ *
+ * and substitute in (2) to get:
+ *
+ * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
+ *
+ * Now, since the instruction stream is 'identical' at src and dst (we copy
+ * after all) we can state that:
+ *
+ * src_next_ip = src + ip_offset
+ * dst_next_ip = dst + ip_offset (4)
+ *
+ * Substitute (4) in (3) and observe ip_offset being cancelled out to
+ * obtain:
+ *
+ * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
+ * = src_imm + src - dst + ip_offset - ip_offset
+ * = src_imm + src - dst (5)
+ *
+ * IOW, only the relative displacement of the code block matters.
+ */
+
+#define apply_reloc_n(n_, p_, d_) \
+ do { \
+ s32 v = *(s##n_ *)(p_); \
+ v += (d_); \
+ BUG_ON((v >> 31) != (v >> (n_-1))); \
+ *(s##n_ *)(p_) = (s##n_)v; \
+ } while (0)
+
+
+static __always_inline
+void apply_reloc(int n, void *ptr, uintptr_t diff)
+{
+ switch (n) {
+ case 1: apply_reloc_n(8, ptr, diff); break;
+ case 2: apply_reloc_n(16, ptr, diff); break;
+ case 4: apply_reloc_n(32, ptr, diff); break;
+ default: BUG();
+ }
+}
+
+static __always_inline
+bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
+{
+ u8 *target = src + offset;
+ /*
+ * If the target is inside the patched block, it's relative to the
+ * block itself and does not need relocation.
+ */
+ return (target < src || target > src + src_len);
+}
+
+static void __init_or_module noinline
+apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+{
+ for (int next, i = 0; i < len; len = next) {
+ struct insn insn;
+
+ if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
+ return;
+
+ next = i + insn.length;
+
+ switch (insn.opcode.bytes[0]) {
+ case 0x0f:
+ if (insn.opcode.bytes[1] < 0x80 ||
+ insn.opcode.bytes[1] > 0x8f)
+ break;
+
+ fallthrough; /* Jcc.d32 */
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ case JMP8_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case CALL_INSN_OPCODE:
+ if (need_reloc(next + insn.immediate.value, src, src_len)) {
+ apply_reloc(insn.immediate.nbytes,
+ buf + i + insn_offset_immediate(&insn),
+ src - dest);
+ }
+
+ /*
+ * Where possible, convert JMP.d32 into JMP.d8.
+ */
+ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
+ s32 imm = insn.immediate.value + src - dest;
+ if ((imm >> 31) == (imm >> 7)) {
+ buf[i+0] = JMP8_INSN_OPCODE;
+ buf[i+1] = (s8)imm;
+ for (int j = 2; j < insn.length; j++)
+ buf[i+j] = INT3_INSN_OPCODE;
+ }
+ }
+ break;
+
+ case 0x90: /* nop */
+ next = i + optimize_nops_range(buf, len, i);
+ continue;
+ }
+
+ if (insn_rip_relative(&insn)) {
+ if (need_reloc(next + insn.displacement.value, src, src_len)) {
+ apply_reloc(insn.displacement.nbytes,
+ buf + i + insn_offset_displacement(&insn),
+ src - dest);
+ }
+ }
+ }
+}
+
+/*
* Replace instructions with better alternatives for this CPU type. This runs
* before SMP is initialized to avoid SMP problems with self modifying code.
* This implies that asymmetric systems where APs have less capabilities than
@@ -296,8 +352,10 @@ void __init_or_module noinline apply_alt
* - feature not present but ALTINSTR_FLAG_INV is set to mean,
* patch if feature is *NOT* present.
*/
- if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
- goto next;
+ if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) {
+ optimize_nops(instr, a->instrlen);
+ continue;
+ }
DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
@@ -306,37 +364,20 @@ void __init_or_module noinline apply_alt
instr, instr, a->instrlen,
replacement, a->replacementlen);
- DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
- DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
- /*
- * 0xe8 is a relative jump; fix the offset.
- *
- * Instruction length is checked before the opcode to avoid
- * accessing uninitialized bytes for zero-length replacements.
- */
- if (a->replacementlen == 5 && *insn_buff == 0xe8) {
- *(s32 *)(insn_buff + 1) += replacement - instr;
- DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
- *(s32 *)(insn_buff + 1),
- (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
- }
-
- if (a->replacementlen && is_jmp(replacement[0]))
- recompute_jump(a, instr, replacement, insn_buff);
-
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
+ apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
text_poke_early(instr, insn_buff, insn_buff_sz);
-
-next:
- optimize_nops(instr, a->instrlen);
}
}
@@ -1334,6 +1375,35 @@ static noinline void __init int3_selftes
unregister_die_notifier(&int3_exception_nb);
}
+static __initdata int __alt_reloc_selftest_addr;
+
+__visible noinline void __init __alt_reloc_selftest(void *arg)
+{
+ WARN_ON(arg != &__alt_reloc_selftest_addr);
+}
+
+static noinline void __init alt_reloc_selftest(void)
+{
+ /*
+ * Tests apply_relocation().
+ *
+ * This has a relative immediate (CALL) in a place other than the first
+ * instruction and additionally on x86_64 we get a RIP-relative LEA:
+ *
+ * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
+ * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
+ *
+ * Getting this wrong will either crash and burn or tickle the WARN
+ * above.
+ */
+ asm_inline volatile (
+ ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
+ : /* output */
+ : [mem] "m" (__alt_reloc_selftest_addr)
+ : _ASM_ARG1
+ );
+}
+
void __init alternative_instructions(void)
{
int3_selftest();
@@ -1421,6 +1491,8 @@ void __init alternative_instructions(voi
restart_nmi();
alternatives_patched = 1;
+
+ alt_reloc_selftest();
}
/**
--- a/tools/objtool/arch/x86/special.c
+++ b/tools/objtool/arch/x86/special.c
@@ -42,13 +42,7 @@ bool arch_support_alt_relocation(struct
struct instruction *insn,
struct reloc *reloc)
{
- /*
- * The x86 alternatives code adjusts the offsets only when it
- * encounters a branch instruction at the very beginning of the
- * replacement group.
- */
- return insn->offset == special_alt->new_off &&
- (insn->type == INSN_CALL || is_jump(insn));
+ return true;
}
/*
On Mon, Feb 06, 2023 at 04:05:38PM +0100, Peter Zijlstra wrote:
>
> A little while ago someone (Kirill) ran into the whole 'alternatives don't
> do relocations nonsense' again and I got annoyed enough to actually look
> at the code.
>
> Since the whole alternative machinery already fully decodes the
> instructions it is simple enough to adjust immediates and displacement
> when needed. Specifically, the immediates for IP modifying instructions
> (JMP, CALL, Jcc) and the displacement for RIP-relative instructions.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/x86/kernel/alternative.c | 248 +++++++++++++++++++++++++--------------
> tools/objtool/arch/x86/special.c | 8 -
> 2 files changed, 161 insertions(+), 95 deletions(-)
>
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -123,71 +123,6 @@ extern s32 __smp_locks[], __smp_locks_en
> void text_poke_early(void *addr, const void *opcode, size_t len);
>
> /*
> - * Are we looking at a near JMP with a 1 or 4-byte displacement.
> - */
> -static inline bool is_jmp(const u8 opcode)
> -{
> - return opcode == 0xeb || opcode == 0xe9;
> -}
> -
> -static void __init_or_module
> -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
> -{
> - u8 *next_rip, *tgt_rip;
> - s32 n_dspl, o_dspl;
> - int repl_len;
> -
> - if (a->replacementlen != 5)
> - return;
> -
> - o_dspl = *(s32 *)(insn_buff + 1);
> -
> - /* next_rip of the replacement JMP */
> - next_rip = repl_insn + a->replacementlen;
> - /* target rip of the replacement JMP */
> - tgt_rip = next_rip + o_dspl;
> - n_dspl = tgt_rip - orig_insn;
> -
> - DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
> -
> - if (tgt_rip - orig_insn >= 0) {
> - if (n_dspl - 2 <= 127)
> - goto two_byte_jmp;
> - else
> - goto five_byte_jmp;
> - /* negative offset */
> - } else {
> - if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
> - goto two_byte_jmp;
> - else
> - goto five_byte_jmp;
> - }
> -
> -two_byte_jmp:
> - n_dspl -= 2;
> -
> - insn_buff[0] = 0xeb;
> - insn_buff[1] = (s8)n_dspl;
> - add_nops(insn_buff + 2, 3);
> -
> - repl_len = 2;
> - goto done;
> -
> -five_byte_jmp:
> - n_dspl -= 5;
> -
> - insn_buff[0] = 0xe9;
> - *(s32 *)&insn_buff[1] = n_dspl;
> -
> - repl_len = 5;
> -
> -done:
> -
> - DPRINTK("final displ: 0x%08x, JMP 0x%lx",
> - n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
> -}
> -
> -/*
> * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
> *
> * @instr: instruction byte stream
> @@ -254,6 +189,127 @@ static void __init_or_module noinline op
> }
>
> /*
> + * What we start with is:
> + *
> + * src_imm = target - src_next_ip (1)
> + *
> + * what we want is:
> + *
> + * dst_imm = target - dst_next_ip (2)
> + *
> + * so what we do is rework (1) as an expression for target like:
> + *
> + * target = src_imm + src_next_ip (1a)
> + *
> + * and substitute in (2) to get:
> + *
> + * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
> + *
> + * Now, since the instruction stream is 'identical' at src and dst (we copy
> + * after all) we can state that:
> + *
> + * src_next_ip = src + ip_offset
> + * dst_next_ip = dst + ip_offset (4)
> + *
> + * Substitute (4) in (3) and observe ip_offset being cancelled out to
> + * obtain:
> + *
> + * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
> + * = src_imm + src - dst + ip_offset - ip_offset
> + * = src_imm + src - dst (5)
> + *
> + * IOW, only the relative displacement of the code block matters.
> + */
> +
> +#define apply_reloc_n(n_, p_, d_) \
> + do { \
> + s32 v = *(s##n_ *)(p_); \
> + v += (d_); \
> + BUG_ON((v >> 31) != (v >> (n_-1))); \
> + *(s##n_ *)(p_) = (s##n_)v; \
> + } while (0)
> +
> +
> +static __always_inline
> +void apply_reloc(int n, void *ptr, uintptr_t diff)
> +{
> + switch (n) {
> + case 1: apply_reloc_n(8, ptr, diff); break;
> + case 2: apply_reloc_n(16, ptr, diff); break;
> + case 4: apply_reloc_n(32, ptr, diff); break;
> + default: BUG();
> + }
> +}
> +
> +static __always_inline
> +bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
> +{
> + u8 *target = src + offset;
> + /*
> + * If the target is inside the patched block, it's relative to the
> + * block itself and does not need relocation.
> + */
> + return (target < src || target > src + src_len);
> +}
> +
> +static void __init_or_module noinline
> +apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
> +{
> + for (int next, i = 0; i < len; len = next) {
'len = next'? I guess it suppose to be 'i = next', right? Otherwise it
hangs for me.
And if I fix this, I get this crash later on:
BUG: unable to handle page fault for address: ffffffff53fb2f2e
#PF: supervisor instruction fetch in kernel mode
#PF: error_code(0x0010) - not-present page
PGD 283b067 P4D 283b067 PUD 0
Oops: 0010 [#1] PREEMPT SMP PTI
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.2.0-rc7-00012-ga7fd2d79efc5-dirty #1736
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
RIP: 0010:0xffffffff53fb2f2e
Code: Unable to access opcode bytes at 0xffffffff53fb2f04.
RSP: 0000:ffffffff82803d58 EFLAGS: 00010206
RAX: 0000000000000bc9 RBX: 0000000000000005 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff826be1f9 RDI: ffffffff82662566
RBP: ffffffff82803d8a R08: 0000000000000001 R09: 0000000000000001
R10: 0000000000000005 R11: 00000000ffffffff R12: ffffffff836f1a37
R13: ffffffff8388c65d R14: ffffffff8388bef0 R15: ffffffff82803d8a
FS: 0000000000000000(0000) GS:ffff8881f9c00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffff53fb2f04 CR3: 0000000002836001 CR4: 0000000000370ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? text_poke_early+0x46/0x5d
? text_poke_early+0x3a/0x5d
? apply_alternatives+0x258/0x287
? lock_release+0x1d8/0x2e0
? _raw_spin_unlock_irqrestore+0x2b/0x50
? synchronize_rcu+0x1b0/0x1d0
? synchronize_rcu+0x16c/0x1d0
? synchronize_rcu+0x16c/0x1d0
? lockdep_hardirqs_on+0x79/0x100
? synchronize_rcu+0x16c/0x1d0
? lock_release+0x13a/0x2e0
? _raw_spin_unlock_irqrestore+0x2b/0x50
? _raw_spin_unlock_irqrestore+0x2b/0x50
? lockdep_hardirqs_on+0x79/0x100
? alternative_instructions+0x35/0xd1
? check_bugs+0xd6f/0xdc3
? start_kernel+0x674/0x6a8
? secondary_startup_64_no_verify+0xe0/0xeb
</TASK>
CR2: ffffffff53fb2f2e
---[ end trace 0000000000000000 ]---
It happens when applied on top of current Linus' tree. My config is
attached just in case.
Maybe you forgot to fold in some fixup, I donno.
--
Kiryl Shutsemau / Kirill A. Shutemov
On Tue, Feb 07, 2023 at 02:38:13PM +0300, Kirill A. Shutemov wrote:
> > +static void __init_or_module noinline
> > +apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
> > +{
> > + for (int next, i = 0; i < len; len = next) {
>
> 'len = next'? I guess it suppose to be 'i = next', right? Otherwise it
> hangs for me.
>
Yeah, last minute changes and not testing :/ Sorry about that. I'll try
and post an actually tested patch later today.
Also, Masami, how difficuly would it be to do insn_is_nop() that matches
most/all conventional NOP instructions?
That might make it more convenient to write a more generic
optimize_nops_range() -- it currently needs a single byte nop range and
one of the bugs in the patch as posted is caused by a multi-byte nop
(specifically a 0x66 prefixed 0x90).
On Tue, Feb 07, 2023 at 05:05:27PM +0100, Peter Zijlstra wrote:
> Also, Masami, how difficuly would it be to do insn_is_nop() that matches
> most/all conventional NOP instructions?
Just looked at that, while NOP and NOPL are relatively easy to do, the
32bit NOPs are a pain to decode since they're actual instructions with
the side-effect of not actually doing anything, like 'mov %reg,%reg' and
'lea (%reg),%reg'. Esp. that latter is popular since it has all the
various displacement encodings to grow it into a very long instruction.
I'll put it on the todo list I suppose, not something for now.
From: Peter Zijlstra
> Sent: 06 February 2023 15:06
...
> +#define apply_reloc_n(n_, p_, d_) \
> + do { \
> + s32 v = *(s##n_ *)(p_); \
You've added '_' suffixes to the parameters.
But these only refer to the body of the #define
so are never a problem.
OTOH the local 'v' will cause confusion if one of the
actual parameters is 'v'.
Which is why it is common to prefix locals with '_'.
(Which doesn't help with recursive expansions.)
Since this is only actually expended in the one .c file
it is unlikely to cause a problem.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)