2023-02-08 17:18:22

by Peter Zijlstra

Subject: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

This rewrite addresses two issues:

- it no longer hard-requires single-byte NOP runs; it now accepts
any NOP and NOPL encoded instruction (but not the more complicated
32bit NOPs).

- it writes a single 'instruction' replacement.

Specifically, the ORC unwinder relies on the tail NOP of an alternative
being a single instruction; in particular, it relies on the inner bytes
not being executed.

Once the max supported NOP length has been reached (currently 8, could
easily be extended to 11 on x86_64), switch to JMP.d8 and INT3 padding
to achieve the same result.

The ORC unwinder uses this guarantee in the analysis of
alternative/overlapping CFI state.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/kernel/alternative.c | 103 ++++++++++++++++++++++--------------------
1 file changed, 54 insertions(+), 49 deletions(-)

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -126,6 +126,30 @@ static void __init_or_module add_nops(vo
}
}

+static void __init_or_module add_nop(u8 *instr, unsigned int len)
+{
+ u8 *target = instr + len;
+
+ if (!len)
+ return;
+
+ if (len <= ASM_NOP_MAX) {
+ memcpy(instr, x86_nops[len], len);
+ return;
+ }
+
+ if (len < 128) {
+ __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
+ instr += JMP8_INSN_SIZE;
+ } else {
+ __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
+ instr += JMP32_INSN_SIZE;
+ }
+
+ for (;instr < target; instr++)
+ *instr = INT3_INSN_OPCODE;
+}
+
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
@@ -134,39 +158,32 @@ extern struct alt_instr __alt_instructio
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

-/*
- * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
- *
- * @instr: instruction byte stream
- * @instrlen: length of the above
- * @off: offset within @instr where the first NOP has been detected
- *
- * Return: number of NOPs found (and replaced).
- */
-static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
+static bool insn_is_nop(struct insn *insn)
{
- unsigned long flags;
- int i = off, nnops;
+ if (insn->opcode.bytes[0] == 0x90)
+ return true;

- while (i < instrlen) {
- if (instr[i] != 0x90)
- break;
+ if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
+ return true;

- i++;
- }
+ /* TODO: more nops */

- nnops = i - off;
+ return false;
+}

- if (nnops <= 1)
- return nnops;
+static int skip_nops(u8 *instr, int offset, int len)
+{
+ struct insn insn;

- local_irq_save(flags);
- add_nops(instr + off, nnops);
- local_irq_restore(flags);
+ for (; offset < len; offset += insn.length) {
+ if (insn_decode_kernel(&insn, &instr[offset]))
+ break;

- DUMP_BYTES(ALT, instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+ if (!insn_is_nop(&insn))
+ break;
+ }

- return nnops;
+ return offset;
}

/*
@@ -175,28 +192,19 @@ static __always_inline int optimize_nops
*/
static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
- struct insn insn;
- int i = 0;
+ for (int next, i = 0; i < len; i = next) {
+ struct insn insn;

- /*
- * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
- * ones.
- */
- for (;;) {
if (insn_decode_kernel(&insn, &instr[i]))
return;

- /*
- * See if this and any potentially following NOPs can be
- * optimized.
- */
- if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- i += optimize_nops_range(instr, len, i);
- else
- i += insn.length;
+ next = i + insn.length;

- if (i >= len)
- return;
+ if (insn_is_nop(&insn)) {
+ next = skip_nops(instr, next, len);
+ add_nop(instr + i, next - i);
+ DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, i, next);
+ }
}
}

@@ -317,13 +325,10 @@ apply_relocation(u8 *buf, size_t len, u8
}
}

-
- /*
- * See if this and any potentially following NOPs can be
- * optimized.
- */
- if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- next = i + optimize_nops_range(buf, len, i);
+ if (insn_is_nop(&insn)) {
+ next = skip_nops(buf, next, len);
+ add_nop(buf + i, next - i);
+ }
}
}
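
To make the padding scheme concrete, here is a small user-space sketch of
what add_nop() above emits for a given length (an illustration only: the
file name is made up, no kernel headers are used, and the x86_nops table
is abbreviated to its 64-bit 8-byte entry):

/* mock_add_nop.c - mirrors the logic of add_nop() above, illustration only */
#include <stdio.h>
#include <string.h>

#define ASM_NOP_MAX     8
#define JMP8_INSN_SIZE  2
#define JMP32_INSN_SIZE 5

/* BYTES_NOP8 on 64-bit: nopl 0x0(%rax,%rax,1) */
static const unsigned char nop8[8] = { 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };

static void mock_add_nop(unsigned char *instr, unsigned int len)
{
        unsigned char *target = instr + len;

        if (!len)
                return;

        if (len <= ASM_NOP_MAX) {
                /* real code: memcpy(instr, x86_nops[len], len); only len == 8 is right here */
                memcpy(instr, nop8, len);
                return;
        }

        if (len < 128) {
                instr[0] = 0xeb;                        /* JMP8_INSN_OPCODE */
                instr[1] = len - JMP8_INSN_SIZE;        /* rel8 to target */
                instr += JMP8_INSN_SIZE;
        } else {
                unsigned int rel = len - JMP32_INSN_SIZE;
                instr[0] = 0xe9;                        /* JMP32_INSN_OPCODE */
                memcpy(instr + 1, &rel, 4);             /* rel32, little-endian as on x86 */
                instr += JMP32_INSN_SIZE;
        }

        while (instr < target)                          /* never-executed tail: INT3 */
                *instr++ = 0xcc;
}

int main(void)
{
        unsigned char buf[20];
        unsigned int i;

        mock_add_nop(buf, sizeof(buf));
        for (i = 0; i < sizeof(buf); i++)
                printf("%02x ", buf[i]);        /* eb 12 cc cc ... - matches the dumps later in the thread */
        printf("\n");
        return 0;
}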





2023-02-08 19:52:19

by Andrew Cooper

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On 08/02/2023 5:10 pm, Peter Zijlstra wrote:
> This rewrite address two issues:
>
> - it no longer hard requires single byte nop runs, it now accepts
> any NOP and NOPL encoded instruction (but not the more complicated
> 32bit NOPs).
>
> - it writes a single 'instruction' replacement.
>
> Specifically, ORC unwinder relies on the tail NOP of an alternative to
> be a single instruction, in particular it relies on the inner bytes
> not being executed.
>
> Once we reach the max supported NOP length (currently 8, could easily
> be extended to 11 on x86_64), switches to JMP.d8 and INT3 padding to
> achieve the same result.
>
> The ORC unwinder uses this guarantee in the analysis of
> alternative/overlapping CFI state,
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

How lucky are you feeling for your game of performance roulette?

Unconditional jmps cost branch prediction these days, and won't be
successfully predicted until taken.

There is a point after which a jmp is more efficient than brute forcing
through a line of nops, and where that point lies is very uarch specific,
but it's not at a single nop...

Whether you care or not is a different matter, but at least be aware
that doing a jmp like this instead of e.g. 2 or 3 nops is contrary to
the prior advice given by the architects.

~Andrew
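
For anyone who wants to find that crossover point on their own machine, a
rough user-space harness along these lines is enough (a sketch only: the
file name and iteration count are arbitrary, build with something like
gcc -O2, and the numbers are very uarch- and context-specific):

/* nop_vs_jmp.c - straight 16-byte NOP run vs. a JMP.d8 over INT3 padding */
#include <stdio.h>
#include <time.h>

#define ITERS 200000000UL

static double seconds(struct timespec a, struct timespec b)
{
        return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) * 1e-9;
}

int main(void)
{
        struct timespec a, b, c;
        unsigned long i;

        clock_gettime(CLOCK_MONOTONIC, &a);
        for (i = 0; i < ITERS; i++)
                asm volatile(".rept 16\n\tnop\n\t.endr");               /* 16 single-byte NOPs */
        clock_gettime(CLOCK_MONOTONIC, &b);
        for (i = 0; i < ITERS; i++)
                asm volatile("jmp 1f\n\t.fill 14, 1, 0xcc\n1:");        /* eb 0e + 14x int3 */
        clock_gettime(CLOCK_MONOTONIC, &c);

        printf("16x nop: %.3fs\n", seconds(a, b));
        printf("jmp.d8 : %.3fs\n", seconds(b, c));
        return 0;
}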

2023-02-08 20:29:58

by Peter Zijlstra

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On Wed, Feb 08, 2023 at 07:52:04PM +0000, [email protected] wrote:
> On 08/02/2023 5:10 pm, Peter Zijlstra wrote:
> > This rewrite address two issues:
> >
> > - it no longer hard requires single byte nop runs, it now accepts
> > any NOP and NOPL encoded instruction (but not the more complicated
> > 32bit NOPs).
> >
> > - it writes a single 'instruction' replacement.
> >
> > Specifically, ORC unwinder relies on the tail NOP of an alternative to
> > be a single instruction, in particular it relies on the inner bytes
> > not being executed.
> >
> > Once we reach the max supported NOP length (currently 8, could easily
> > be extended to 11 on x86_64), switches to JMP.d8 and INT3 padding to
> > achieve the same result.
> >
> > The ORC unwinder uses this guarantee in the analysis of
> > alternative/overlapping CFI state,
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>
> How lucky are you feeling for your game of performance roulette?

Yeah, not very lucky... I've been talking about this with Boris for a bit
already.

> Unconditional jmps cost branch prediction these days, and won't be
> successfully predicted until taken.

IKR, insane, but that's what it is.

> There is a point after which a jmp is more efficient that brute forcing
> through a line of nops, and where this point is is very uarch specific,
> but it's not a single nop...

Yeah, I know, even very big nops. $I prefetch is good at straight lines
etc.

> Whether you care or not is a different matter, but at least be aware
> doing a jmp like this instead of e.g. 2 or 3 nops, is contrary to the
> prior advice given by the architects.

So, it is either this, or make the CFI conflict analysis done by objtool
a lot more 'interesting'. I figured I'd try this first.

As is, this is actually a correctness issue, not a performance issue.
It goes hand in hand with the patches:

https://lkml.kernel.org/r/[email protected]
https://lkml.kernel.org/r/[email protected]

Specifically, for short alternatives objtool adds a single NOP the size
of the difference at the end. The advantage is that you only get one
CFI entry for that thing and so don't have potential conflicts for any of
the inner bytes.

The alternative is making it multiple NOPs and ensuring objtool and the
kernel both agree on the length of them.
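
As a made-up illustration of that trade-off, take a 3-byte hole left
behind a replacement instruction; the two ways to fill it are:

/* one instruction boundary -> one CFI entry to track */
static const unsigned char pad_single[3]   = { 0x0f, 0x1f, 0x00 };      /* nopl (%rax) */

/* three instruction boundaries -> three CFI states to keep consistent */
static const unsigned char pad_multiple[3] = { 0x90, 0x90, 0x90 };      /* nop; nop; nop */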

As is, there were only a handful of NOPs that turned into this jmp.d8
thing on the defconfig+kvm_guest.config I built to test this -- it is by
no means a common thing. And about half of them would be gone by
extending the max NOP length to at least 10 or so.

In fact, I did that patch once, lemme see if I still have it...

2023-02-08 20:36:53

by Peter Zijlstra

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On Wed, Feb 08, 2023 at 09:29:23PM +0100, Peter Zijlstra wrote:

> As is, there were only a hand full of NOPs that turned into this jmp.d8
> thing on the defconfig+kvm_guest.config I build to test this -- it is by
> no means a common thing. And about half of them would be gone by
> extending the max nop length to at least 10 or so.
>
> In fact, I did that patch once, lemme see if I still have it...

Even still applies too, lemme go test that.

Also, there's a bunch of alternatives() where we explicitly put in that
short jmp instead of taking the nops, see for example:

$ git grep -i "alternative.*jmp" arch/x86/

---
Subject: x86_64: Longer NOPs
From: Peter Zijlstra <[email protected]>


Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/nops.h | 13 +++++++++++--
arch/x86/kernel/alternative.c | 8 ++++++++
2 files changed, 19 insertions(+), 2 deletions(-)

--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -34,6 +34,8 @@
#define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
#define BYTES_NOP8 0x3e,BYTES_NOP7

+#define ASM_NOP_MAX 8
+
#else

/*
@@ -47,6 +49,8 @@
* 6: osp nopl 0x00(%eax,%eax,1)
* 7: nopl 0x00000000(%eax)
* 8: nopl 0x00000000(%eax,%eax,1)
+ * 9: cs nopl 0x00000000(%eax,%eax,1)
+ * 10: osp cs nopl 0x00000000(%eax,%eax,1)
*/
#define BYTES_NOP1 0x90
#define BYTES_NOP2 0x66,BYTES_NOP1
@@ -56,6 +60,13 @@
#define BYTES_NOP6 0x66,BYTES_NOP5
#define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00
#define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+#define BYTES_NOP9 0x2e,BYTES_NOP8
+#define BYTES_NOP10 0x66,BYTES_NOP9
+
+#define ASM_NOP9 _ASM_BYTES(BYTES_NOP9)
+#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10)
+
+#define ASM_NOP_MAX 10

#endif /* CONFIG_64BIT */

@@ -68,8 +79,6 @@
#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7)
#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8)

-#define ASM_NOP_MAX 8
-
#ifndef __ASSEMBLY__
extern const unsigned char * const x86_nops[];
#endif
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -86,6 +86,10 @@ static const unsigned char x86nops[] =
BYTES_NOP6,
BYTES_NOP7,
BYTES_NOP8,
+#ifdef CONFIG_64BIT
+ BYTES_NOP9,
+ BYTES_NOP10,
+#endif
};

const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
@@ -99,6 +103,10 @@ const unsigned char * const x86_nops[ASM
x86nops + 1 + 2 + 3 + 4 + 5,
x86nops + 1 + 2 + 3 + 4 + 5 + 6,
x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+#ifdef CONFIG_64BIT
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+#endif
};

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
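
Spelled out, the two new (64-bit) entries are just the 8-byte NOPL with
prefixes in front; a user-space restatement, with made-up array names,
purely for reference:

/* BYTES_NOP8: 0f 1f 84 00 00 00 00 00 = nopl 0x0(%rax,%rax,1) */
static const unsigned char nop9[9] = {
        0x2e,                                   /* cs segment-override prefix */
        0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
};

static const unsigned char nop10[10] = {
        0x66, 0x2e,                             /* operand-size + cs prefixes */
        0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
};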

2023-02-08 20:44:25

by Peter Zijlstra

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On Wed, Feb 08, 2023 at 09:36:24PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 08, 2023 at 09:29:23PM +0100, Peter Zijlstra wrote:
>
> > As is, there were only a hand full of NOPs that turned into this jmp.d8
> > thing on the defconfig+kvm_guest.config I build to test this -- it is by
> > no means a common thing. And about half of them would be gone by
> > extending the max nop length to at least 10 or so.
> >
> > In fact, I did that patch once, lemme see if I still have it...
>
> Even still applies too, lemme go test that.

Works as advertised. Next up are eight 12-byte NOPs and eight 13-byte
NOPs; after that we're left with:

[ 11.584069] SMP alternatives: ffffffff82000095: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.588621] SMP alternatives: ffffffff810026dc: [0:44) optimized NOPs: eb 2a cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.590068] SMP alternatives: ffffffff820001f3: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.720069] SMP alternatives: ffffffff8200189f: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.731069] SMP alternatives: ffffffff820019ae: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.738069] SMP alternatives: ffffffff82001a4a: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.746069] SMP alternatives: ffffffff82001b2d: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.766069] SMP alternatives: ffffffff82001d14: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.770069] SMP alternatives: ffffffff82001dd5: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 11.779069] SMP alternatives: ffffffff82001f35: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc

Which is all pretty big...

GNU as seems to go up to .nops 12; 13 gives me a single-byte nop and a
12-byte one. That's also exactly the 3-prefix limit, after which some
uarchs start taking heavy decode penalties.
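
That is easy to reproduce (assuming a binutils new enough to have the
.nops directive; the file name is made up and the exact split may vary
between versions):

/* nops_probe.c - build with: gcc -c nops_probe.c && objdump -d nops_probe.o */
void nops_probe(void)
{
        asm volatile(".nops 8");        /* a single NOP */
        asm volatile(".nops 12");       /* where GNU as appears to top out (but see the correction below) */
        asm volatile(".nops 13");       /* split into more than one instruction */
}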

Let me go extend that patch of mine to cover 12. Also, instead of
printing hex addresses, lemme go print symbol+off things.

2023-02-08 20:45:55

by Peter Zijlstra

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:
> GNU as seems to go to .nops 12. 13 gives me a single byte nop and a 12.
> That's also exactly the 3 prefix limit, after which some uarchs start
> taking heavy decode penalties.

Oh noes, counting hard. It maxes out at 11, not 12. Sadness.

Instead, lemme go figure out what those alternatives actually are.

2023-02-08 21:02:22

by Peter Zijlstra

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:

> Works as advertised. Next up are 8 12 byte nopes and 8 13 byte nops,
> after that we're left with:

Yeah, we can ignore those, those are the retpoline thunks being patched.
They should be totally unused. These NOPs are after an indirect jmp and
as such pure speculation food.

Arguably I should just merge that patch that turns them into UD2 --
funneling and all that.

But there were some i386 issues... oh well.

2023-02-08 21:08:36

by Peter Zijlstra

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:

> [ 11.584069] SMP alternatives: ffffffff82000095: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [ 11.590068] SMP alternatives: ffffffff820001f3: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [ 11.720069] SMP alternatives: ffffffff8200189f: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [ 11.731069] SMP alternatives: ffffffff820019ae: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [ 11.738069] SMP alternatives: ffffffff82001a4a: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [ 11.746069] SMP alternatives: ffffffff82001b2d: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [ 11.766069] SMP alternatives: ffffffff82001d14: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [ 11.770069] SMP alternatives: ffffffff82001dd5: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [ 11.779069] SMP alternatives: ffffffff82001f35: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc

UNTRAIN_RET -- specifically RESET_CALL_DEPTH

> [ 11.588621] SMP alternatives: ffffffff810026dc: [0:44) optimized NOPs: eb 2a cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc

FILL_RETURN_BUFFER, and for a 44-byte NOP I'm thinking we ought to just jmp.

2023-02-08 21:21:34

by Peter Zijlstra

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On Wed, Feb 08, 2023 at 10:08:12PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:
>
> > [ 11.584069] SMP alternatives: ffffffff82000095: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [ 11.590068] SMP alternatives: ffffffff820001f3: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [ 11.720069] SMP alternatives: ffffffff8200189f: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [ 11.731069] SMP alternatives: ffffffff820019ae: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [ 11.738069] SMP alternatives: ffffffff82001a4a: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [ 11.746069] SMP alternatives: ffffffff82001b2d: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [ 11.766069] SMP alternatives: ffffffff82001d14: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [ 11.770069] SMP alternatives: ffffffff82001dd5: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [ 11.779069] SMP alternatives: ffffffff82001f35: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>
> UNTRAIN_RET -- specifically RESET_CALL_DEPTH

19: 48 c7 c0 80 00 00 00 mov $0x80,%rax
20: 48 c1 e0 38 shl $0x38,%rax
24: 65 48 89 04 25 00 00 00 00 mov %rax,%gs:0x0 29: R_X86_64_32S pcpu_hot+0x10

Is ofc an atrocity.

We can easily trim that by 5 bytes to:

0: b0 80 mov $0x80,%al
2: 48 c1 e0 38 shl $0x38,%rax
6: 65 48 89 04 25 00 00 00 00 mov %rax,%gs:0x0

Who cares about the top bytes, we're explicitly shifting them out
anyway. But that's still 15 bytes or so.

If it weren't for those pesky prefix penalties that would make exactly
one instruction :-)


diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index e04313e89f4f..be792f9407b5 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -84,7 +84,7 @@
movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);

#define RESET_CALL_DEPTH \
- mov $0x80, %rax; \
+ movb $0x80, %al; \
shl $56, %rax; \
movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
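
A quick user-space check of the "top bytes get shifted out" argument
(x86-64 GNU C extended asm; only the computed value is compared, the
store to pcpu_hot is left out; the file name is made up):

/* calldepth_val.c - both sequences leave 1UL << 63 in %rax */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long a, b;

        /* original: mov $0x80, %rax; shl $56, %rax */
        asm volatile("mov $0x80, %%rax\n\t"
                     "shl $56, %%rax"
                     : "=a" (a) : : "cc");

        /* trimmed: movb only writes %al, but shl $56 throws the upper 56 bits away anyway */
        asm volatile("mov $0xdeadbeefcafef00d, %%rax\n\t"       /* pretend %rax held garbage */
                     "movb $0x80, %%al\n\t"
                     "shl $56, %%rax"
                     : "=a" (b) : : "cc");

        assert(a == 1UL << 63 && b == 1UL << 63);
        printf("%#lx %#lx\n", a, b);
        return 0;
}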


2023-02-08 23:04:35

by David Laight

Subject: RE: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

From: [email protected]
> Sent: 08 February 2023 19:52
...
> Unconditional jmps cost branch prediction these days, and won't be
> successfully predicted until taken.

That is sad :-(

Don't they also use the BTB for the target address?
So even if predicted taken the cpu can speculate from
the wrong address??

David


2023-02-09 01:11:39

by Andrew Cooper

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On 08/02/2023 9:21 pm, Peter Zijlstra wrote:
> On Wed, Feb 08, 2023 at 10:08:12PM +0100, Peter Zijlstra wrote:
>> On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:
>>
>>> [ 11.584069] SMP alternatives: ffffffff82000095: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [ 11.590068] SMP alternatives: ffffffff820001f3: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [ 11.720069] SMP alternatives: ffffffff8200189f: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [ 11.731069] SMP alternatives: ffffffff820019ae: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [ 11.738069] SMP alternatives: ffffffff82001a4a: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [ 11.746069] SMP alternatives: ffffffff82001b2d: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [ 11.766069] SMP alternatives: ffffffff82001d14: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [ 11.770069] SMP alternatives: ffffffff82001dd5: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [ 11.779069] SMP alternatives: ffffffff82001f35: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>> UNTRAIN_RET -- specifically RESET_CALL_DEPTH
> 19: 48 c7 c0 80 00 00 00 mov $0x80,%rax
> 20: 48 c1 e0 38 shl $0x38,%rax
> 24: 65 48 89 04 25 00 00 00 00 mov %rax,%gs:0x0 29: R_X86_64_32S pcpu_hot+0x10
>
> Is ofc an atrocity.
>
> We can easily trim that by 5 bytes to:
>
> 0: b0 80 mov $0x80,%al
> 2: 48 c1 e0 38 shl $0x38,%rax
> 6: 65 48 89 04 25 00 00 00 00 mov %rax,%gs:0x0
>
> Who cares about the top bytes, we're explicitly shifting them out
> anyway. But that's still 15 bytes or so.
>
> If it weren't for those pesky prefix penalties that would make exactly
> one instruction :-)

Yeah, but then you're taking a merge penalty instead.

Given that you can't reduce it enough anyway, even though it's only a
4-byte reduction rather than 5, you're probably better off with:

0:   31 c0                   xor    %eax,%eax
2:   48 0f ba e8 3f          bts    $0x3f,%rax
7:   65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0

because of the zeroing idiom splitting these 3 instructions away from
the previous operation on rax.

It's a shame that x86 doesn't have a mov $imm8, %d32 form, because
loading 1 into a register is an incredibly common operation to perform.

~Andrew

2023-02-09 01:33:19

by Andrew Cooper

Subject: Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

On 08/02/2023 8:29 pm, Peter Zijlstra wrote:
> On Wed, Feb 08, 2023 at 07:52:04PM +0000, [email protected] wrote:
>> On 08/02/2023 5:10 pm, Peter Zijlstra wrote:
>>> This rewrite address two issues:
>>>
>>> - it no longer hard requires single byte nop runs, it now accepts
>>> any NOP and NOPL encoded instruction (but not the more complicated
>>> 32bit NOPs).
>>>
>>> - it writes a single 'instruction' replacement.
>>>
>>> Specifically, ORC unwinder relies on the tail NOP of an alternative to
>>> be a single instruction, in particular it relies on the inner bytes
>>> not being executed.
>>>
>>> Once we reach the max supported NOP length (currently 8, could easily
>>> be extended to 11 on x86_64), switches to JMP.d8 and INT3 padding to
>>> achieve the same result.
>>>
>>> The ORC unwinder uses this guarantee in the analysis of
>>> alternative/overlapping CFI state,
>>>
>>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>> How lucky are you feeling for your game of performance roulette?
> Yeah, not very lucky.. I've been talking about this with Boris for a bit
> already.
>
>> Unconditional jmps cost branch prediction these days, and won't be
>> successfully predicted until taken.
> IKR, insane, but that's what it is.

In terms of rationalising how things work, sure, but the resulting perf
numbers speak for themselves.

For the benefit of others reading this and not following what's going
on, modern x86 processors have branch prediction occurring pre-decode,
not post-decode, to reduce the misprediction latency.

Branch prediction operates using the current %rip and past history, and
selects the $I lines to send for decode.  The "decoded bytes disagree
with prediction metadata" feedback cycle is fast, but missing this
disagreement is the root cause of the Branch Type Confusion speculation
issue (a.k.a. AMD Retbleed).

~Andrew

2023-02-09 22:27:20

by David Laight

Subject: RE: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some

From: [email protected]
> Sent: 09 February 2023 01:11
...
> >> UNTRAIN_RET -- specifically RESET_CALL_DEPTH
> > 19: 48 c7 c0 80 00 00 00 mov $0x80,%rax
> > 20: 48 c1 e0 38 shl $0x38,%rax
> > 24: 65 48 89 04 25 00 00 00 00 mov %rax,%gs:0x0 29: R_X86_64_32S
> pcpu_hot+0x10
> >
> > Is ofc an atrocity.
> >
> > We can easily trim that by 5 bytes to:
> >
> > 0: b0 80 mov $0x80,%al
> > 2: 48 c1 e0 38 shl $0x38,%rax
> > 6: 65 48 89 04 25 00 00 00 00 mov %rax,%gs:0x0
> >
> > Who cares about the top bytes, we're explicitly shifting them out
> > anyway. But that's still 15 bytes or so.
> >
> > If it weren't for those pesky prefix penalties that would make exactly
> > one instruction :-)
>
> Yeah, but then you're taking a merge penalty instead.
>
> Given that you can't reduce enough anyway, while only a 4 byte reduction
> rather than 5, you're probably better off with:
>
> 0:   31 c0                   xor    %eax,%eax
> 2:   48 0f ba e8 3f          bts    $0x3f,%rax
> 7:   65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0
>
> because of the zeroing idiom splitting these 3 instructions away from
> the previous operation on rax.

How about:
31 c0           xor     %eax,%eax
f9              stc
48 d1 d8        rcr     $1,%rax
So 6 bytes total.
But that might be a partial dependency on flags.
(Although that isn't any worse than the xor.)
It is also a longer dependency chain - so the execution time
rather depends on what else is going on.

David
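
Same sanity check for both suggested sequences (x86-64 extended asm;
byte counts: xor+bts is 2+5 = 7 bytes, xor+stc+rcr is 2+1+3 = 6):

/* one_shl63.c - both proposals compute 1UL << 63 as well */
#include <assert.h>

int main(void)
{
        unsigned long b, c;

        /* zeroing idiom, then set bit 63 directly */
        asm volatile("xor %%eax, %%eax\n\t"
                     "bts $63, %%rax"
                     : "=a" (b) : : "cc");

        /* zeroing idiom, set CF, rotate it into the top bit */
        asm volatile("xor %%eax, %%eax\n\t"
                     "stc\n\t"
                     "rcr $1, %%rax"
                     : "=a" (c) : : "cc");

        assert(b == 1UL << 63 && c == 1UL << 63);
        return 0;
}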


2023-05-13 13:06:05

by tip-bot2 for Zqiang

Subject: [tip: x86/alternatives] x86/alternative: Rewrite optimize_nops() some

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID: 6c480f22212826425b57932f09b1f0abbec85485
Gitweb: https://git.kernel.org/tip/6c480f22212826425b57932f09b1f0abbec85485
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 08 Feb 2023 18:10:53 +01:00
Committer: Borislav Petkov (AMD) <[email protected]>
CommitterDate: Thu, 11 May 2023 17:33:36 +02:00

x86/alternative: Rewrite optimize_nops() some

Address two issues:

- it no longer hard requires single byte NOP runs - now it accepts any
NOP and NOPL encoded instruction (but not the more complicated 32bit
NOPs).

- it writes a single 'instruction' replacement.

Specifically, ORC unwinder relies on the tail NOP of an alternative to
be a single instruction. In particular, it relies on the inner bytes not
being executed.

Once the max supported NOP length has been reached (currently 8, could easily
be extended to 11 on x86_64), switch to JMP.d8 and INT3 padding to
achieve the same result.

Objtool uses this guarantee in the analysis of alternative/overlapping
CFI state for the ORC unwinder data. Every instruction edge gets a CFI
state and the more instructions the larger the chance of conflicts.

[ bp:
- Add a comment over add_nop() to explain why it does it this way
- Make add_nops() PARAVIRT only as it is used solely there now ]

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Borislav Petkov (AMD) <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/kernel/alternative.c | 129 ++++++++++++++++++---------------
1 file changed, 71 insertions(+), 58 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 28eb1d0..839bc6d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -113,17 +113,35 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};

-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
+/*
+ * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
+ * for every single-byte NOP, try to generate the maximally available NOP of
+ * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
+ * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
+ * *jump* over instead of executing long and daft NOPs.
+ */
+static void __init_or_module add_nop(u8 *instr, unsigned int len)
{
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, x86_nops[noplen], noplen);
- insns += noplen;
- len -= noplen;
+ u8 *target = instr + len;
+
+ if (!len)
+ return;
+
+ if (len <= ASM_NOP_MAX) {
+ memcpy(instr, x86_nops[len], len);
+ return;
}
+
+ if (len < 128) {
+ __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
+ instr += JMP8_INSN_SIZE;
+ } else {
+ __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
+ instr += JMP32_INSN_SIZE;
+ }
+
+ for (;instr < target; instr++)
+ *instr = INT3_INSN_OPCODE;
}

extern s32 __retpoline_sites[], __retpoline_sites_end[];
@@ -134,39 +152,32 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

-/*
- * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
- *
- * @instr: instruction byte stream
- * @instrlen: length of the above
- * @off: offset within @instr where the first NOP has been detected
- *
- * Return: number of NOPs found (and replaced).
- */
-static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
+static bool insn_is_nop(struct insn *insn)
{
- unsigned long flags;
- int i = off, nnops;
+ if (insn->opcode.bytes[0] == 0x90)
+ return true;

- while (i < instrlen) {
- if (instr[i] != 0x90)
- break;
+ if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
+ return true;

- i++;
- }
+ /* TODO: more nops */

- nnops = i - off;
+ return false;
+}

- if (nnops <= 1)
- return nnops;
+static int skip_nops(u8 *instr, int offset, int len)
+{
+ struct insn insn;

- local_irq_save(flags);
- add_nops(instr + off, nnops);
- local_irq_restore(flags);
+ for (; offset < len; offset += insn.length) {
+ if (insn_decode_kernel(&insn, &instr[offset]))
+ break;

- DUMP_BYTES(ALT, instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+ if (!insn_is_nop(&insn))
+ break;
+ }

- return nnops;
+ return offset;
}

/*
@@ -175,28 +186,19 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
*/
static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
- struct insn insn;
- int i = 0;
+ for (int next, i = 0; i < len; i = next) {
+ struct insn insn;

- /*
- * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
- * ones.
- */
- for (;;) {
if (insn_decode_kernel(&insn, &instr[i]))
return;

- /*
- * See if this and any potentially following NOPs can be
- * optimized.
- */
- if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- i += optimize_nops_range(instr, len, i);
- else
- i += insn.length;
+ next = i + insn.length;

- if (i >= len)
- return;
+ if (insn_is_nop(&insn)) {
+ next = skip_nops(instr, next, len);
+ add_nop(instr + i, next - i);
+ DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, i, next);
+ }
}
}

@@ -323,13 +325,10 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
}

-
- /*
- * See if this and any potentially following NOPs can be
- * optimized.
- */
- if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- next = i + optimize_nops_range(buf, len, i);
+ if (insn_is_nop(&insn)) {
+ next = skip_nops(buf, next, len);
+ add_nop(buf + i, next - i);
+ }
}
}

@@ -1289,6 +1288,20 @@ int alternatives_text_reserved(void *start, void *end)
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
+
+/* Use this to add nops to a buffer, then text_poke the whole buffer. */
+static void __init_or_module add_nops(void *insns, unsigned int len)
+{
+ while (len > 0) {
+ unsigned int noplen = len;
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+ memcpy(insns, x86_nops[noplen], noplen);
+ insns += noplen;
+ len -= noplen;
+ }
+}
+
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
struct paravirt_patch_site *end)
{