2018-01-11 11:37:43

by Woodhouse, David

Subject: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

In accordance with the Intel and AMD documentation, we need to overwrite
all entries in the RSB on exiting a guest, to prevent malicious branch
target predictions from affecting the host kernel. This is needed both
for retpoline and for IBRS.

Signed-off-by: David Woodhouse <[email protected]>
---
v2: Reduce the size of the ALTERNATIVE insns, fix .align (again)!
    Sent in private email for testing, hence this second public post is
    v2.1.
v2.1: Add CONFIG_RETPOLINE around RSB stuffing

Sorry, Josh, objtool broke again! :)

arch/x86/include/asm/nospec-branch.h | 76 ++++++++++++++++++++++++++++++++++++
arch/x86/kvm/svm.c | 4 ++
arch/x86/kvm/vmx.c | 4 ++
3 files changed, 84 insertions(+)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 7d70ea9..b55ff79 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -7,6 +7,50 @@
#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>

+/*
+ * Fill the CPU return stack buffer.
+ *
+ * Each entry in the RSB, if used for a speculative 'ret', contains an
+ * infinite 'pause; jmp' loop to capture speculative execution.
+ *
+ * This is required in various cases for retpoline and IBRS-based
+ * mitigations for the Spectre variant 2 vulnerability. Sometimes to
+ * eliminate potentially bogus entries from the RSB, and sometimes
+ * purely to ensure that it doesn't get empty, which on some CPUs would
+ * allow predictions from other (unwanted!) sources to be used.
+ *
+ * We define a CPP macro such that it can be used from both .S files and
+ * inline assembly. It's possible to do a .macro and then include that
+ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
+ */
+
+#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
+#define RSB_FILL_LOOPS 16 /* To avoid underflow */
+
+#define __FILL_RETURN_BUFFER_PREP(reg, nr) \
+ mov $(nr/2), reg
+
+#define __FILL_RETURN_BUFFER(reg, nr, sp, uniq) \
+ .align 16; \
+.Ldo_call1_ ## uniq: \
+ call .Ldo_call2_ ## uniq; \
+.Ltrap1_ ## uniq: \
+ pause; \
+ jmp .Ltrap1_ ## uniq; \
+ .align 16; \
+.Ldo_call2_ ## uniq: \
+ call .Ldo_loop_ ## uniq; \
+.Ltrap2_ ## uniq: \
+ pause; \
+ jmp .Ltrap2_ ## uniq; \
+ .align 16; \
+.Ldo_loop_ ## uniq: \
+ dec reg; \
+ jnz .Ldo_call1_ ## uniq; \
+ add $(BITS_PER_LONG/8) * nr, sp; \
+.Lskip_rsb_ ## uniq:;
+
+
#ifdef __ASSEMBLY__

/*
@@ -61,6 +105,19 @@
#endif
.endm

+ /*
+ * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+ * monstrosity above, manually.
+ */
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
+#ifdef CONFIG_RETPOLINE
+ ALTERNATIVE "jmp .Lskip_rsb_\@", \
+ __stringify(__FILL_RETURN_BUFFER_PREP(\reg,\nr)), \
+ \ftr
+ __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP,\@)
+#endif
+.endm
+
#else /* __ASSEMBLY__ */

#if defined(CONFIG_X86_64) && defined(RETPOLINE)
@@ -115,5 +172,24 @@ enum spectre_v2_mitigation {
SPECTRE_V2_IBRS,
};

+/*
+ * On VMEXIT we must ensure that no RSB predictions learned in the guest
+ * can be followed in the host, by overwriting the RSB completely. Both
+ * retpoline and IBRS mitigations for Spectre v2 need this; only on future
+ * CPUs with IBRS_ATT *might* it be avoided.
+ */
+static inline void vmexit_fill_RSB(void)
+{
+#ifdef CONFIG_RETPOLINE
+ unsigned long loops = RSB_CLEAR_LOOPS / 2;
+
+ asm volatile (ALTERNATIVE("jmp .Lskip_rsb__%=",
+ __stringify(__FILL_RETURN_BUFFER_PREP(%0, RSB_CLEAR_LOOPS)),
+ X86_FEATURE_RETPOLINE)
+ __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1, _%=))
+ : "=&r" (loops), ASM_CALL_CONSTRAINT
+ : "r" (loops) : "memory" );
+#endif
+}
#endif /* __ASSEMBLY__ */
#endif /* __NOSPEC_BRANCH_H__ */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e68f0b..2744b973 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -45,6 +45,7 @@
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
+#include <asm/nospec-branch.h>

#include <asm/virtext.h>
#include "trace.h"
@@ -4985,6 +4986,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);

+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
#ifdef CONFIG_X86_64
wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 62ee436..d1e25db 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -50,6 +50,7 @@
#include <asm/apic.h>
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
+#include <asm/nospec-branch.h>

#include "trace.h"
#include "pmu.h"
@@ -9403,6 +9404,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);

+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (debugctlmsr)
update_debugctlmsr(debugctlmsr);
--
2.7.4
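
For readers following the asm side: the FILL_RETURN_BUFFER macro above is
meant to be invoked from entry/exit assembly roughly as below (an
illustrative call site under this patch's definitions; the choice of
scratch register here is hypothetical, not a hunk from the patch):

        /* Overwrite all 32 RSB entries, using RAX/EAX as the loop
         * counter, when X86_FEATURE_RETPOLINE is set: */
        FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE

Each loop iteration performs two calls, so nr/2 iterations push nr return
addresses (the planted RSB entries), each pointing at a pause/jmp trap;
the final add then rewinds the stack pointer past those nr stacked return
addresses in one step.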


2018-01-11 12:26:23

by Peter Zijlstra

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, Jan 11, 2018 at 11:37:18AM +0000, David Woodhouse wrote:
> In accordance with the Intel and AMD documentation, we need to overwrite
> all entries in the RSB on exiting a guest, to prevent malicious branch
> target predictions from affecting the host kernel. This is needed both
> for retpoline and for IBRS.
>
> Signed-off-by: David Woodhouse <[email protected]>

Tested this on my AMD Interlagos, seems to work.

Tested-by: Peter Zijlstra (Intel) <[email protected]>

2018-01-11 14:21:08

by Josh Poimboeuf

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, Jan 11, 2018 at 11:37:18AM +0000, David Woodhouse wrote:
> In accordance with the Intel and AMD documentation, we need to overwrite
> all entries in the RSB on exiting a guest, to prevent malicious branch
> target predictions from affecting the host kernel. This is needed both
> for retpoline and for IBRS.
>
> Signed-off-by: David Woodhouse <[email protected]>
>
> [...]
>
> + /*
> + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
> + * monstrosity above, manually.
> + */
> +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
> +#ifdef CONFIG_RETPOLINE
> + ALTERNATIVE "jmp .Lskip_rsb_\@", \
> + __stringify(__FILL_RETURN_BUFFER_PREP(\reg,\nr)), \
> + \ftr
> + __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP,\@)
> +#endif
> +.endm

This seems weird. I liked v1 a lot better. What's the problem with
patching in the whole thing?

Also, if you go back to v1, it should be an easy objtool fix, just add
ANNOTATE_NOSPEC_ALTERNATIVE in front of it.

--
Josh

2018-01-11 14:28:49

by David Woodhouse

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, 2018-01-11 at 08:20 -0600, Josh Poimboeuf wrote:
>
> This seems weird.  I liked v1 a lot better.  What's the problem with
> patching in the whole thing?
>
> Also, if you go back to v1, it should be an easy objtool fix, just add
> ANNOTATE_NOSPEC_ALTERNATIVE in front of it.

The objection was that I was patching in a fairly long set of
instructions. I confess I don't actually know why that's a problem, but
once I looked at it I realised the alignment was broken again:
alignment in the altinstr section doesn't necessarily mean alignment
once it's copied into place over the oldinstr.

I took a quick look at doing it out-of-line and calling it... and
exporting it... and defining it to take *one* register rather than
being a macro... then ditched that approach and ended up with what's in
v2.
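
To illustrate the pitfall (a sketch with hypothetical addresses; the
.align constrains where the replacement bytes sit inside the
.altinstr_replacement section, not where they are copied to):

        .Lold:  jmp .Lend               /* oldinstr, at some arbitrary
                                           address, say 0x...53 */
        .pushsection .altinstr_replacement
        .align 16                       /* replacement assembled 16-byte
                                           aligned within this section,
                                           say at 0x...40 ... */
        .Lrepl: call .Ldo_call2_1       /* ... but apply_alternatives()
                                           memcpys these bytes over .Lold,
                                           so at run time they execute at
                                           0x...53 and the alignment is
                                           lost */
        .popsection
        .Lend: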



2018-01-11 14:32:18

by Peter Zijlstra

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, Jan 11, 2018 at 02:28:32PM +0000, David Woodhouse wrote:
> On Thu, 2018-01-11 at 08:20 -0600, Josh Poimboeuf wrote:
> >
> > This seems weird.  I liked v1 a lot better.  What's the problem with
> > patching in the whole thing?
> >
> > Also, if you go back to v1, it should be an easy objtool fix, just add
> > ANNOTATE_NOSPEC_ALTERNATIVE in front of it.
>
> The objection was that I was patching in a fairly long set of
> instructions. I confess I don't actually know why that's a problem,

You get a giant string of NOPs; a single jmp should be faster.

2018-01-11 14:54:12

by David Woodhouse

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, 2018-01-11 at 15:32 +0100, Peter Zijlstra wrote:
> On Thu, Jan 11, 2018 at 02:28:32PM +0000, David Woodhouse wrote:
> >
> > On Thu, 2018-01-11 at 08:20 -0600, Josh Poimboeuf wrote:
> > >
> > >
> > > This seems weird.  I liked v1 a lot better.  What's the problem with
> > > patching in the whole thing?
> > >
> > > Also, if you go back to v1, it should be an easy objtool fix, just add
> > > ANNOTATE_NOSPEC_ALTERNATIVE in front of it.
> > The objection was that I was patching in a fairly long set of
> > instructions. I confess I don't actually know why that's a problem,
>
> You get a giant string of NOPs; a single jmp should be faster.

How about this one then (with ANNOTATE_NOSPEC_ALTERNATIVE):

-       asm volatile (ALTERNATIVE("",
+       asm volatile (ALTERNATIVE("jmp " alt_end_marker "f",
                                  __stringify(__FILL_RETURN_BUFFER(%0, %1, _%=)),
                                  X86_FEATURE_RETPOLINE)
                      : "=r" (dummy), ASM_CALL_CONSTRAINT : : "memory" );



2018-01-11 14:59:23

by Josh Poimboeuf

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, Jan 11, 2018 at 02:28:32PM +0000, David Woodhouse wrote:
> On Thu, 2018-01-11 at 08:20 -0600, Josh Poimboeuf wrote:
> >
> > This seems weird.  I liked v1 a lot better.  What's the problem with
> > patching in the whole thing?
> >
> > Also, if you go back to v1, it should be an easy objtool fix, just add
> > ANNOTATE_NOSPEC_ALTERNATIVE in front of it.
>
> The objection was that I was patching in a fairly long set of
> instructions. I confess I don't actually know why that's a problem, but
> once I looked at it I realised the alignment was broken again:
> alignment in the altinstr section doesn't necessarily mean alignment
> once it's copied into place over the oldinstr.

I'd *much* rather have things be consistent, and put all the crap code
behind specific features, like the retpolines already do. It makes the
source code easier to read, object code easier to read, and, ahem, makes
objtool's life a lot easier.

In fact, Boris mentioned on IRC that I could remove the
ANNOTATE_NOSPEC_ALTERNATIVE annotations, and instead have objtool detect
the nospec stuff by looking at the alternative CPU feature bit, which it
already knows how to do. So it would be really helpful to guard all
that nasty stuff behind alternatives.

I may have missed previous discussions about alignment, but if we
*really* needed alignment, maybe that could be accomplished by splitting
the alternative into two alternatives, with an .align directive as the
original instruction for the second one.

But we shouldn't even worry about alignment unless there's a real and
measurable reason to do so.
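
For concreteness, one reading of that split, as a sketch (never merged in
this form): the second ALTERNATIVE carries only an .align as its
oldinstr, so the patch site itself starts on a 16-byte boundary, and
whatever ends up there, padding NOPs or a patched-in altinstr, begins
aligned:

        ALTERNATIVE "jmp .Lskip_rsb_\@", \
                    __stringify(__FILL_RETURN_BUFFER_PREP(\reg,\nr)), \ftr
        ALTERNATIVE ".align 16", "", \ftr
        __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP,\@)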

> exporting it... and defining it to take *one* register rather than
> being a macro... then ditched that approach and ended up with what's in
> v2.

Instead of exporting it you could keep it in the header file and give it
a "noinline" annotation so that it's out-of-line but still static.

--
Josh

2018-01-11 15:04:46

by Josh Poimboeuf

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, Jan 11, 2018 at 02:53:57PM +0000, David Woodhouse wrote:
> On Thu, 2018-01-11 at 15:32 +0100, Peter Zijlstra wrote:
> > On Thu, Jan 11, 2018 at 02:28:32PM +0000, David Woodhouse wrote:
> > >
> > > On Thu, 2018-01-11 at 08:20 -0600, Josh Poimboeuf wrote:
> > > >
> > > >
> > > > This seems weird.  I liked v1 a lot better.  What's the problem with
> > > > patching in the whole thing?
> > > >
> > > > Also, if you go back to v1, it should be an easy objtool fix, just add
> > > > ANNOTATE_NOSPEC_ALTERNATIVE in front of it.
> > > The objection was that I was patching in a fairly long set of
> > > instructions. I confess I don't actually know why that's a problem,
> >
> > You get a giant string of NOPs; a single jmp should be faster.
>
> How about this one then (with ANNOTATE_NOSPEC_ALTERNATIVE):
>
> -       asm volatile (ALTERNATIVE("",
> +       asm volatile (ALTERNATIVE("jmp " alt_end_marker "f",
>                                   __stringify(__FILL_RETURN_BUFFER(%0, %1, _%=)),
>                                   X86_FEATURE_RETPOLINE)
>                       : "=r" (dummy), ASM_CALL_CONSTRAINT : : "memory" );

Looks good to me.

Another IRC discussion was that Boris may eventually add a feature to
the alternatives code to automatically insert such a jump if there are a
lot of nops.

--
Josh

2018-01-11 15:22:46

by Brian Gerst

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, Jan 11, 2018 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, Jan 11, 2018 at 02:28:32PM +0000, David Woodhouse wrote:
>> On Thu, 2018-01-11 at 08:20 -0600, Josh Poimboeuf wrote:
>> >
>> > This seems weird. I liked v1 a lot better. What's the problem with
>> > patching in the whole thing?
>> >
>> > Also, if you go back to v1, it should be an easy objtool fix, just add
>> > ANNOTATE_NOSPEC_ALTERNATIVE in front of it.
>>
>> The objection was that I was patching in a fairly long set of
>> instructions. I confess I don't actually know why that's a problem,
>
> You get a giant string of NOPs; a single jmp should be faster.

That could be handled in add_nops(), where, if the padding is over a
certain threshold, it changes to a JMP.
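
A sketch of that idea against add_nops() in arch/x86/kernel/alternative.c
(the threshold value and the wrapper name here are made up):

        #define NOP_JMP_THRESHOLD       16      /* hypothetical cutoff */

        static void add_nops_or_jmp(void *insns, unsigned int len)
        {
                u8 *b = insns;

                /* For long padding, emit a 5-byte jmp rel32 over the rest
                 * rather than leaving a wall of NOPs to execute. */
                if (len > NOP_JMP_THRESHOLD) {
                        b[0] = 0xe9;                    /* jmp rel32 opcode */
                        *(s32 *)(b + 1) = len - 5;      /* skip to end of pad */
                        b += 5;
                        len -= 5;
                }
                add_nops(b, len);       /* existing helper: optimal NOP fill */
        }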

--
Brian Gerst

2018-01-11 15:24:02

by David Woodhouse

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, 2018-01-11 at 10:22 -0500, Brian Gerst wrote:
> On Thu, Jan 11, 2018 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
> > On Thu, Jan 11, 2018 at 02:28:32PM +0000, David Woodhouse wrote:
> >> On Thu, 2018-01-11 at 08:20 -0600, Josh Poimboeuf wrote:
> >> >
> >> > This seems weird.  I liked v1 a lot better.  What's the problem with
> >> > patching in the whole thing?
> >> >
> >> > Also, if you go back to v1, it should be an easy objtool fix, just add
> >> > ANNOTATE_NOSPEC_ALTERNATIVE in front of it.
> >>
> >> The objection was that I was patching in a fairly long set of
> >> instructions. I confess I don't actually know why that's a problem,
> >
> > You get a giant string of NOPs; a single jmp should be faster.
>
> That could be handled in add_nops(), where, if the padding is over a
> certain threshold, it changes to a JMP.

Currently that only processes the *altinstr*, doesn't it? Not the
oldinstr?



2018-01-11 15:28:26

by Brian Gerst

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, Jan 11, 2018 at 10:23 AM, David Woodhouse <[email protected]> wrote:
> On Thu, 2018-01-11 at 10:22 -0500, Brian Gerst wrote:
>> On Thu, Jan 11, 2018 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
>> > On Thu, Jan 11, 2018 at 02:28:32PM +0000, David Woodhouse wrote:
>> >> On Thu, 2018-01-11 at 08:20 -0600, Josh Poimboeuf wrote:
>> >> >
>> >> > This seems weird. I liked v1 a lot better. What's the problem with
>> >> > patching in the whole thing?
>> >> >
>> >> > Also, if you go back to v1, it should be an easy objtool fix, just add
>> >> > ANNOTATE_NOSPEC_ALTERNATIVE in front of it.
>> >>
>> >> The objection was that I was patching in a fairly long set of
>> >> instructions. I confess I don't actually know why that's a problem,
>> >
>> > You get a giant string of NOPs; a single jmp should be faster.
>>
>> That could be handled in add_nops(), where, if the padding is over a
>> certain threshold, it changes to a JMP.
>
> Currently that only processes the *altinstr*, doesn't it? Not the
> oldinstr?

It does both.

--
Brian Gerst

2018-01-11 16:10:53

by David Woodhouse

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

On Thu, 2018-01-11 at 09:04 -0600, Josh Poimboeuf wrote:
>
> > How about this one then (with ANNOTATE_NOSPEC_ALTERNATIVE):
> > 
> > -       asm volatile (ALTERNATIVE("",
> > +       asm volatile (ALTERNATIVE("jmp " alt_end_marker "f",
> >                                   __stringify(__FILL_RETURN_BUFFER(%0, %1, _%=)),
> >                                   X86_FEATURE_RETPOLINE)
> >                       : "=r" (dummy), ASM_CALL_CONSTRAINT : : "memory" );
>
> Looks good to me.
>
> Another IRC discussion was that Boris may eventually add a feature to
> the alternatives code to automatically insert such a jump if there are a
> lot of nops.

OK, v3 sent out with that. I've just manually put in a jump round it
(less hackishly than the alt_end_marker one) in the oldinstr for now.

This wants rolling into your objtool fixes:

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -119,7 +119,8 @@
   */
 .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
 #ifdef CONFIG_RETPOLINE
- ALTERNATIVE "jmp .Lskip_rsb_\@", \
+ ANNOTATE_NOSPEC_ALTERNATIVE
+ ALTERNATIVE "jmp .Lskip_rsb_\@", \
  __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP,\@)) \
  \ftr
 .Lskip_rsb_\@:
@@ -149,6 +150,7 @@
 # define THUNK_TARGET(addr) [thunk_target] "r" (addr)
 
 #elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
+# define ANNOTATE_NOSPEC_ALTERNATIVE
 /*
  * For i386 we use the original ret-equivalent retpoline, because
  * otherwise we'll run out of registers. We don't care about CET
@@ -201,7 +203,8 @@ static inline void vmexit_fill_RSB(void)
 #ifdef CONFIG_RETPOLINE
  unsigned long loops = RSB_CLEAR_LOOPS / 2;
 
- asm volatile (ALTERNATIVE("jmp .Lskip_rsb_%=",
+ asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
+       ALTERNATIVE("jmp .Lskip_rsb_%=",
    __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1, _%=)),
    X86_FEATURE_RETPOLINE)
        ".Lskip_rsb_%=:"



2018-01-27 04:24:55

by Konrad Rzeszutek Wilk

Subject: Re: [PATCH v2.1] x86/retpoline: Fill return stack buffer on vmexit

> Looks good to me.
>
> Another IRC discussion was that Boris may eventually add a feature to
> the alternatives code to automatically insert such a jump if there are a
> lot of nops.

I vaguely recall that NOPs are skipped by the CPU, so it shouldn't
matter compared to, say, jumping over them. But is that not the case
anymore? Or is it that we have so many NOPs that the prefetcher runs
out of space and stalls?