2023-08-09 07:57:10

by Peter Zijlstra

Subject: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

Use the existing configurable return thunk. There is absolutely no
justification for having created this __x86_return_thunk alternative.

To clarify, the whole thing looks like:

Zen3/4 does:

srso_alias_untrain_ret:
nop2
lfence
jmp srso_alias_return_thunk
int3

srso_alias_safe_ret: // aliases srso_alias_untrain_ret just so
add $8, %rsp
ret
int3

srso_alias_return_thunk:
call srso_alias_safe_ret
ud2

While Zen1/2 does:

srso_untrain_ret:
movabs $foo, %rax
lfence
call srso_safe_ret (jmp srso_return_thunk ?)
int3

srso_safe_ret: // embedded in movabs immediate
add $8,%rsp
ret
int3

srso_return_thunk:
call srso_safe_ret
ud2

While retbleed does:

zen_untrain_ret:
test $0xcc, %bl
lfence
jmp zen_return_thunk
int3

zen_return_thunk: // embedded in the test instruction
ret
int3

Where Zen1/2 flush the BTB using the instruction decoder trick
(test, movabs), Zen3/4 use instruction aliasing. SRSO adds RSB (RAP in
AMD speak) stuffing to force a return mis-predict.

That is, the AMD retbleed is a form of Speculative-Type-Confusion
where the branch predictor is trained to use the BTB to predict the
RET address, while AMD inception/SRSO is a form of
Speculative-Type-Confusion where another instruction is trained to be
treated like a CALL instruction and poison the RSB (RAP).

Pick one of three options at boot.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/nospec-branch.h | 4 +++
arch/x86/kernel/cpu/bugs.c | 7 ++++--
arch/x86/kernel/vmlinux.lds.S | 2 -
arch/x86/lib/retpoline.S | 37 ++++++++++++++++++++++++-----------
4 files changed, 36 insertions(+), 14 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -342,9 +342,13 @@ extern retpoline_thunk_t __x86_indirect_
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];

extern void __x86_return_thunk(void);
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
+
extern void zen_untrain_ret(void);
extern void srso_untrain_ret(void);
extern void srso_untrain_ret_alias(void);
+
extern void entry_ibpb(void);

extern void (*x86_return_thunk)(void);
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -2305,10 +2305,13 @@ static void __init srso_select_mitigatio
*/
setup_force_cpu_cap(X86_FEATURE_RETHUNK);

- if (boot_cpu_data.x86 == 0x19)
+ if (boot_cpu_data.x86 == 0x19) {
setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
- else
+ x86_return_thunk = srso_alias_return_thunk;
+ } else {
setup_force_cpu_cap(X86_FEATURE_SRSO);
+ x86_return_thunk = srso_return_thunk;
+ }
srso_mitigation = SRSO_MITIGATION_SAFE_RET;
} else {
pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -523,7 +523,7 @@ INIT_PER_CPU(irq_stack_backing_store);
#endif

#ifdef CONFIG_RETHUNK
-. = ASSERT((__ret & 0x3f) == 0, "__ret not cacheline-aligned");
+. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned");
. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
#endif

--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -151,10 +151,11 @@ SYM_CODE_END(__x86_indirect_jump_thunk_a
.section .text.__x86.rethunk_untrain

SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
+ UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
ASM_NOP2
lfence
- jmp __x86_return_thunk
+ jmp srso_alias_return_thunk
SYM_FUNC_END(srso_untrain_ret_alias)
__EXPORT_THUNK(srso_untrain_ret_alias)

@@ -184,7 +185,7 @@ SYM_FUNC_END(srso_safe_ret_alias)
* from re-poisioning the BTB prediction.
*/
.align 64
- .skip 64 - (__ret - zen_untrain_ret), 0xcc
+ .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc
SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
ANNOTATE_NOENDBR
/*
@@ -216,10 +217,10 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL,
* evicted, __x86_return_thunk will suffer Straight Line Speculation
* which will be contained safely by the INT3.
*/
-SYM_INNER_LABEL(__ret, SYM_L_GLOBAL)
+SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL)
ret
int3
-SYM_CODE_END(__ret)
+SYM_CODE_END(__x86_return_thunk)

/*
* Ensure the TEST decoding / BTB invalidation is complete.
@@ -230,11 +231,13 @@ SYM_CODE_END(__ret)
* Jump back and execute the RET in the middle of the TEST instruction.
* INT3 is for SLS protection.
*/
- jmp __ret
+ jmp __x86_return_thunk
int3
SYM_FUNC_END(zen_untrain_ret)
__EXPORT_THUNK(zen_untrain_ret)

+EXPORT_SYMBOL(__x86_return_thunk)
+
/*
* SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
* above. On kernel entry, srso_untrain_ret() is executed which is a
@@ -257,6 +260,7 @@ SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLO
int3
int3
int3
+ /* end of movabs */
lfence
call srso_safe_ret
int3
@@ -264,12 +268,23 @@ SYM_CODE_END(srso_safe_ret)
SYM_FUNC_END(srso_untrain_ret)
__EXPORT_THUNK(srso_untrain_ret)

-SYM_FUNC_START(__x86_return_thunk)
- ALTERNATIVE_2 "jmp __ret", "call srso_safe_ret", X86_FEATURE_SRSO, \
- "call srso_safe_ret_alias", X86_FEATURE_SRSO_ALIAS
- int3
-SYM_CODE_END(__x86_return_thunk)
-EXPORT_SYMBOL(__x86_return_thunk)
+/*
+ * Both these do an unbalanced CALL to mess up the RSB, terminate with UD2
+ * to indicate noreturn.
+ */
+SYM_CODE_START(srso_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_safe_ret
+ ud2
+SYM_CODE_END(srso_return_thunk)
+
+SYM_CODE_START(srso_alias_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_safe_ret_alias
+ ud2
+SYM_CODE_END(srso_alias_return_thunk)

#endif /* CONFIG_RETHUNK */





2023-08-09 17:23:20

by Nikolay Borisov

Subject: Re: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess



On 9.08.23 at 10:12, Peter Zijlstra wrote:
> Use the existing configurable return thunk. There is absolute no
> justification for having created this __x86_return_thunk alternative.
>
> To clarify, the whole thing looks like:
>
> Zen3/4 does:
>
> srso_alias_untrain_ret:
> nop2
> lfence
> jmp srso_alias_return_thunk
> int3
>
> srso_alias_safe_ret: // aliasses srso_alias_untrain_ret just so
> add $8, %rsp
> ret
> int3
>
> srso_alias_return_thunk:
> call srso_alias_safe_ret
> ud2
>
> While Zen1/2 does:
>
> srso_untrain_ret:
> movabs $foo, %rax
> lfence
> call srso_safe_ret (jmp srso_return_thunk ?)
> int3
>
> srso_safe_ret: // embedded in movabs immediate
> add $8,%rsp
> ret
> int3
>
> srso_return_thunk:
> call srso_safe_ret
> ud2
>
> While retbleed does:
>
> zen_untrain_ret:
> test $0xcc, %bl
> lfence
> jmp zen_return_thunk
> int3
>
> zen_return_thunk: // embedded in the test instruction
> ret
> int3
>
> Where Zen1/2 flush the BTB using the instruction decoder trick
> (test,movabs) Zen3/4 use instruction aliasing. SRSO adds RSB (RAP in
> AMD speak) stuffing to force a return mis-predict.
>
> That is; the AMD retbleed is a form of Speculative-Type-Confusion
> where the branch predictor is trained to use the BTB to predict the
> RET address, while AMD inception/SRSO is a form of
> Speculative-Type-Confusion where another instruction is trained to be
> treated like a CALL instruction and poison the RSB (RAP).
>
> Pick one of three options at boot.
>


So this boils down to simply removing one level of indirection: instead
of patching the body of __x86_return_thunk, you directly patch the return
sites with the correct thunk.
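
To make that concrete, here is a minimal, self-contained C sketch, not
kernel code: select_return_thunk() and patch_return_site() are made-up
names, and the real rewrite patches machine code rather than calling
through a pointer. Boot code stores exactly one of the three thunks in
x86_return_thunk, and every return site is then rewritten to target it
(or back to a bare "ret; int3" when no thunk is needed):

#include <stdio.h>
#include <stdbool.h>

/* the three possible thunk bodies, reduced to prints */
static void __x86_return_thunk(void)      { puts("zen_untrain_ret flavour: plain ret"); }
static void srso_return_thunk(void)       { puts("call srso_safe_ret; ud2"); }
static void srso_alias_return_thunk(void) { puts("call srso_alias_safe_ret; ud2"); }

/* boot code picks exactly one of the three */
static void (*x86_return_thunk)(void) = __x86_return_thunk;

static bool have_rethunk    = true;   /* X86_FEATURE_RETHUNK */
static bool have_srso_alias = true;   /* X86_FEATURE_SRSO_ALIAS, Zen3/4 */
static bool have_srso       = false;  /* X86_FEATURE_SRSO, Zen1/2 */

static void select_return_thunk(void)
{
	if (have_srso_alias)
		x86_return_thunk = srso_alias_return_thunk;
	else if (have_srso)
		x86_return_thunk = srso_return_thunk;
	/* else: retbleed only, keep __x86_return_thunk */
}

/* what the .return_sites rewrite decides for every compiler-emitted return */
static void patch_return_site(void)
{
	if (have_rethunk)
		x86_return_thunk();   /* site becomes "jmp <selected thunk>" */
	else
		puts("ret; int3");    /* site stays a bare return */
}

int main(void)
{
	select_return_thunk();
	patch_return_site();
	return 0;
}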


Reviewed-by: Nikolay Borisov <[email protected]>

2023-08-10 12:21:32

by Borislav Petkov

Subject: Re: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

On Wed, Aug 09, 2023 at 09:12:20AM +0200, Peter Zijlstra wrote:
> Where Zen1/2 flush the BTB using the instruction decoder trick
> (test,movabs) Zen3/4 use instruction aliasing. SRSO adds RSB (RAP in

BTB aliasing.

> AMD speak) stuffing to force a return mis-predict.

No it doesn't. It causes BTB aliasing which evicts any potentially
poisoned entries.

> That is; the AMD retbleed is a form of Speculative-Type-Confusion
> where the branch predictor is trained to use the BTB to predict the
> RET address, while AMD inception/SRSO is a form of
> Speculative-Type-Confusion where another instruction is trained to be
> treated like a CALL instruction and poison the RSB (RAP).

Nope, Andy explained it already in the 0th message.

> Pick one of three options at boot.

Yes, provided microarchitecturally that works, I'm all for removing the
__ret alternative.

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-08-10 14:20:04

by Borislav Petkov

Subject: Re: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

On Thu, Aug 10, 2023 at 02:37:56PM +0200, Peter Zijlstra wrote:
> It does; so zen1/2 use the decoder thing to flush BTB entry of the RET,
> both retbleed and srso do.
>
> Then zen3/4 use the aliassing trick to flush the BTB entry of the RET.

Yes, I was correcting your "instruction aliasing". It is "BTB aliasing"
by causing those bits in the VAs to XOR.

> Then both srso options use RSB/RAP stuffing to force a mispredict there.

They cause the RETs to mispredict - no stuffing. That's the add $8,
%rsp in the zen3/4 case which causes the RET to mispredict. There's no
doing a bunch of CALLs to stuff something.

> Retbleed doesn't do this.
>
> retbleed is about BTB, srso does both BTB and RSB/RAP.

Yes.

> So this patch doesn't actually change anything except one layer of
> indirection.

I agree with everything from here on to the end. Provided we can do that
and there's no some microarchitectural catch there, I'm all for removing
the __ret alternative.

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-08-10 15:09:17

by Peter Zijlstra

Subject: Re: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

On Thu, Aug 10, 2023 at 02:56:31PM +0200, Borislav Petkov wrote:

> > Then both srso options use RSB/RAP stuffing to force a mispredict there.
>
> They cause the RETs to mispredict - no stuffing. That's the add $8,
> %rsp in the zen3/4 case which causes the RET to mispredict. There's no
> doing a bunch of CALLs to stuff something.

This is what is called RSB stuffing; we've been doing it for ages on the
Intel side, and the code in nospec-branch.h has a number of variants of
this.

CALL srso_safe_ret // push addr of UD2 into RSB -- aka 'stuff'
UD2
srso_safe_ret:
ADD $8, %RSP // skip over the return to UD2
RET // pop RSB, speculate into UD2, miss like a beast


Now compare to __FILL_ONE_RETURN, which has the comment 'Stuff a single
RSB slot.' That expands to:

call 772f
int3
772: add $8, %rsp
lfence

Which is the same sequence and causes the next RET to speculate into
that int3.


So RSB stuffing is sticking addresses to traps in the RSB so that
subsequent predictions go into said traps instead of potentially
user-controlled targets.


2023-08-10 15:13:11

by Peter Zijlstra

Subject: Re: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

On Thu, Aug 10, 2023 at 01:51:48PM +0200, Borislav Petkov wrote:
> On Wed, Aug 09, 2023 at 09:12:20AM +0200, Peter Zijlstra wrote:
> > Where Zen1/2 flush the BTB using the instruction decoder trick
> > (test,movabs) Zen3/4 use instruction aliasing. SRSO adds RSB (RAP in
>
> BTB aliasing.
>
> > AMD speak) stuffing to force a return mis-predict.
>
> No it doesn't. It causes BTB aliasing which evicts any potentially
> poisoned entries.

It does; so zen1/2 use the decoder thing to flush the BTB entry of the
RET, both retbleed and srso do.

Then zen3/4 use the aliasing trick to flush the BTB entry of the RET.

Then both srso options use RSB/RAP stuffing to force a mispredict there.
Retbleed doesn't do this.

retbleed is about BTB, srso does both BTB and RSB/RAP.

> > That is; the AMD retbleed is a form of Speculative-Type-Confusion
> > where the branch predictor is trained to use the BTB to predict the
> > RET address, while AMD inception/SRSO is a form of
> > Speculative-Type-Confusion where another instruction is trained to be
> > treated like a CALL instruction and poison the RSB (RAP).
>
> Nope, Andy explained it already in the 0th message.

I'm still of the opinion that branch-type-confusion is an integral part
of setting up the srso RSB/RAP trickery. It just targets a different
predictor, RSB/RAP vs BTB.

> > Pick one of three options at boot.
>
> Yes, provided microarchitecturally that works, I'm all for removing the
> __ret alternative.

So this patch doesn't actually change anything except one layer of
indirection.

Your thing does:

SYM_FUNC_START(foo)
...
ALTERNATIVE "ret; int3",
"jmp __x86_return_thunk", X86_FEATURE_RETHUNK
SYM_FUNC_END(foo)

SYM_FUNC_START(__x86_return_thunk)
ALTERNATIVE("jmp __ret",
"call srso_safe_ret", X86_FEATURE_SRSO,
"call srso_alias_safe_ret", X86_FEATURE_SRSO_ALIAS);
int3
SYM_FUNC_END(__x86_return_thunk)


So what was RET, jumps to __x86_return_thunk, which then jumps to the
actual return thunk.

After this patch things look equivalent to:

SYM_FUNC_START(foo)
...
ALTERNATIVE "ret; int3"
"jmp __x86_return_thunk", X86_FEATURE_RETHUNK
"jmp srso_return_thunk, X86_FEATURE_SRSO
"jmp srsi_alias_return_thunk", X86_FEATURE_SRSO_ALIAS
SYM_FUNC_END(foo)

SYM_CODE_START(srso_return_thunk)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
call srso_safe_ret;
ud2
SYM_CODE_END(srso_return_thunk)

SYM_CODE_START(srso_alias_return_thunk)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
call srso_alias_safe_ret;
ud2
SYM_CODE_END(srso_alias_return_thunk)


Except of course we don't have an actual ALTERNATIVE at the ret site;
instead we have .return_sites and rewrite things to either "ret; int3" or
whatever function is in x86_return_thunk.


Before this patch, only one ret thunk is used at any one time; after
this patch, still only one ret thunk is used.

Fundamentally, you can only ever use one ret.

IOW this patch changes nothing for SRSO, it still does a jump to a call.
But it does clean up retbleed, which you had as a jump to a jump, back
to just a jump, and it does get rid of that extra alternative layer you
had by using the one we already have at the .return_sites rewrite.


2023-08-11 07:11:14

by Peter Zijlstra

Subject: Re: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

On Thu, Aug 10, 2023 at 02:37:56PM +0200, Peter Zijlstra wrote:

> After this patch things look equivalent to:
>
> SYM_FUNC_START(foo)
> ...
> ALTERNATIVE "ret; int3"
> "jmp __x86_return_thunk", X86_FEATURE_RETHUNK
> "jmp srso_return_thunk, X86_FEATURE_SRSO
> "jmp srsi_alias_return_thunk", X86_FEATURE_SRSO_ALIAS
> SYM_FUNC_END(foo)
>
> SYM_CODE_START(srso_return_thunk)
> UNWIND_HINT_FUNC
> ANNOTATE_NOENDBR
> call srso_safe_ret;
> ud2
> SYM_CODE_END(srso_return_thunk)
>
> SYM_CODE_START(srso_alias_return_thunk)
> UNWIND_HINT_FUNC
> ANNOTATE_NOENDBR
> call srso_alias_safe_ret;
> ud2
> SYM_CODE_END(srso_alias_return_thunk)
>

So it looks like the compilers are still not emitting int3 after jmp,
even with the SLS options enabled :/

This means the tail end of functions compiled with:

-mharden-sls=all -mfunction-return=thunk-extern

is still a regular: jmp __x86_return_thunk, no trailing trap.

https://godbolt.org/z/Ecqv76YbE

If we all could please finally fix that, then I can rewrite the above to
effectively be:

SYM_FUNC_START(foo)
...
ALTERNATIVE "ret; int3"
"jmp __x86_return_thunk", X86_FEATURE_RETHUNK
"call srso_safe_ret, X86_FEATURE_SRSO
"call srso_alias_safe_ret", X86_FEATURE_SRSO_ALIAS
int3 // <--- *MISSING*
SYM_FUNC_END(foo)

Bonus points if I can tell at compile time whether a compiler DTRT; a
feature flag or what have you in the preprocessor would be awesome.

2023-08-11 18:17:15

by Nick Desaulniers

Subject: Re: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

On Fri, Aug 11, 2023 at 12:01 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Aug 10, 2023 at 02:37:56PM +0200, Peter Zijlstra wrote:
>
> > After this patch things look equivalent to:
> >
> > SYM_FUNC_START(foo)
> > ...
> > ALTERNATIVE "ret; int3"
> > "jmp __x86_return_thunk", X86_FEATURE_RETHUNK
> > "jmp srso_return_thunk, X86_FEATURE_SRSO
> > "jmp srsi_alias_return_thunk", X86_FEATURE_SRSO_ALIAS
> > SYM_FUNC_END(foo)
> >
> > SYM_CODE_START(srso_return_thunk)
> > UNWIND_HINT_FUNC
> > ANNOTATE_NOENDBR
> > call srso_safe_ret;
> > ud2
> > SYM_CODE_END(srso_return_thunk)
> >
> > SYM_CODE_START(srso_alias_return_thunk)
> > UNWIND_HINT_FUNC
> > ANNOTATE_NOENDBR
> > call srso_alias_safe_ret;
> > ud2
> > SYM_CODE_END(srso_alias_return_thunk)
> >
>
> So it looks like the compilers are still not emitting int3 after jmp,
> even with the SLS options enabled :/
>
> This means the tail end of functions compiled with:
>
> -mharden-sls=all -mfunction-return=thunk-extern
>
> Is still a regular: jmp __x86_return_thunk, no trailing trap.
>
> https://godbolt.org/z/Ecqv76YbE

I don't have time to finish this today, but
https://reviews.llvm.org/D157734 should do what you're looking for, I
think.

>
> If we all could please finally fix that, then I can rewrite the above to
> effectively be:
>
> SYM_FUNC_START(foo)
> ...
> ALTERNATIVE "ret; int3"
> "jmp __x86_return_thunk", X86_FEATURE_RETHUNK
> "call srso_safe_ret, X86_FEATURE_SRSO
> "call srso_alias_safe_ret", X86_FEATURE_SRSO_ALIAS
> int3 // <--- *MISSING*
> SYM_FUNC_END(foo)
>
> Bonus points if I can compile time tell if a compiler DTRT, feature flag
> or what have you in the preprocessor would be awesome.

Probably not a preprocessor token; in the past I have made that
suggestion and the old guard informed me "no, too many preprocessor
tokens to lex, no more!" I still disagree but that is a viewpoint I
can sympathize with, slightly.

Probably version checks for now on the SLS config (or version checks
on a new kconfig CONFIG_IMPROVED_SLS).
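
As a rough sketch of that version-check idea (standalone C, not a real
kconfig entry; the cutoff versions below are placeholders since no
released compiler advertises the JMP+INT3 behaviour yet, and the macro
name is made up):

#include <stdio.h>

#if defined(__clang__) && __clang_major__ >= 18        /* placeholder cutoff */
# define CC_EMITS_INT3_AFTER_JMP 1
#elif !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 14  /* placeholder */
# define CC_EMITS_INT3_AFTER_JMP 1
#else
# define CC_EMITS_INT3_AFTER_JMP 0
#endif

int main(void)
{
	printf("compiler emits int3 after jmp: %d\n", CC_EMITS_INT3_AFTER_JMP);
	return 0;
}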

--
Thanks,
~Nick Desaulniers

2023-08-12 12:25:47

by Peter Zijlstra

Subject: Re: [RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

On Fri, Aug 11, 2023 at 10:00:31AM -0700, Nick Desaulniers wrote:
> On Fri, Aug 11, 2023 at 12:01 AM Peter Zijlstra <[email protected]> wrote:
> >
> > On Thu, Aug 10, 2023 at 02:37:56PM +0200, Peter Zijlstra wrote:
> >
> > > After this patch things look equivalent to:
> > >
> > > SYM_FUNC_START(foo)
> > > ...
> > > ALTERNATIVE "ret; int3"
> > > "jmp __x86_return_thunk", X86_FEATURE_RETHUNK
> > > "jmp srso_return_thunk, X86_FEATURE_SRSO
> > > "jmp srsi_alias_return_thunk", X86_FEATURE_SRSO_ALIAS
> > > SYM_FUNC_END(foo)
> > >
> > > SYM_CODE_START(srso_return_thunk)
> > > UNWIND_HINT_FUNC
> > > ANNOTATE_NOENDBR
> > > call srso_safe_ret;
> > > ud2
> > > SYM_CODE_END(srso_return_thunk)
> > >
> > > SYM_CODE_START(srso_alias_return_thunk)
> > > UNWIND_HINT_FUNC
> > > ANNOTATE_NOENDBR
> > > call srso_alias_safe_ret;
> > > ud2
> > > SYM_CODE_END(srso_alias_return_thunk)
> > >
> >
> > So it looks like the compilers are still not emitting int3 after jmp,
> > even with the SLS options enabled :/
> >
> > This means the tail end of functions compiled with:
> >
> > -mharden-sls=all -mfunction-return=thunk-extern
> >
> > Is still a regular: jmp __x86_return_thunk, no trailing trap.
> >
> > https://godbolt.org/z/Ecqv76YbE
>
> I don't have time to finish this today, but
> https://reviews.llvm.org/D157734 should do what you're looking for, I
> think.

Hmm, so your wording seems to imply regular SLS would already emit INT3
after jump, but I'm not seeing that in clang-16 output. Should I upgrade
my llvm?

[[edit]] Oooh, now I see, regular SLS would emit RET; INT3, but what I
was alluding to is that sls=all should also emit INT3 after every JMP due
to AMD BTC. This is an SLS option that seems to have gone missing in
both compilers for a long while.


And yesterday I only quickly looked at bigger gcc output and not clang.
But when I look at clang-16 output I see things like:

1053: 2e e8 00 00 00 00 cs call 1059 <yield_to+0xe9> 1055: R_X86_64_PLT32 __x86_indirect_thunk_r11-0x4
1059: 84 c0 test %al,%al
105b: 74 1c je 1079 <yield_to+0x109>
105d: eb 6e jmp 10cd <yield_to+0x15d>

No INT3

105f: 41 bc 01 00 00 00 mov $0x1,%r12d
1065: 80 7c 24 04 00 cmpb $0x0,0x4(%rsp)
106a: 74 0d je 1079 <yield_to+0x109>
106c: 4d 39 fe cmp %r15,%r14
106f: 74 08 je 1079 <yield_to+0x109>
1071: 4c 89 ff mov %r15,%rdi
1074: e8 00 00 00 00 call 1079 <yield_to+0x109> 1075: R_X86_64_PLT32 resched_curr-0x4
1079: 4d 39 fe cmp %r15,%r14
107c: 74 08 je 1086 <yield_to+0x116>
107e: 4c 89 ff mov %r15,%rdi
1081: e8 00 00 00 00 call 1086 <yield_to+0x116> 1082: R_X86_64_PLT32 _raw_spin_unlock-0x4
1086: 4c 89 f7 mov %r14,%rdi
1089: e8 00 00 00 00 call 108e <yield_to+0x11e> 108a: R_X86_64_PLT32 _raw_spin_unlock-0x4
108e: f7 c3 00 02 00 00 test $0x200,%ebx
1094: 74 06 je 109c <yield_to+0x12c>
1096: ff 15 00 00 00 00 call *0x0(%rip) # 109c <yield_to+0x12c> 1098: R_X86_64_PC32 pv_ops+0xfc
109c: 45 85 e4 test %r12d,%r12d
109f: 7e 05 jle 10a6 <yield_to+0x136>
10a1: e8 00 00 00 00 call 10a6 <yield_to+0x136> 10a2: R_X86_64_PLT32 schedule-0x4
10a6: 44 89 e0 mov %r12d,%eax
10a9: 48 83 c4 08 add $0x8,%rsp
10ad: 5b pop %rbx
10ae: 41 5c pop %r12
10b0: 41 5d pop %r13
10b2: 41 5e pop %r14
10b4: 41 5f pop %r15
10b6: 5d pop %rbp
10b7: 2e e9 00 00 00 00 cs jmp 10bd <yield_to+0x14d> 10b9: R_X86_64_PLT32 __x86_return_thunk-0x4

CS padding!!

10bd: 41 bc fd ff ff ff mov $0xfffffffd,%r12d
10c3: f7 c3 00 02 00 00 test $0x200,%ebx


So since you (surprisingly!) CS pad the return thunk, I *could* pull it
off there; 6 bytes is enough space to write: 'CALL foo; INT3'

But really SLS *should* put INT3 after every JMP instruction -- of
course including the return thunk one.
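
For reference, a quick standalone check of the byte arithmetic: e8 rel32
is a 5-byte CALL, cc is INT3, and 2e e9 rel32 is the 6-byte CS-padded JMP
above, so CALL+INT3 fits exactly:

#include <stdio.h>

int main(void)
{
	/* 2e e9 <rel32>: the cs-padded "jmp __x86_return_thunk" clang emits */
	unsigned char cs_jmp[]    = { 0x2e, 0xe9, 0x00, 0x00, 0x00, 0x00 };
	/* e8 <rel32>; cc: "call foo; int3" that could be written over it */
	unsigned char call_int3[] = { 0xe8, 0x00, 0x00, 0x00, 0x00, 0xcc };

	printf("cs jmp rel32:    %zu bytes\n", sizeof(cs_jmp));
	printf("call rel32+int3: %zu bytes\n", sizeof(call_int3));
	return 0;
}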