The current BPF call convention is __nocfi, except when it calls !JIT things,
then it calls regular C functions.
It so happens that with FineIBT the __nocfi and C calling conventions are
incompatible. Specifically __nocfi will call at func+0, while FineIBT will have
endbr-poison there, which is not a valid indirect target. Causing #CP.
Notably this only triggers on IBT enabled hardware, which is probably why this
hasn't been reported (also, most people will have JIT on anyway).
Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for
x86.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/cfi.h | 12 +++++
arch/x86/kernel/alternative.c | 41 ++++++++++++++---
arch/x86/net/bpf_jit_comp.c | 96 +++++++++++++++++++++++++++++++++++++-----
include/linux/bpf.h | 9 +++
4 files changed, 137 insertions(+), 21 deletions(-)
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -9,15 +9,27 @@
*/
#include <linux/bug.h>
+enum cfi_mode {
+ CFI_DEFAULT,
+ CFI_OFF,
+ CFI_KCFI,
+ CFI_FINEIBT,
+};
+
+extern enum cfi_mode cfi_mode;
+
struct pt_regs;
#ifdef CONFIG_CFI_CLANG
enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#define __bpfcall
+extern u32 cfi_bpf_hash;
#else
static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
{
return BUG_TRAP_TYPE_NONE;
}
+#define cfi_bpf_hash 0U
#endif /* CONFIG_CFI_CLANG */
#endif /* _ASM_X86_CFI_H */
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -30,6 +30,7 @@
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
int __read_mostly alternatives_patched;
@@ -832,15 +833,37 @@ void __init_or_module apply_seal_endbr(s
#endif /* CONFIG_X86_KERNEL_IBT */
#ifdef CONFIG_FINEIBT
+#define __CFI_DEFAULT CFI_DEFAULT
+#elif defined(CONFIG_CFI_CLANG)
+#define __CFI_DEFAULT CFI_KCFI
+#else
+#define __CFI_DEFAULT CFI_OFF
+#endif
-enum cfi_mode {
- CFI_DEFAULT,
- CFI_OFF,
- CFI_KCFI,
- CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+extern unsigned int bpf_func_proto(const void *ctx,
+ const struct bpf_insn *insn);
+
+__ADDRESSABLE(bpf_func_proto);
+
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_hash,@object \n"
+" .globl cfi_bpf_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_hash: \n"
+" .long __kcfi_typeid_bpf_func_proto \n"
+" .size cfi_bpf_hash, 4 \n"
+" .popsection \n"
+);
+#endif
+
+#ifdef CONFIG_FINEIBT
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
@@ -1149,8 +1172,10 @@ static void __apply_fineibt(s32 *start_r
goto err;
if (cfi_rand) {
- if (builtin)
+ if (builtin) {
cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ }
ret = cfi_rand_preamble(start_cfi, end_cfi);
if (ret)
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -17,6 +17,7 @@
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
#include <asm/unwind.h>
+#include <asm/cfi.h>
static bool all_callee_regs_used[4] = {true, true, true, true};
@@ -51,9 +52,11 @@ static u8 *emit_code(u8 *ptr, u32 bytes,
do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
#ifdef CONFIG_X86_KERNEL_IBT
-#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4);
#else
#define EMIT_ENDBR()
+#define EMIT_ENDBR_POISON()
#endif
static bool is_imm8(int value)
@@ -247,6 +250,7 @@ struct jit_context {
*/
int tail_call_direct_label;
int tail_call_indirect_label;
+ int prog_offset;
};
/* Maximum number of bytes emitted while JITing one eBPF insn */
@@ -304,21 +308,86 @@ static void pop_callee_regs(u8 **pprog,
*pprog = prog;
}
+static int emit_fineibt(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT_ENDBR();
+ EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash);
+ EMIT2(0x74, 0x07);
+ EMIT2(0x0f, 0x0b);
+ EMIT1(0x90);
+ EMIT_ENDBR_POISON();
+
+ *pprog = prog;
+ return 16;
+}
+
+static int emit_kcfi(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int offset = 5;
+
+ EMIT1_off32(0xb8, cfi_bpf_hash);
+#ifdef CONFIG_CALL_PADDING
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ offset += 11;
+#endif
+ EMIT_ENDBR();
+
+ *pprog = prog;
+ return offset;
+}
+
+static int emit_cfi(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int offset = 0;
+
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ offset = emit_fineibt(&prog);
+ break;
+
+ case CFI_KCFI:
+ offset = emit_kcfi(&prog);
+ break;
+
+ default:
+ EMIT_ENDBR();
+ break;
+ }
+
+ *pprog = prog;
+ return offset;
+}
+
/*
* Emit x86-64 prologue code for BPF program.
* bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
* while jumping to another program
*/
-static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
- bool tail_call_reachable, bool is_subprog,
- bool is_exception_cb)
+static int emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
+ bool tail_call_reachable, bool is_subprog,
+ bool is_exception_cb)
{
u8 *prog = *pprog;
+ int offset;
+ offset = emit_cfi(&prog);
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
- EMIT_ENDBR();
memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
if (!ebpf_from_cbpf) {
@@ -357,6 +426,8 @@ static void emit_prologue(u8 **pprog, u3
if (tail_call_reachable)
EMIT1(0x50); /* push rax */
*pprog = prog;
+
+ return offset;
}
static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
@@ -1083,8 +1154,8 @@ static int do_jit(struct bpf_prog *bpf_p
bool tail_call_seen = false;
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
- int i, excnt = 0;
int ilen, proglen = 0;
+ int i, excnt = 0;
u8 *prog = temp;
int err;
@@ -1094,9 +1165,12 @@ static int do_jit(struct bpf_prog *bpf_p
/* tail call's presence in current prog implies it is reachable */
tail_call_reachable |= tail_call_seen;
- emit_prologue(&prog, bpf_prog->aux->stack_depth,
- bpf_prog_was_classic(bpf_prog), tail_call_reachable,
- bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+ ctx->prog_offset = emit_prologue(&prog, bpf_prog->aux->stack_depth,
+ bpf_prog_was_classic(bpf_prog),
+ tail_call_reachable,
+ bpf_is_subprog(bpf_prog),
+ bpf_prog->aux->exception_cb);
+
/* Exception callback will clobber callee regs for its own use, and
* restore the original callee regs from main prog's stack frame.
*/
@@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str
jit_data->header = header;
jit_data->rw_header = rw_header;
}
- prog->bpf_func = (void *)image;
+ prog->bpf_func = (void *)image + ctx.prog_offset;
prog->jited = 1;
- prog->jited_len = proglen;
+ prog->jited_len = proglen - ctx.prog_offset; // XXX?
} else {
prog = orig_prog;
}
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -29,6 +29,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/static_call.h>
#include <linux/memcontrol.h>
+#include <linux/cfi.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
@@ -1188,7 +1189,11 @@ struct bpf_dispatcher {
#endif
};
-static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
+#ifndef __bpfcall
+#define __bpfcall __nocfi
+#endif
+
+static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func(
const void *ctx,
const struct bpf_insn *insnsi,
bpf_func_t bpf_func)
@@ -1278,7 +1283,7 @@ int arch_prepare_bpf_dispatcher(void *im
#define DEFINE_BPF_DISPATCHER(name) \
__BPF_DISPATCHER_SC(name); \
- noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \
+ noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \
const void *ctx, \
const struct bpf_insn *insnsi, \
bpf_func_t bpf_func) \
On Mon, Nov 20, 2023 at 03:46:44PM +0100, Peter Zijlstra wrote:
> @@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str
> jit_data->header = header;
> jit_data->rw_header = rw_header;
> }
> - prog->bpf_func = (void *)image;
> + prog->bpf_func = (void *)image + ctx.prog_offset;
> prog->jited = 1;
> - prog->jited_len = proglen;
> + prog->jited_len = proglen - ctx.prog_offset; // XXX?
> } else {
> prog = orig_prog;
> }
Note the XXX there, I wasn't sure what the desired semantics of proglen
was. As implemented it is the length from where bpf_func points to the
end, not including the pre-preamble -- as indicated by offset.
On Mon, Nov 20, 2023 at 03:46:44PM +0100, Peter Zijlstra wrote:
> +
> +#ifdef CONFIG_CFI_CLANG
> +struct bpf_insn;
> +
> +extern unsigned int bpf_func_proto(const void *ctx,
> + const struct bpf_insn *insn);
To make it more obvious what is going on could you rename it to
__bpf_prog_runX()
and add a comment that its prototype should match exactly
bpf interpreters created by DEFINE_BPF_PROG_RUN() macro,
otherwise cfi will explode.
> +
> +__ADDRESSABLE(bpf_func_proto);
> +
> +asm (
> +" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
> +" .type cfi_bpf_hash,@object \n"
> +" .globl cfi_bpf_hash \n"
> +" .p2align 2, 0x0 \n"
> +"cfi_bpf_hash: \n"
> +" .long __kcfi_typeid_bpf_func_proto \n"
Took me some time to grok this.
Cannot you use __CFI_TYPE() macro here ?
> +" .size cfi_bpf_hash, 4 \n"
> +" .popsection \n"
> +);
> +#endif
...
> +static int emit_fineibt(u8 **pprog)
> +{
> + u8 *prog = *pprog;
> +
> + EMIT_ENDBR();
> + EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash);
> + EMIT2(0x74, 0x07);
> + EMIT2(0x0f, 0x0b);
> + EMIT1(0x90);
> + EMIT_ENDBR_POISON();
Please add comments what this asm does. No one can read hex.
> +
> + *pprog = prog;
> + return 16;
16 means "the caller of this code will jump to endbr_poison", right?
> +}
> +
> +static int emit_kcfi(u8 **pprog)
> +{
> + u8 *prog = *pprog;
> + int offset = 5;
> +
> + EMIT1_off32(0xb8, cfi_bpf_hash);
and here too.
> +#ifdef CONFIG_CALL_PADDING
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + EMIT1(0x90);
> + offset += 11;
> +#endif
> + EMIT_ENDBR();
> +
> + *pprog = prog;
> + return offset;
5 or 16 would mean "jump to endbr" ?
> +}
> +
> +static int emit_cfi(u8 **pprog)
> +{
> + u8 *prog = *pprog;
> + int offset = 0;
> +
> + switch (cfi_mode) {
> + case CFI_FINEIBT:
> + offset = emit_fineibt(&prog);
> + break;
> +
> + case CFI_KCFI:
> + offset = emit_kcfi(&prog);
> + break;
> +
> + default:
> + EMIT_ENDBR();
> + break;
> + }
> +
> + *pprog = prog;
> + return offset;
> +}
> +
> /*
> * Emit x86-64 prologue code for BPF program.
> * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
> * while jumping to another program
> */
> -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
> - bool tail_call_reachable, bool is_subprog,
> - bool is_exception_cb)
> +static int emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
> + bool tail_call_reachable, bool is_subprog,
> + bool is_exception_cb)
> {
> u8 *prog = *pprog;
> + int offset;
>
> + offset = emit_cfi(&prog);
I'm not sure doing cfi_bpf_hash check in JITed code is completely solving the problem.
From bpf_dispatcher_*_func() calling into JITed will work,
but this emit_prologue() is doing the same job for all bpf progs.
Some bpf progs call each other directly and indirectly.
bpf_dispatcher_*_func() -> JITed_BPF_A -> JITed_BPF_B.
A into B can be a direct call (which cfi doesn't care about) and
indirect via emit_bpf_tail_call_indirect()->emit_indirect_jump().
Should we care about fineibt/kcfi there too?
> /* BPF trampoline can be made to work without these nops,
> * but let's waste 5 bytes for now and optimize later
> */
> - EMIT_ENDBR();
> memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
> prog += X86_PATCH_SIZE;
> if (!ebpf_from_cbpf) {
> @@ -357,6 +426,8 @@ static void emit_prologue(u8 **pprog, u3
> if (tail_call_reachable)
> EMIT1(0x50); /* push rax */
> *pprog = prog;
> +
> + return offset;
> }
>
> static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
> @@ -1083,8 +1154,8 @@ static int do_jit(struct bpf_prog *bpf_p
> bool tail_call_seen = false;
> bool seen_exit = false;
> u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
> - int i, excnt = 0;
> int ilen, proglen = 0;
> + int i, excnt = 0;
> u8 *prog = temp;
> int err;
>
> @@ -1094,9 +1165,12 @@ static int do_jit(struct bpf_prog *bpf_p
> /* tail call's presence in current prog implies it is reachable */
> tail_call_reachable |= tail_call_seen;
>
> - emit_prologue(&prog, bpf_prog->aux->stack_depth,
> - bpf_prog_was_classic(bpf_prog), tail_call_reachable,
> - bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
> + ctx->prog_offset = emit_prologue(&prog, bpf_prog->aux->stack_depth,
> + bpf_prog_was_classic(bpf_prog),
> + tail_call_reachable,
> + bpf_is_subprog(bpf_prog),
> + bpf_prog->aux->exception_cb);
> +
> /* Exception callback will clobber callee regs for its own use, and
> * restore the original callee regs from main prog's stack frame.
> */
> @@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str
> jit_data->header = header;
> jit_data->rw_header = rw_header;
> }
> - prog->bpf_func = (void *)image;
> + prog->bpf_func = (void *)image + ctx.prog_offset;
I don't understand this.
prog->bpf_func is the main entry point. Everything jumps there.
Are you trying to skip all of cfi code in the prologue and let
xdp_dispatcher jump to endbr or endbr_poison (depending on fineibt vs kcfi) ?
Then what is the point of earlier asm bits?
Is it a some clang thing that knows to offset indirect jump by
exactly that many hard coded bytes ?
Something in the clang does ptr -= 16 in case of fineibt and just
jumps there ? and ptr -= 5 for kcfi ?
If so, please add a giant comment explaining that.
No one should be reverse engineering such intricate details.
> prog->jited = 1;
> - prog->jited_len = proglen;
> + prog->jited_len = proglen - ctx.prog_offset; // XXX?
jited_len is used later to cover the whole generated code.
See bpf_prog_ksym_set_addr():
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
we definitely want ksym [start, end] to cover every useful byte
of JITed code in case IRQ happens on that byte.
Without covering cfi prologue the stack dump will be wrong for that frame.
I guess if xdp_dispatcher with fineibt=on jumps into prog->bpf_func - 16
and IRQ fires we don't care that much about accurate stack of last frame ?
I guess it's acceptable, but a comment is necessary.
On Tue, Nov 21, 2023 at 06:18:17PM -0800, Alexei Starovoitov wrote:
> On Mon, Nov 20, 2023 at 03:46:44PM +0100, Peter Zijlstra wrote:
> > +
> > +#ifdef CONFIG_CFI_CLANG
> > +struct bpf_insn;
> > +
> > +extern unsigned int bpf_func_proto(const void *ctx,
> > + const struct bpf_insn *insn);
>
> To make it more obvious what is going on could you rename it to
> __bpf_prog_runX()
> and add a comment that its prototype should match exactly
> bpf interpreters created by DEFINE_BPF_PROG_RUN() macro,
> otherwise cfi will explode.
Sure.
>
> > +
> > +__ADDRESSABLE(bpf_func_proto);
> > +
> > +asm (
> > +" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
> > +" .type cfi_bpf_hash,@object \n"
> > +" .globl cfi_bpf_hash \n"
> > +" .p2align 2, 0x0 \n"
> > +"cfi_bpf_hash: \n"
> > +" .long __kcfi_typeid_bpf_func_proto \n"
>
> Took me some time to grok this.
> Cannot you use __CFI_TYPE() macro here ?
__CFI_TYPE() creates the content of the __cfi_foo prefix symbol, which
is different from what the above does. The above is basically:
u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid_bpf_func_proto;
Except I need to do that in asm because __kcfi_typeid magic only works
in asm. I'll put the C version in a comment on top.
> > +" .size cfi_bpf_hash, 4 \n"
> > +" .popsection \n"
> > +);
> > +#endif
> ...
> > +static int emit_fineibt(u8 **pprog)
> > +{
> > + u8 *prog = *pprog;
> > +
> > + EMIT_ENDBR();
> > + EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash);
> > + EMIT2(0x74, 0x07);
> > + EMIT2(0x0f, 0x0b);
> > + EMIT1(0x90);
> > + EMIT_ENDBR_POISON();
>
> Please add comments what this asm does. No one can read hex.
Well, I've stared at this so very long that I can in fact get quite a
long way with just hex :-/ But point taken. My only problem here is that
this file uses Intel syntax, and that melts my brain.
Would it be acceptable to have AT&T syntax comments?
> > +
> > + *pprog = prog;
> > + return 16;
>
> 16 means "the caller of this code will jump to endbr_poison", right?
Ah, so the way FineIBT works is that direct calls go to foo()+0, as
normal. However the indirect calls will target foo()-16. The 16 bytes
preceding every symbol will contain the FineIBT landing pad.
As such, we need to offset prog->bpf_func by the expected amount,
otherwise foo()-16 will land in outer space and things go *boom*.
To be very explicit, let me list all the various forms of function
calls:
Traditional:
foo:
... code here ...
ret
direct caller:
call foo
indirect caller:
lea foo(%rip), %r11
call *%r11
IBT:
foo:
endbr64
... code here ...
ret
direct caller:
call foo / call foo+4
indirect caller:
lea foo(%rip), %r11
...
call *%r11
kCFI:
__cfi_foo:
movl $0x12345678, %rax
(11 nops when CALL_PADDING)
foo:
endbr64 (when also IBT)
... code here ...
ret
direct caller:
call foo / call foo+4
indirect caller:
lea foo(%rip), %r11
...
movl $(-0x12345678), %r10d
addl -15(%r11), %r10d (or -4 without CALL_PADDING)
je 1f
ud2
1:call *%r11
FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime
patches things to look like):
__cfi_foo:
endbr64
subl $0x12345678, %r10d
jz foo
ud2
nop
foo:
osp nop3 (was endbr64)
... code here ...
ret
direct caller:
call foo / call foo+4
indirect caller:
lea foo(%rip), %r11
...
movl $0x12345678, %r10d
subl $16, %r11
nop4
call *%r11
As can be seen, both kCFI and FineIBT use the prefix __cfi symbol /
negative offsets.
To make this work the JIT starts by emitting the prefix text but then
offsets prog->bpf_func to point to the 'real' begin.
> > +}
> > +
> > +static int emit_kcfi(u8 **pprog)
> > +{
> > + u8 *prog = *pprog;
> > + int offset = 5;
> > +
> > + EMIT1_off32(0xb8, cfi_bpf_hash);
>
> and here too.
>
> > +#ifdef CONFIG_CALL_PADDING
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + EMIT1(0x90);
> > + offset += 11;
> > +#endif
> > + EMIT_ENDBR();
> > +
> > + *pprog = prog;
> > + return offset;
>
> 5 or 16 would mean "jump to endbr" ?
It's the size of the prefix symbol.
> > +}
> > +
> > +static int emit_cfi(u8 **pprog)
> > +{
> > + u8 *prog = *pprog;
> > + int offset = 0;
> > +
> > + switch (cfi_mode) {
> > + case CFI_FINEIBT:
> > + offset = emit_fineibt(&prog);
> > + break;
> > +
> > + case CFI_KCFI:
> > + offset = emit_kcfi(&prog);
> > + break;
> > +
> > + default:
> > + EMIT_ENDBR();
> > + break;
> > + }
> > +
> > + *pprog = prog;
> > + return offset;
> > +}
> > +
> > /*
> > * Emit x86-64 prologue code for BPF program.
> > * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
> > * while jumping to another program
> > */
> > -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
> > - bool tail_call_reachable, bool is_subprog,
> > - bool is_exception_cb)
> > +static int emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
> > + bool tail_call_reachable, bool is_subprog,
> > + bool is_exception_cb)
> > {
> > u8 *prog = *pprog;
> > + int offset;
> >
> > + offset = emit_cfi(&prog);
>
> I'm not sure doing cfi_bpf_hash check in JITed code is completely solving the problem.
> From bpf_dispatcher_*_func() calling into JITed will work,
> but this emit_prologue() is doing the same job for all bpf progs.
> Some bpf progs call each other directly and indirectly.
> bpf_dispatcher_*_func() -> JITed_BPF_A -> JITed_BPF_B.
> A into B can be a direct call (which cfi doesn't care about) and
> indirect via emit_bpf_tail_call_indirect()->emit_indirect_jump().
> Should we care about fineibt/kcfi there too?
The way I understood the tail-call thing to work is that it jumps to
bpf_prog + X86_TAIL_CALL_OFFSET, we already emit an extra ENDBR there to
make this work.
So the A -> B indirect call is otherwise unadornen and only needs ENDBR.
Ideally that would use kCFI/FineIBT but since it also skips some of the
setup, this gets to be non-trivial, so I've let this be as is.
> > /* BPF trampoline can be made to work without these nops,
> > * but let's waste 5 bytes for now and optimize later
> > */
> > - EMIT_ENDBR();
> > memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
> > prog += X86_PATCH_SIZE;
> > if (!ebpf_from_cbpf) {
> > @@ -357,6 +426,8 @@ static void emit_prologue(u8 **pprog, u3
> > if (tail_call_reachable)
> > EMIT1(0x50); /* push rax */
> > *pprog = prog;
> > +
> > + return offset;
> > }
> >
> > static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
> > @@ -1083,8 +1154,8 @@ static int do_jit(struct bpf_prog *bpf_p
> > bool tail_call_seen = false;
> > bool seen_exit = false;
> > u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
> > - int i, excnt = 0;
> > int ilen, proglen = 0;
> > + int i, excnt = 0;
> > u8 *prog = temp;
> > int err;
> >
> > @@ -1094,9 +1165,12 @@ static int do_jit(struct bpf_prog *bpf_p
> > /* tail call's presence in current prog implies it is reachable */
> > tail_call_reachable |= tail_call_seen;
> >
> > - emit_prologue(&prog, bpf_prog->aux->stack_depth,
> > - bpf_prog_was_classic(bpf_prog), tail_call_reachable,
> > - bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
> > + ctx->prog_offset = emit_prologue(&prog, bpf_prog->aux->stack_depth,
> > + bpf_prog_was_classic(bpf_prog),
> > + tail_call_reachable,
> > + bpf_is_subprog(bpf_prog),
> > + bpf_prog->aux->exception_cb);
> > +
> > /* Exception callback will clobber callee regs for its own use, and
> > * restore the original callee regs from main prog's stack frame.
> > */
> > @@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str
> > jit_data->header = header;
> > jit_data->rw_header = rw_header;
> > }
> > - prog->bpf_func = (void *)image;
> > + prog->bpf_func = (void *)image + ctx.prog_offset;
>
> I don't understand this.
> Is it a some clang thing that knows to offset indirect jump by
> exactly that many hard coded bytes ?
> Something in the clang does ptr -= 16 in case of fineibt and just
> jumps there ? and ptr -= 5 for kcfi ?
Yep, that. I hope my earlier explanation clarified this.
> If so, please add a giant comment explaining that.
> No one should be reverse engineering such intricate details.
So the kCFI thing is 'new' but readily inspected by objdump or godbolt:
https://godbolt.org/z/sGe18z3ca
(@Sami, that .Ltmp15 thing, I don't see that in the kernel, what
compiler flag makes that go away?)
As to FineIBT, that has a big comment in arch/x86/kernel/alternative.c
where I rewrite the kCFI thing into FineIBT. I can refer there to avoid
duplicating comments, would that work?
>
> > prog->jited = 1;
> > - prog->jited_len = proglen;
> > + prog->jited_len = proglen - ctx.prog_offset; // XXX?
>
> jited_len is used later to cover the whole generated code.
> See bpf_prog_ksym_set_addr():
> prog->aux->ksym.start = (unsigned long) prog->bpf_func;
> prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
> we definitely want ksym [start, end] to cover every useful byte
> of JITed code in case IRQ happens on that byte.
> Without covering cfi prologue the stack dump will be wrong for that frame.
> I guess if xdp_dispatcher with fineibt=on jumps into prog->bpf_func - 16
> and IRQ fires we don't care that much about accurate stack of last frame ?
> I guess it's acceptable, but a comment is necessary.
Ah, so normally the __cfi_foo symbol would catch those, lemme see what I
can do here.
Thanks!
On Wed, Nov 22, 2023 at 12:15:17PM +0100, Peter Zijlstra wrote:
> Ah, so normally the __cfi_foo symbol would catch those, lemme see what I
> can do here.
I have the below delta (untested etc..), does that look about right?
---
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -845,19 +845,24 @@ enum cfi_mode cfi_mode __ro_after_init =
#ifdef CONFIG_CFI_CLANG
struct bpf_insn;
-extern unsigned int bpf_func_proto(const void *ctx,
- const struct bpf_insn *insn);
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+ const struct bpf_insn *insn);
-__ADDRESSABLE(bpf_func_proto);
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
-/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid_bpf_func_proto */
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX */
asm (
" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
" .type cfi_bpf_hash,@object \n"
" .globl cfi_bpf_hash \n"
" .p2align 2, 0x0 \n"
"cfi_bpf_hash: \n"
-" .long __kcfi_typeid_bpf_func_proto \n"
+" .long __kcfi_typeid___bpf_prog_runX \n"
" .size cfi_bpf_hash, 4 \n"
" .popsection \n"
);
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -308,15 +308,20 @@ static void pop_callee_regs(u8 **pprog,
*pprog = prog;
}
+/*
+ * Emit the various CFI preambles, see the large comment about FineIBT
+ * in arch/x86/kernel/alternative.c
+ */
+
static int emit_fineibt(u8 **pprog)
{
u8 *prog = *pprog;
EMIT_ENDBR();
- EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash);
- EMIT2(0x74, 0x07);
- EMIT2(0x0f, 0x0b);
- EMIT1(0x90);
+ EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash); /* subl $hash, %r10d */
+ EMIT2(0x74, 0x07); /* jz.d8 +7 */
+ EMIT2(0x0f, 0x0b); /* ud2 */
+ EMIT1(0x90); /* nop */
EMIT_ENDBR_POISON();
*pprog = prog;
@@ -328,7 +333,7 @@ static int emit_kcfi(u8 **pprog)
u8 *prog = *pprog;
int offset = 5;
- EMIT1_off32(0xb8, cfi_bpf_hash);
+ EMIT1_off32(0xb8, cfi_bpf_hash); /* movl $hash, %eax */
#ifdef CONFIG_CALL_PADDING
EMIT1(0x90);
EMIT1(0x90);
@@ -3009,6 +3014,10 @@ struct bpf_prog *bpf_int_jit_compile(str
jit_data->header = header;
jit_data->rw_header = rw_header;
}
+ /*
+ * ctx.prog_offset is used when CFI preambles put code *before*
+ * the function. See emit_cfi().
+ */
prog->bpf_func = (void *)image + ctx.prog_offset;
prog->jited = 1;
prog->jited_len = proglen - ctx.prog_offset; // XXX?
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1431,6 +1431,9 @@ struct bpf_prog_aux {
struct bpf_kfunc_desc_tab *kfunc_tab;
struct bpf_kfunc_btf_tab *kfunc_btf_tab;
u32 size_poke_tab;
+#ifdef CONFIG_FINEIBT
+ struct bpf_ksym ksym_prefix;
+#endif
struct bpf_ksym ksym;
const struct bpf_prog_ops *ops;
struct bpf_map **used_maps;
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -683,6 +683,23 @@ void bpf_prog_kallsyms_add(struct bpf_pr
fp->aux->ksym.prog = true;
bpf_ksym_add(&fp->aux->ksym);
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * When FineIBT, code in the __cfi_foo() symbols can get executed
+ * and hence unwinder needs help.
+ */
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+
+ snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+ "__cfi_%s", fp->aux->ksym.name);
+
+ prog->aux->ksym_prefix.start = (unsigned long) prog->bpf_func - 16;
+ prog->aux->ksym_prefix.end = (unsigned long) prog->bpf_func;
+
+ bpf_ksym_add(&fp->aux->ksym_prefix);
+#endif
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
On Wed, Nov 22, 2023 at 01:41:34PM +0100, Peter Zijlstra wrote:
> +#ifdef CONFIG_FINEIBT
> + /*
> + * When FineIBT, code in the __cfi_foo() symbols can get executed
> + * and hence unwinder needs help.
> + */
> + if (cfi_mode != CFI_FINEIBT)
> + return;
> +
> + snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
> + "__cfi_%s", fp->aux->ksym.name);
> +
> + prog->aux->ksym_prefix.start = (unsigned long) prog->bpf_func - 16;
> + prog->aux->ksym_prefix.end = (unsigned long) prog->bpf_func;
> +
> + bpf_ksym_add(&fp->aux->ksym_prefix);
> +#endif
> }
s/prog/fp/ makes compiler happier
On Wed, Nov 22, 2023 at 3:15 AM Peter Zijlstra <[email protected]> wrote:
>
>
> To be very explicit, let me list all the various forms of function
> calls:
>
> Traditional:
>
> foo:
> ... code here ...
> ret
>
> direct caller:
>
> call foo
>
> indirect caller:
>
> lea foo(%rip), %r11
> call *%r11
>
> IBT:
>
> foo:
> endbr64
> ... code here ...
> ret
>
> direct caller:
>
> call foo / call foo+4
>
> indirect caller:
>
> lea foo(%rip), %r11
> ...
> call *%r11
>
>
> kCFI:
>
> __cfi_foo:
> movl $0x12345678, %rax
> (11 nops when CALL_PADDING)
> foo:
> endbr64 (when also IBT)
> ... code here ...
> ret
>
> direct caller:
>
> call foo / call foo+4
>
> indirect caller:
>
> lea foo(%rip), %r11
> ...
> movl $(-0x12345678), %r10d
> addl -15(%r11), %r10d (or -4 without CALL_PADDING)
> je 1f
> ud2
> 1:call *%r11
>
>
> FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime
> patches things to look like):
>
> __cfi_foo:
> endbr64
> subl $0x12345678, %r10d
> jz foo
> ud2
> nop
> foo:
> osp nop3 (was endbr64)
> ... code here ...
> ret
>
> direct caller:
>
> call foo / call foo+4
>
> indirect caller:
>
> lea foo(%rip), %r11
> ...
> movl $0x12345678, %r10d
> subl $16, %r11
> nop4
> call *%r11
Got it. That helps a lot!
You kind of have this comment scattered through arch/x86/kernel/alternative.c
but having it in one place like above would go a long way.
Could you please add it to arch/x86/net/bpf_jit_comp.c
or arch/x86/include/asm/cfi.h next to enum cfi_mode ?
> > I'm not sure doing cfi_bpf_hash check in JITed code is completely solving the problem.
> > From bpf_dispatcher_*_func() calling into JITed will work,
> > but this emit_prologue() is doing the same job for all bpf progs.
> > Some bpf progs call each other directly and indirectly.
> > bpf_dispatcher_*_func() -> JITed_BPF_A -> JITed_BPF_B.
> > A into B can be a direct call (which cfi doesn't care about) and
> > indirect via emit_bpf_tail_call_indirect()->emit_indirect_jump().
> > Should we care about fineibt/kcfi there too?
>
> The way I understood the tail-call thing to work is that it jumps to
> bpf_prog + X86_TAIL_CALL_OFFSET, we already emit an extra ENDBR there to
> make this work.
>
> So the A -> B indirect call is otherwise unadornen and only needs ENDBR.
>
> Ideally that would use kCFI/FineIBT but since it also skips some of the
> setup, this gets to be non-trivial, so I've let this be as is.
I see. yeah. The setup is not trivial indeed. Keep as-is is fine.
> So the kCFI thing is 'new' but readily inspected by objdump or godbolt:
>
> https://godbolt.org/z/sGe18z3ca
>
> (@Sami, that .Ltmp15 thing, I don't see that in the kernel, what
> compiler flag makes that go away?)
I also noticed this discrepancy. It doesn't seem to be used.
Looks weird to spend 8 bytes to store -sizeof(ud2)
> As to FineIBT, that has a big comment in arch/x86/kernel/alternative.c
> where I rewrite the kCFI thing into FineIBT. I can refer there to avoid
> duplicating comments, would that work?
Just the above comment somewhere would work.
I wouldn't worry about duplication. This is tricky stuff.
When gcc folks get around implementing kcfi they will find it useful too.
On Wed, Nov 22, 2023 at 4:41 AM Peter Zijlstra <[email protected]> wrote:
>
>
> +/*
> + * Emit the various CFI preambles, see the large comment about FineIBT
> + * in arch/x86/kernel/alternative.c
.. and in cfi.h ..
which will have a copy-paste from your other email?
> prog->bpf_func = (void *)image + ctx.prog_offset;
> prog->jited = 1;
> prog->jited_len = proglen - ctx.prog_offset; // XXX?
Just drop XXX.
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1431,6 +1431,9 @@ struct bpf_prog_aux {
> struct bpf_kfunc_desc_tab *kfunc_tab;
> struct bpf_kfunc_btf_tab *kfunc_btf_tab;
> u32 size_poke_tab;
> +#ifdef CONFIG_FINEIBT
> + struct bpf_ksym ksym_prefix;
> +#endif
> struct bpf_ksym ksym;
> const struct bpf_prog_ops *ops;
> struct bpf_map **used_maps;
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -683,6 +683,23 @@ void bpf_prog_kallsyms_add(struct bpf_pr
> fp->aux->ksym.prog = true;
>
> bpf_ksym_add(&fp->aux->ksym);
> +
> +#ifdef CONFIG_FINEIBT
> + /*
> + * When FineIBT, code in the __cfi_foo() symbols can get executed
> + * and hence unwinder needs help.
> + */
I like the idea!
> + if (cfi_mode != CFI_FINEIBT)
> + return;
The cfi_mode var needs to be global along with enum ?
Or some new helper function from arch/x86 ?
> +
> + snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
> + "__cfi_%s", fp->aux->ksym.name);
> +
> + prog->aux->ksym_prefix.start = (unsigned long) prog->bpf_func - 16;
> + prog->aux->ksym_prefix.end = (unsigned long) prog->bpf_func;
> +
> + bpf_ksym_add(&fp->aux->ksym_prefix);
> +#endif
> }
>
> void bpf_prog_kallsyms_del(struct bpf_prog *fp)
and handle deletion of ksym_prefix here.
I think it's shaping up nicely.
Pls resend both patches as a set and cc bpf @ vger.
BPF CI will pick it up and test on arm64, x86-64, s390 with gcc and clang.
We don't do CONFIG_*IBT testing automatically, but
I can manually try that after the holidays.
On Wed, Nov 22, 2023 at 3:15 AM Peter Zijlstra <[email protected]> wrote:
>
> So the kCFI thing is 'new' but readily inspected by objdump or godbolt:
>
> https://godbolt.org/z/sGe18z3ca
>
> (@Sami, that .Ltmp15 thing, I don't see that in the kernel, what
> compiler flag makes that go away?)
Hmm, that looks like that's what we emit to .kcfi_traps. I suppose
Godbolt just doesn't show the section directives?
https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/kcfi.ll#L30
Sami
On Fri, Dec 1, 2023 at 3:54 PM Sami Tolvanen <[email protected]> wrote:
>
> On Wed, Nov 22, 2023 at 3:15 AM Peter Zijlstra <[email protected]> wrote:
> >
> > So the kCFI thing is 'new' but readily inspected by objdump or godbolt:
> >
> > https://godbolt.org/z/sGe18z3ca
> >
> > (@Sami, that .Ltmp15 thing, I don't see that in the kernel, what
> > compiler flag makes that go away?)
>
> Hmm, that looks like that's what we emit to .kcfi_traps. I suppose
> Godbolt just doesn't show the section directives?
Filter > [uncheck] Directives
>
> https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/kcfi.ll#L30
>
> Sami
--
Thanks,
~Nick Desaulniers