Changes in V2
V1: https://lore.kernel.org/bpf/[email protected]/
- Used S7 in place of S11 for storing kern_vm_start
- Used 17 in place of 16 for the DONT_CLEAR marker
- Removed an unused variable
- Removed some misleading information from the commit message.
This series adds support for the PROBE_MEM32 and bpf_addr_space_cast
instructions to the RISCV BPF JIT. These two instructions enable support
for BPF Arena.
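For context, a rough sketch of the kind of program this enables, loosely
based on the arena selftests (the __arena macro, map definition, and
section names are assumptions taken from those tests, not code in this
series):

    /* Arena map: memory shared between BPF programs and user space. */
    struct {
            __uint(type, BPF_MAP_TYPE_ARENA);
            __uint(map_flags, BPF_F_MMAPABLE);
            __uint(max_entries, 1);     /* number of arena pages */
    } arena SEC(".maps");

    /* Pointers into the arena live in address_space(1). */
    #define __arena __attribute__((address_space(1)))

    int __arena *counter;

    SEC("syscall")
    int touch_arena(void *ctx)
    {
            if (counter)
                    (*counter)++;   /* JITed as PROBE_MEM32 load + store */
            return 0;
    }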
All arena-related selftests are passing:
root@rv-tester:~/bpf# uname -p
riscv64
root@rv-tester:~/bpf# ./test_progs -a "*arena*"
#3/1 arena_htab/arena_htab_llvm:OK
#3/2 arena_htab/arena_htab_asm:OK
#3 arena_htab:OK
#4/1 arena_list/arena_list_1:OK
#4/2 arena_list/arena_list_1000:OK
#4 arena_list:OK
#434/1 verifier_arena/basic_alloc1:OK
#434/2 verifier_arena/basic_alloc2:OK
#434/3 verifier_arena/basic_alloc3:OK
#434/4 verifier_arena/iter_maps1:OK
#434/5 verifier_arena/iter_maps2:OK
#434/6 verifier_arena/iter_maps3:OK
#434 verifier_arena:OK
Summary: 3/10 PASSED, 0 SKIPPED, 0 FAILED
This series needs patch [1], which adds the insn_is_cast_user() helper, in
order to build. It also needs patches/commits [2][3] to work correctly.
[1] https://lore.kernel.org/bpf/[email protected]/
[2] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/commit/?id=f7f5d1808b1b66935a24dd796dd1a0612ca9c147
[3] https://lore.kernel.org/bpf/[email protected]/
Puranjay Mohan (2):
bpf,riscv: Implement PROBE_MEM32 pseudo instructions
bpf,riscv: Implement bpf_addr_space_cast instruction
arch/riscv/net/bpf_jit.h | 2 +
arch/riscv/net/bpf_jit_comp64.c | 207 +++++++++++++++++++++++++++++++-
arch/riscv/net/bpf_jit_core.c | 2 +
3 files changed, 208 insertions(+), 3 deletions(-)
--
2.40.1
Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW]
instructions. They are similar to PROBE_MEM instructions with the
following differences:
- PROBE_MEM32 supports store.
- PROBE_MEM32 relies on the verifier to clear the upper 32 bits of the
  src/dst register.
- PROBE_MEM32 adds the 64-bit kern_vm_start address (which is stored in S7
  in the prologue) to the src/dst register. Due to the way bpf_arena is
  constructed, such an S7 + reg + off16 access is guaranteed to be within
  the arena's virtual range, so no address check is needed at run time.
- S7 is a free callee-saved register, so it is used to store kern_vm_start.
- PROBE_MEM32 allows STX and ST. If they fault, the store is a nop. When
  LDX faults, the destination register is zeroed.
To support these on riscv, we compute tmp = S7 + src/dst reg and then use
tmp as the new src/dst register. This allows us to reuse most of the code
for the normal [LDX | STX | ST] cases.
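As an illustration, here is what the emitted sequence for a PROBE_MEM32
word load does, written as standalone C (a sketch only; the real JIT emits
RISC-V instructions and attaches an exception table entry to the load):

    #include <stdint.h>

    /* kern_vm_start is what the prologue loads into S7. */
    static uint32_t probe_mem32_ldx_w(uint64_t kern_vm_start,
                                      uint32_t src, int16_t off)
    {
            /* tmp = S7 + src; the verifier has already cleared the
             * upper 32 bits of src, so this lands in the arena range.
             */
            uint64_t tmp = kern_vm_start + src;

            /* If the access faults, the fixup handler zeroes the
             * destination register instead of killing the program.
             */
            return *(uint32_t *)(tmp + off);
    }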
Signed-off-by: Puranjay Mohan <[email protected]>
---
arch/riscv/net/bpf_jit.h | 1 +
arch/riscv/net/bpf_jit_comp64.c | 193 +++++++++++++++++++++++++++++++-
arch/riscv/net/bpf_jit_core.c | 1 +
3 files changed, 192 insertions(+), 3 deletions(-)
diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index f4b6b3b9edda..8a47da08dd9c 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -81,6 +81,7 @@ struct rv_jit_context {
int nexentries;
unsigned long flags;
int stack_size;
+ u64 arena_vm_start;
};
/* Convert from ninsns to bytes. */
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 1adf2f39ce59..0c0588e327af 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -255,6 +255,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx);
store_offset -= 8;
}
+ if (ctx->arena_vm_start) {
+ emit_ld(RV_REG_S7, store_offset, RV_REG_SP, ctx);
+ store_offset -= 8;
+ }
emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx);
/* Set return value. */
@@ -548,6 +552,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
#define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0)
#define BPF_FIXUP_REG_MASK GENMASK(31, 27)
+#define DONT_CLEAR 17 /* RV_REG_A7 unused in pt_regmap */
bool ex_handler_bpf(const struct exception_table_entry *ex,
struct pt_regs *regs)
@@ -555,7 +560,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup);
int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup);
- *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
+ if (regs_offset != DONT_CLEAR)
+ *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
regs->epc = (unsigned long)&ex->fixup - offset;
return true;
@@ -572,7 +578,8 @@ static int add_exception_handler(const struct bpf_insn *insn,
off_t fixup_offset;
if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable ||
- (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX))
+ (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX &&
+ BPF_MODE(insn->code) != BPF_PROBE_MEM32))
return 0;
if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries))
@@ -622,6 +629,9 @@ static int add_exception_handler(const struct bpf_insn *insn,
ex->insn = ins_offset;
+ if (BPF_CLASS(insn->code) != BPF_LDX)
+ dst_reg = DONT_CLEAR;
+
ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) |
FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg);
ex->type = EX_TYPE_BPF;
@@ -1063,7 +1073,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
BPF_CLASS(insn->code) == BPF_JMP;
int s, e, rvoff, ret, i = insn - ctx->prog->insnsi;
struct bpf_prog_aux *aux = ctx->prog->aux;
- u8 rd = -1, rs = -1, code = insn->code;
+ u8 rd = -1, rs = -1, code = insn->code, reg_arena_vm_start = RV_REG_S7;
s16 off = insn->off;
s32 imm = insn->imm;
@@ -1539,6 +1549,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
+ /* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + S7 + off)*/
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
{
int insn_len, insns_start;
bool sign_ext;
@@ -1546,6 +1561,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
sign_ext = BPF_MODE(insn->code) == BPF_MEMSX ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX;
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) {
+ emit_add(RV_REG_T2, rs, reg_arena_vm_start, ctx);
+ rs = RV_REG_T2;
+ }
+
switch (BPF_SIZE(code)) {
case BPF_B:
if (is_12b_int(off)) {
@@ -1682,6 +1702,87 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
break;
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+ {
+ int insn_len, insns_start;
+
+ emit_add(RV_REG_T3, rd, reg_arena_vm_start, ctx);
+ rd = RV_REG_T3;
+
+ /* Load imm to a register then store it */
+ emit_imm(RV_REG_T1, imm, ctx);
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit(rv_sb(rd, off, RV_REG_T1), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T2, off, ctx);
+ emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
+ insn_len = ctx->ninsns - insns_start;
+
+ break;
+
+ case BPF_H:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit(rv_sh(rd, off, RV_REG_T1), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T2, off, ctx);
+ emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ case BPF_W:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit_sw(rd, off, RV_REG_T1, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T2, off, ctx);
+ emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ case BPF_DW:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit_sd(rd, off, RV_REG_T1, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T2, off, ctx);
+ emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ ret = add_exception_handler(insn, ctx, rd, insn_len);
+ if (ret)
+ return ret;
+
+ break;
+ }
+
/* STX: *(size *)(dst + off) = src */
case BPF_STX | BPF_MEM | BPF_B:
if (is_12b_int(off)) {
@@ -1728,6 +1829,83 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
emit_atomic(rd, rs, off, imm,
BPF_SIZE(code) == BPF_DW, ctx);
break;
+
+ case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
+ case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
+ {
+ int insn_len, insns_start;
+
+ emit_add(RV_REG_T2, rd, reg_arena_vm_start, ctx);
+ rd = RV_REG_T2;
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit(rv_sb(rd, off, rs), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T1, off, ctx);
+ emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit(rv_sb(RV_REG_T1, 0, rs), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ case BPF_H:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit(rv_sh(rd, off, rs), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T1, off, ctx);
+ emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit(rv_sh(RV_REG_T1, 0, rs), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ case BPF_W:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit_sw(rd, off, rs, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T1, off, ctx);
+ emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit_sw(RV_REG_T1, 0, rs, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ case BPF_DW:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit_sd(rd, off, rs, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T1, off, ctx);
+ emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit_sd(RV_REG_T1, 0, rs, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ ret = add_exception_handler(insn, ctx, rd, insn_len);
+ if (ret)
+ return ret;
+
+ break;
+ }
+
default:
pr_err("bpf-jit: unknown opcode %02x\n", code);
return -EINVAL;
@@ -1759,6 +1937,8 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog)
stack_adjust += 8;
if (seen_reg(RV_REG_S6, ctx))
stack_adjust += 8;
+ if (ctx->arena_vm_start)
+ stack_adjust += 8;
stack_adjust = round_up(stack_adjust, 16);
stack_adjust += bpf_stack_adjust;
@@ -1810,6 +1990,10 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog)
emit_sd(RV_REG_SP, store_offset, RV_REG_S6, ctx);
store_offset -= 8;
}
+ if (ctx->arena_vm_start) {
+ emit_sd(RV_REG_SP, store_offset, RV_REG_S7, ctx);
+ store_offset -= 8;
+ }
emit_addi(RV_REG_FP, RV_REG_SP, stack_adjust, ctx);
@@ -1823,6 +2007,9 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog)
emit_mv(RV_REG_TCC_SAVED, RV_REG_TCC, ctx);
ctx->stack_size = stack_adjust;
+
+ if (ctx->arena_vm_start)
+ emit_imm(RV_REG_S7, ctx->arena_vm_start, ctx);
}
void bpf_jit_build_epilogue(struct rv_jit_context *ctx)
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 6b3acac30c06..9ab739b9f9a2 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -80,6 +80,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
goto skip_init_ctx;
}
+ ctx->arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
ctx->prog = prog;
ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
if (!ctx->offset) {
--
2.40.1
LLVM generates the bpf_addr_space_cast instruction while translating
pointers between the native (zero) address space and
__attribute__((address_space(N))). addr_space=0 is reserved as the
bpf_arena address space.
rY = addr_space_cast(rX, 0, 1) is processed by the verifier and
converted to normal 32-bit move: wX = wY
rY = addr_space_cast(rX, 1, 0) has to be converted by JIT.
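The conversion boils down to the following, shown as standalone C (a
sketch; the patch below implements it with a zero-extend, an OR, and a
branch over the OR for the NULL case):

    #include <stdint.h>

    static uint64_t addr_space_cast_1_0(uint64_t rx,
                                        uint64_t user_vm_start)
    {
            uint64_t lo = (uint32_t)rx;     /* zero-extend low 32 bits */

            /* NULL must stay NULL; any other value gets the upper
             * 32 bits of user_vm_start ORed in.
             */
            return lo ? (((user_vm_start >> 32) << 32) | lo) : 0;
    }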
Signed-off-by: Puranjay Mohan <[email protected]>
---
arch/riscv/net/bpf_jit.h | 1 +
arch/riscv/net/bpf_jit_comp64.c | 14 ++++++++++++++
arch/riscv/net/bpf_jit_core.c | 1 +
3 files changed, 16 insertions(+)
diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index 8a47da08dd9c..5fc374ed98ea 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -82,6 +82,7 @@ struct rv_jit_context {
unsigned long flags;
int stack_size;
u64 arena_vm_start;
+ u64 user_vm_start;
};
/* Convert from ninsns to bytes. */
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 0c0588e327af..a5f049e72da2 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1083,6 +1083,15 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
/* dst = src */
case BPF_ALU | BPF_MOV | BPF_X:
case BPF_ALU64 | BPF_MOV | BPF_X:
+ if (insn_is_cast_user(insn)) {
+ emit_mv(RV_REG_T1, rs, ctx);
+ emit_zextw(RV_REG_T1, RV_REG_T1, ctx);
+ emit_imm(rd, (ctx->user_vm_start >> 32) << 32, ctx);
+ emit(rv_beq(RV_REG_T1, RV_REG_ZERO, 4), ctx);
+ emit_or(RV_REG_T1, rd, RV_REG_T1, ctx);
+ emit_mv(rd, RV_REG_T1, ctx);
+ break;
+ }
if (imm == 1) {
/* Special mov32 for zext */
emit_zextw(rd, rd, ctx);
@@ -2026,3 +2035,8 @@ bool bpf_jit_supports_ptr_xchg(void)
{
return true;
}
+
+bool bpf_jit_supports_arena(void)
+{
+ return true;
+}
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 9ab739b9f9a2..8a69d6d81e32 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -81,6 +81,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
ctx->arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
+ ctx->user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
ctx->prog = prog;
ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
if (!ctx->offset) {
--
2.40.1
Björn Töpel <[email protected]> writes:
> Puranjay Mohan <[email protected]> writes:
>
>> Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW]
>> instructions. They are similar to PROBE_MEM instructions with the
>> following differences:
>> - PROBE_MEM32 supports store.
>> - PROBE_MEM32 relies on the verifier to clear upper 32-bit of the
>> src/dst register
>> - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in S7
>> in the prologue). Due to bpf_arena constructions such S7 + reg +
>> off16 access is guaranteed to be within arena virtual range, so no
>> address check at run-time.
>> - S7 is a free callee-saved register, so it is used to store kern_vm_start
>> - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When
>> LDX faults the destination register is zeroed.
>>
>> To support these on riscv, we do tmp = S7 + src/dst reg and then use
>> tmp2 as the new src/dst register. This allows us to reuse most of the
>> code for normal [LDX | STX | ST].
>
> Cool to see the RV BPF JIT keeping up with x86 features! ;-) Nice work!
It is my self-proclaimed duty to make sure that all 64-bit JITs have
feature parity. :D
>
> A couple of minor comments below.
>
>> Signed-off-by: Puranjay Mohan <[email protected]>
>> ---
>> arch/riscv/net/bpf_jit.h | 1 +
>> arch/riscv/net/bpf_jit_comp64.c | 193 +++++++++++++++++++++++++++++++-
>> arch/riscv/net/bpf_jit_core.c | 1 +
>> 3 files changed, 192 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
>> index f4b6b3b9edda..8a47da08dd9c 100644
>> --- a/arch/riscv/net/bpf_jit.h
>> +++ b/arch/riscv/net/bpf_jit.h
>> @@ -81,6 +81,7 @@ struct rv_jit_context {
>> int nexentries;
>> unsigned long flags;
>> int stack_size;
>> + u64 arena_vm_start;
>> };
>>
>> /* Convert from ninsns to bytes. */
>> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
>> index 1adf2f39ce59..0c0588e327af 100644
>> --- a/arch/riscv/net/bpf_jit_comp64.c
>> +++ b/arch/riscv/net/bpf_jit_comp64.c
>> @@ -255,6 +255,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
>> emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx);
>> store_offset -= 8;
>> }
>> + if (ctx->arena_vm_start) {
>> + emit_ld(RV_REG_S7, store_offset, RV_REG_SP, ctx);
>> + store_offset -= 8;
>> + }
>>
>> emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx);
>> /* Set return value. */
>> @@ -548,6 +552,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
>>
>> #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0)
>> #define BPF_FIXUP_REG_MASK GENMASK(31, 27)
>> +#define DONT_CLEAR 17 /* RV_REG_A7 unused in pt_regmap */
>
> Hmm, so this is just a sentinel value, right? Isn't it more robust to
> use, say, REG_ZERO, which will never be used? Maybe REG_DONT_CLEAR_MARKER
> or smth, so it's obvious how it's used?
Yes, I agree, RV_REG_ZERO would be the best thing to use here.
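Something like this, I suppose (a sketch only; the comment is my reading
of why x0 is safe as a marker):

    /* The JIT never maps a BPF register to x0, so RV_REG_ZERO can never
     * name a real pt_regmap entry and is free to act as the sentinel.
     */
    #define REG_DONT_CLEAR_MARKER	0	/* RV_REG_ZERO */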
>
>
>> bool ex_handler_bpf(const struct exception_table_entry *ex,
>> struct pt_regs *regs)
>> @@ -555,7 +560,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
>> off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup);
>> int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup);
>>
>> - *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
>> + if (regs_offset != DONT_CLEAR)
>> + *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
>> regs->epc = (unsigned long)&ex->fixup - offset;
>>
>> return true;
>> @@ -572,7 +578,8 @@ static int add_exception_handler(const struct bpf_insn *insn,
>> off_t fixup_offset;
>>
>> if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable ||
>> - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX))
>> + (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX &&
>> + BPF_MODE(insn->code) != BPF_PROBE_MEM32))
>> return 0;
>>
>> if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries))
>> @@ -622,6 +629,9 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>
>> ex->insn = ins_offset;
>>
>> + if (BPF_CLASS(insn->code) != BPF_LDX)
>> + dst_reg = DONT_CLEAR;
>> +
>
> Instead of having a side-effect, and passing a dummy dst_reg for the
> probe_mem32, just explicitly add DONT_CLEAR when calling
> add_exception_handler(). It's more obvious to me at least.
Sure, will do that in the next version.
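Probably something along these lines at the ST/STX call sites (a sketch,
using the marker name suggested above):

    /* Stores have no destination register to clear on a fault, so
     * pass the sentinel explicitly rather than rewriting dst_reg
     * inside add_exception_handler().
     */
    ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, insn_len);
    if (ret)
            return ret;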
>
>> ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) |
>> FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg);
>> ex->type = EX_TYPE_BPF;
>> @@ -1063,7 +1073,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>> BPF_CLASS(insn->code) == BPF_JMP;
>> int s, e, rvoff, ret, i = insn - ctx->prog->insnsi;
>> struct bpf_prog_aux *aux = ctx->prog->aux;
>> - u8 rd = -1, rs = -1, code = insn->code;
>> + u8 rd = -1, rs = -1, code = insn->code, reg_arena_vm_start = RV_REG_S7;
>> s16 off = insn->off;
>> s32 imm = insn->imm;
>>
>> @@ -1539,6 +1549,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>> case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
>> case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
>> case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
>> + /* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + S7 + off)*/
>> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
>> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
>> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
>> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
>> {
>> int insn_len, insns_start;
>> bool sign_ext;
>> @@ -1546,6 +1561,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>> sign_ext = BPF_MODE(insn->code) == BPF_MEMSX ||
>> BPF_MODE(insn->code) == BPF_PROBE_MEMSX;
>>
>> + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) {
>> + emit_add(RV_REG_T2, rs, reg_arena_vm_start, ctx);
>> + rs = RV_REG_T2;
>> + }
>> +
>> switch (BPF_SIZE(code)) {
>> case BPF_B:
>> if (is_12b_int(off)) {
>> @@ -1682,6 +1702,87 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>> emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
>> break;
>>
>> + case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
>> + case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
>> + case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
>> + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
>> + {
>> + int insn_len, insns_start;
>> +
>> + emit_add(RV_REG_T3, rd, reg_arena_vm_start, ctx);
>> + rd = RV_REG_T3;
>> +
>> + /* Load imm to a register then store it */
>> + emit_imm(RV_REG_T1, imm, ctx);
>> +
>> + switch (BPF_SIZE(code)) {
>> + case BPF_B:
>> + if (is_12b_int(off)) {
>> + insns_start = ctx->ninsns;
>> + emit(rv_sb(rd, off, RV_REG_T1), ctx);
>> + insn_len = ctx->ninsns - insns_start;
>> + break;
>> + }
>> +
>> + emit_imm(RV_REG_T2, off, ctx);
>> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>> + insns_start = ctx->ninsns;
>> + emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
>> + insn_len = ctx->ninsns - insns_start;
>> +
>> + break;
>> +
>> + case BPF_H:
>> + if (is_12b_int(off)) {
>> + insns_start = ctx->ninsns;
>> + emit(rv_sh(rd, off, RV_REG_T1), ctx);
>> + insn_len = ctx->ninsns - insns_start;
>> + break;
>> + }
>> +
>> + emit_imm(RV_REG_T2, off, ctx);
>> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>> + insns_start = ctx->ninsns;
>> + emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
>> + insn_len = ctx->ninsns - insns_start;
>> + break;
>> + case BPF_W:
>> + if (is_12b_int(off)) {
>> + insns_start = ctx->ninsns;
>> + emit_sw(rd, off, RV_REG_T1, ctx);
>> + insn_len = ctx->ninsns - insns_start;
>> + break;
>> + }
>> +
>> + emit_imm(RV_REG_T2, off, ctx);
>> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>> + insns_start = ctx->ninsns;
>> + emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx);
>> + insn_len = ctx->ninsns - insns_start;
>> + break;
>> + case BPF_DW:
>> + if (is_12b_int(off)) {
>> + insns_start = ctx->ninsns;
>> + emit_sd(rd, off, RV_REG_T1, ctx);
>> + insn_len = ctx->ninsns - insns_start;
>> + break;
>> + }
>> +
>> + emit_imm(RV_REG_T2, off, ctx);
>> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>> + insns_start = ctx->ninsns;
>> + emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
>> + insn_len = ctx->ninsns - insns_start;
>> + break;
>> + }
>
> A lot of similar code, with emits of different sizes. Possible to move
> it out to a function, and wrap the emits? The main loop is hard to read
> already!
I thought about this as well. My plan is to refactor the whole thing in a
separate patch. I did not do it with this feature as it would cause a lot
of unrelated code churn.
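Roughly, I imagine a helper of this shape (a sketch only; the scratch
register must be a parameter because the ST path already uses T1 for the
immediate, and the helper name is made up):

    static void emit_store_sized(u8 size, u8 rd, s16 off, u8 rs, u8 tmp,
                                 int *insn_len, struct rv_jit_context *ctx)
    {
            int insns_start;

            if (!is_12b_int(off)) {
                    /* Offset does not fit in 12 bits: materialize it
                     * in the scratch register and fold it into rd.
                     */
                    emit_imm(tmp, off, ctx);
                    emit_add(tmp, tmp, rd, ctx);
                    rd = tmp;
                    off = 0;
            }

            /* Only the store itself may fault, so measure just it. */
            insns_start = ctx->ninsns;
            switch (size) {
            case BPF_B:
                    emit(rv_sb(rd, off, rs), ctx);
                    break;
            case BPF_H:
                    emit(rv_sh(rd, off, rs), ctx);
                    break;
            case BPF_W:
                    emit_sw(rd, off, rs, ctx);
                    break;
            case BPF_DW:
                    emit_sd(rd, off, rs, ctx);
                    break;
            }
            *insn_len = ctx->ninsns - insns_start;
    }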
Thanks,
Puranjay
On Mon, Mar 25, 2024 at 03:54:34PM +0000, Puranjay Mohan wrote:
> LLVM generates bpf_addr_space_cast instruction while translating
> pointers between native (zero) address space and
> __attribute__((address_space(N))). The addr_space=0 is reserved as
> bpf_arena address space.
>
> rY = addr_space_cast(rX, 0, 1) is processed by the verifier and
> converted to normal 32-bit move: wX = wY
>
> rY = addr_space_cast(rX, 1, 0) has to be converted by JIT.
>
> Signed-off-by: Puranjay Mohan <[email protected]>
Doesn't compile for allmodconfig:
../arch/riscv/net/bpf_jit_comp64.c:1086:7: error: call to undeclared function 'insn_is_cast_user'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
Cheers,
Conor.
On Mon, Mar 25, 2024 at 8:10 PM Conor Dooley <[email protected]> wrote:
>
> On Mon, Mar 25, 2024 at 03:54:34PM +0000, Puranjay Mohan wrote:
> > LLVM generates bpf_addr_space_cast instruction while translating
> > pointers between native (zero) address space and
> > __attribute__((address_space(N))). The addr_space=0 is reserved as
> > bpf_arena address space.
> >
> > rY = addr_space_cast(rX, 0, 1) is processed by the verifier and
> > converted to normal 32-bit move: wX = wY
> >
> > rY = addr_space_cast(rX, 1, 0) has to be converted by JIT.
> >
> > Signed-off-by: Puranjay Mohan <[email protected]>
>
> Doesn't compile for allmodconfig:
> ../arch/riscv/net/bpf_jit_comp64.c:1086:7: error: call to undeclared function 'insn_is_cast_user'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
>
> Cheers,
> Conor.
Yes,
I mentioned in the cover letter that a patch is required.
It just got merged in bpf-next/master:
https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=770546ae9f4c1ae1ebcaf0874f0dd9631d77ec97
So, rebasing on the latest bpf-next/master should fix the issue.
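For example, with bpf-next configured as a remote:

    git fetch bpf-next
    git rebase bpf-next/master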
Thanks,
Puranjay
On Mon, Mar 25, 2024 at 08:13:10PM +0100, Puranjay Mohan wrote:
> On Mon, Mar 25, 2024 at 8:10 PM Conor Dooley <[email protected]> wrote:
> >
> > On Mon, Mar 25, 2024 at 03:54:34PM +0000, Puranjay Mohan wrote:
> > > LLVM generates bpf_addr_space_cast instruction while translating
> > > pointers between native (zero) address space and
> > > __attribute__((address_space(N))). The addr_space=0 is reserved as
> > > bpf_arena address space.
> > >
> > > rY = addr_space_cast(rX, 0, 1) is processed by the verifier and
> > > converted to normal 32-bit move: wX = wY
> > >
> > > rY = addr_space_cast(rX, 1, 0) has to be converted by JIT.
> > >
> > > Signed-off-by: Puranjay Mohan <[email protected]>
> >
> > Doesn't compile for allmodconfig:
> > ../arch/riscv/net/bpf_jit_comp64.c:1086:7: error: call to undeclared function 'insn_is_cast_user'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> >
> > Cheers,
> > Conor.
>
> Yes,
> I mentioned in the cover letter that a patch is required.
> It just got merged in bpf-next/master:
> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=770546ae9f4c1ae1ebcaf0874f0dd9631d77ec97
>
> So, rebasing on the latest bpf-next/master should fix the issue.
Ah, I see now that there was a mention in the cover letter that I did
not see the first time around.
Bjorn, do you think there's anything we can do about these kinda
misleading CI failures for bpf stuff? Some stuff that touches bpf
definitely is worth us building, but should we try and build it on top
of the bpf tree instead?
On Mon, Mar 25, 2024 at 8:19 PM Conor Dooley <[email protected]> wrote:
>
> On Mon, Mar 25, 2024 at 08:13:10PM +0100, Puranjay Mohan wrote:
> > On Mon, Mar 25, 2024 at 8:10 PM Conor Dooley <[email protected]> wrote:
> > >
> > > On Mon, Mar 25, 2024 at 03:54:34PM +0000, Puranjay Mohan wrote:
> > > > LLVM generates bpf_addr_space_cast instruction while translating
> > > > pointers between native (zero) address space and
> > > > __attribute__((address_space(N))). The addr_space=0 is reserved as
> > > > bpf_arena address space.
> > > >
> > > > rY = addr_space_cast(rX, 0, 1) is processed by the verifier and
> > > > converted to normal 32-bit move: wX = wY
> > > >
> > > > rY = addr_space_cast(rX, 1, 0) has to be converted by JIT.
> > > >
> > > > Signed-off-by: Puranjay Mohan <[email protected]>
> > >
> > > Doesn't compile for allmodconfig:
> > > ../arch/riscv/net/bpf_jit_comp64.c:1086:7: error: call to undeclared function 'insn_is_cast_user'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> > >
> > > Cheers,
> > > Conor.
> >
> > Yes,
> > I mentioned in the cover letter that a patch is required.
> > It just got merged in bpf-next/master:
> > https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=770546ae9f4c1ae1ebcaf0874f0dd9631d77ec97
> >
> > So, rebasing on the latest bpf-next/master should fix the issue.
>
> Ah, I see now that there was a mention in the cover letter that I did
> not see first time around.
>
> Bjorn, do you think there's anything we can do about these kinda
> misleading CI failures for bpf stuff? Some stuff that touches bpf
> definitely is worth us building, but should we try and build it on top
> of the bpf tree instead?
Is there a separate CI for RISCV-related stuff? Is it public?
I would be interested in adding RISC-V support in
https://github.com/kernel-patches/bpf
Is someone already working on this?
Conor Dooley <[email protected]> writes:
> Bjorn, do you think there's anything we can do about these kinda
> misleading CI failures for bpf stuff? Some stuff that touches bpf
> definitely is worth us building, but should we try and build it on top
> of the bpf tree instead?
IMO: The way to go is enabling RV support in the BPF CI (I'll expand on
this in Puranjay's later mail), and ignoring BPF series in the RV
patchwork CI. I think having multiple trees in the RV CI is not worth
the pain...
Sort of related is that I think it could be worthwhile only building
series that had some human interaction (a pair of eyes -- "yes, this
does make sense to build"). Right now we're just building everything,
and we have to pay (money *and* time) for it.
..and then the BPF series would e.g. not be built at the RV PW CI.
(But mostly me thinking out loud! ;-))
Björn
Puranjay Mohan <[email protected]> writes:
> Is there a separate CI for RISCV related stuff? is it public?
>
> I would be interested in adding RISC-V support in
> https://github.com/kernel-patches/bpf
> Is someone already working on this?
+Cc Manu/Mykola/Nico, who are doing all of the awesome BPF CI work at Meta,
and can keep me honest. ;-)
I did some early hacks to add RISC-V support to the BPF CI, but
haven't had time to work on it recently. :-(
[1] https://github.com/libbpf/ci/pull/87
[2] https://github.com/kernel-patches/vmtest/pull/194
I've been talking recently to Lehui about it as well.
Two major things are missing:
1. Cross-compilation support (expand on [1])
2. Align the rootfs with what the other arches are using, to run the tests
on Qemu/TCG (and proper HW at some point!). RISC-V does not have
Debian Stable support, and would probably need Ubuntu or a Debian Sid
snapshot. Manu outlines some issues here:
https://github.com/libbpf/ci/pull/83
Having to manually run BPF tests ("non-official RISC-V BPF CI") is a
mess!
Björn
Puranjay Mohan <[email protected]> writes:
> Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW]
> instructions. They are similar to PROBE_MEM instructions with the
> following differences:
> - PROBE_MEM32 supports store.
> - PROBE_MEM32 relies on the verifier to clear upper 32-bit of the
> src/dst register
> - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in S7
> in the prologue). Due to bpf_arena constructions such S7 + reg +
> off16 access is guaranteed to be within arena virtual range, so no
> address check at run-time.
> - S7 is a free callee-saved register, so it is used to store kern_vm_start
> - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When
> LDX faults the destination register is zeroed.
>
> To support these on riscv, we do tmp = S7 + src/dst reg and then use
> tmp2 as the new src/dst register. This allows us to reuse most of the
> code for normal [LDX | STX | ST].
Cool to see the RV BPF JIT keeping up with x86 features! ;-) Nice work!
A couple of minor comments below.
> Signed-off-by: Puranjay Mohan <[email protected]>
> ---
> arch/riscv/net/bpf_jit.h | 1 +
> arch/riscv/net/bpf_jit_comp64.c | 193 +++++++++++++++++++++++++++++++-
> arch/riscv/net/bpf_jit_core.c | 1 +
> 3 files changed, 192 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
> index f4b6b3b9edda..8a47da08dd9c 100644
> --- a/arch/riscv/net/bpf_jit.h
> +++ b/arch/riscv/net/bpf_jit.h
> @@ -81,6 +81,7 @@ struct rv_jit_context {
> int nexentries;
> unsigned long flags;
> int stack_size;
> + u64 arena_vm_start;
> };
>
> /* Convert from ninsns to bytes. */
> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
> index 1adf2f39ce59..0c0588e327af 100644
> --- a/arch/riscv/net/bpf_jit_comp64.c
> +++ b/arch/riscv/net/bpf_jit_comp64.c
> @@ -255,6 +255,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
> emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx);
> store_offset -= 8;
> }
> + if (ctx->arena_vm_start) {
> + emit_ld(RV_REG_S7, store_offset, RV_REG_SP, ctx);
> + store_offset -= 8;
> + }
>
> emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx);
> /* Set return value. */
> @@ -548,6 +552,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
>
> #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0)
> #define BPF_FIXUP_REG_MASK GENMASK(31, 27)
> +#define DONT_CLEAR 17 /* RV_REG_A7 unused in pt_regmap */
Hmm, so this is just a sentinel value, right? Isn't it more robust to
use, say, REG_ZERO, which will never be used? Maybe REG_DONT_CLEAR_MARKER
or smth, so it's obvious how it's used?
> bool ex_handler_bpf(const struct exception_table_entry *ex,
> struct pt_regs *regs)
> @@ -555,7 +560,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
> off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup);
> int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup);
>
> - *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
> + if (regs_offset != DONT_CLEAR)
> + *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
> regs->epc = (unsigned long)&ex->fixup - offset;
>
> return true;
> @@ -572,7 +578,8 @@ static int add_exception_handler(const struct bpf_insn *insn,
> off_t fixup_offset;
>
> if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable ||
> - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX))
> + (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX &&
> + BPF_MODE(insn->code) != BPF_PROBE_MEM32))
> return 0;
>
> if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries))
> @@ -622,6 +629,9 @@ static int add_exception_handler(const struct bpf_insn *insn,
>
> ex->insn = ins_offset;
>
> + if (BPF_CLASS(insn->code) != BPF_LDX)
> + dst_reg = DONT_CLEAR;
> +
Instead of having a side effect and passing a dummy dst_reg for the
probe_mem32, just explicitly pass DONT_CLEAR when calling
add_exception_handler(). It's more obvious to me, at least.
> ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) |
> FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg);
> ex->type = EX_TYPE_BPF;
> @@ -1063,7 +1073,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
> BPF_CLASS(insn->code) == BPF_JMP;
> int s, e, rvoff, ret, i = insn - ctx->prog->insnsi;
> struct bpf_prog_aux *aux = ctx->prog->aux;
> - u8 rd = -1, rs = -1, code = insn->code;
> + u8 rd = -1, rs = -1, code = insn->code, reg_arena_vm_start = RV_REG_S7;
> s16 off = insn->off;
> s32 imm = insn->imm;
>
> @@ -1539,6 +1549,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
> case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
> case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
> case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
> + /* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + S7 + off)*/
> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
> {
> int insn_len, insns_start;
> bool sign_ext;
> @@ -1546,6 +1561,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
> sign_ext = BPF_MODE(insn->code) == BPF_MEMSX ||
> BPF_MODE(insn->code) == BPF_PROBE_MEMSX;
>
> + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) {
> + emit_add(RV_REG_T2, rs, reg_arena_vm_start, ctx);
> + rs = RV_REG_T2;
> + }
> +
> switch (BPF_SIZE(code)) {
> case BPF_B:
> if (is_12b_int(off)) {
> @@ -1682,6 +1702,87 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
> emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
> break;
>
> + case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
> + case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
> + case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
> + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
> + {
> + int insn_len, insns_start;
> +
> + emit_add(RV_REG_T3, rd, reg_arena_vm_start, ctx);
> + rd = RV_REG_T3;
> +
> + /* Load imm to a register then store it */
> + emit_imm(RV_REG_T1, imm, ctx);
> +
> + switch (BPF_SIZE(code)) {
> + case BPF_B:
> + if (is_12b_int(off)) {
> + insns_start = ctx->ninsns;
> + emit(rv_sb(rd, off, RV_REG_T1), ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
> +
> + emit_imm(RV_REG_T2, off, ctx);
> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
> + insns_start = ctx->ninsns;
> + emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
> + insn_len = ctx->ninsns - insns_start;
> +
> + break;
> +
> + case BPF_H:
> + if (is_12b_int(off)) {
> + insns_start = ctx->ninsns;
> + emit(rv_sh(rd, off, RV_REG_T1), ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
> +
> + emit_imm(RV_REG_T2, off, ctx);
> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
> + insns_start = ctx->ninsns;
> + emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + case BPF_W:
> + if (is_12b_int(off)) {
> + insns_start = ctx->ninsns;
> + emit_sw(rd, off, RV_REG_T1, ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
> +
> + emit_imm(RV_REG_T2, off, ctx);
> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
> + insns_start = ctx->ninsns;
> + emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + case BPF_DW:
> + if (is_12b_int(off)) {
> + insns_start = ctx->ninsns;
> + emit_sd(rd, off, RV_REG_T1, ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
> +
> + emit_imm(RV_REG_T2, off, ctx);
> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
> + insns_start = ctx->ninsns;
> + emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
A lot of similar code, with emits of different sizes. Possible to move
it out to a function, and wrap the emits? The main loop is hard to read
already!
> +
> + ret = add_exception_handler(insn, ctx, rd, insn_len);
> + if (ret)
> + return ret;
> +
> + break;
> + }
> +
> /* STX: *(size *)(dst + off) = src */
> case BPF_STX | BPF_MEM | BPF_B:
> if (is_12b_int(off)) {
> @@ -1728,6 +1829,83 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
> emit_atomic(rd, rs, off, imm,
> BPF_SIZE(code) == BPF_DW, ctx);
> break;
> +
> + case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
> + case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
> + case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
> + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
> + {
> + int insn_len, insns_start;
> +
> + emit_add(RV_REG_T2, rd, reg_arena_vm_start, ctx);
> + rd = RV_REG_T2;
> +
> + switch (BPF_SIZE(code)) {
> + case BPF_B:
> + if (is_12b_int(off)) {
> + insns_start = ctx->ninsns;
> + emit(rv_sb(rd, off, rs), ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
> +
> + emit_imm(RV_REG_T1, off, ctx);
> + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
> + insns_start = ctx->ninsns;
> + emit(rv_sb(RV_REG_T1, 0, rs), ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + case BPF_H:
> + if (is_12b_int(off)) {
> + insns_start = ctx->ninsns;
> + emit(rv_sh(rd, off, rs), ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
> +
> + emit_imm(RV_REG_T1, off, ctx);
> + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
> + insns_start = ctx->ninsns;
> + emit(rv_sh(RV_REG_T1, 0, rs), ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + case BPF_W:
> + if (is_12b_int(off)) {
> + insns_start = ctx->ninsns;
> + emit_sw(rd, off, rs, ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
> +
> + emit_imm(RV_REG_T1, off, ctx);
> + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
> + insns_start = ctx->ninsns;
> + emit_sw(RV_REG_T1, 0, rs, ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + case BPF_DW:
> + if (is_12b_int(off)) {
> + insns_start = ctx->ninsns;
> + emit_sd(rd, off, rs, ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
> +
> + emit_imm(RV_REG_T1, off, ctx);
> + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
> + insns_start = ctx->ninsns;
> + emit_sd(RV_REG_T1, 0, rs, ctx);
> + insn_len = ctx->ninsns - insns_start;
> + break;
> + }
Same comment as above.
Björn
Conor Dooley <[email protected]> writes:
>> Is there a separate CI for RISCV related stuff? is it public?
>
> It's based outta patchwork, just like the netdev/bpf stuff:
> https://patchwork.kernel.org/project/linux-riscv/list/
*salesman mode on*
..and more information on the CI (and source!) can be found here:
https://github.com/linux-riscv/github-ci/wiki
https://wiki.riseproject.dev/display/HOME/PoC+Github+Runners+on+GKE
Improvements are very much welcome. It's pretty much only Conor and me
doing best-effort hacking on the CI.
Björn
On Mon, Mar 25, 2024 at 08:23:18PM +0100, Puranjay Mohan wrote:
> On Mon, Mar 25, 2024 at 8:19 PM Conor Dooley <[email protected]> wrote:
> >
> > On Mon, Mar 25, 2024 at 08:13:10PM +0100, Puranjay Mohan wrote:
> > > On Mon, Mar 25, 2024 at 8:10 PM Conor Dooley <[email protected]> wrote:
> > > >
> > > > On Mon, Mar 25, 2024 at 03:54:34PM +0000, Puranjay Mohan wrote:
> > > > > LLVM generates bpf_addr_space_cast instruction while translating
> > > > > pointers between native (zero) address space and
> > > > > __attribute__((address_space(N))). The addr_space=0 is reserved as
> > > > > bpf_arena address space.
> > > > >
> > > > > rY = addr_space_cast(rX, 0, 1) is processed by the verifier and
> > > > > converted to normal 32-bit move: wX = wY
> > > > >
> > > > > rY = addr_space_cast(rX, 1, 0) has to be converted by JIT.
> > > > >
> > > > > Signed-off-by: Puranjay Mohan <[email protected]>
> > > >
> > > > Doesn't compile for allmodconfig:
> > > > ../arch/riscv/net/bpf_jit_comp64.c:1086:7: error: call to undeclared function 'insn_is_cast_user'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> > > >
> > > > Cheers,
> > > > Conor.
> > >
> > > Yes,
> > > I mentioned in the cover letter that a patch is required.
> > > It just got merged in bpf-next/master:
> > > https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=770546ae9f4c1ae1ebcaf0874f0dd9631d77ec97
> > >
> > > So, rebasing on the latest bpf-next/master should fix the issue.
> >
> > Ah, I see now that there was a mention in the cover letter that I did
> > not see first time around.
> >
> > Bjorn, do you think there's anything we can do about these kinda
> > misleading CI failures for bpf stuff? Some stuff that touches bpf
> > definitely is worth us building, but should we try and build it on top
> > of the bpf tree instead?
>
> Is there a separate CI for RISCV related stuff? is it public?
It's based outta patchwork, just like the netdev/bpf stuff:
https://patchwork.kernel.org/project/linux-riscv/list/
> I would be interested in adding RISC-V support in
> https://github.com/kernel-patches/bpf
> Is someone already working on this?
If anyone is, it is probably something Bjorn knows about!
On 2024/3/26 4:31, Björn Töpel wrote:
> Puranjay Mohan <[email protected]> writes:
>
>> Is there a separate CI for RISCV related stuff? is it public?
>>
>> I would be interested in adding RISC-V support in
>> https://github.com/kernel-patches/bpf
>> Is someone already working on this?
>
> +Cc Manu/Mykola/Nico who's doing all of the awesome BPF CI work at Meta,
> and can keep me honest. ;-)
>
> I did some early hacks to add RISC-V support to the BPF CI, but
> haven't had time to work on it recently. :-(
>
> [1] https://github.com/libbpf/ci/pull/87
> [2] https://github.com/kernel-patches/vmtest/pull/194
>
> I've been talking recently to Lehui about it as well.
>
> Two major things are missing:
>
> 1. Cross-compilation support (expand on [1])
> 2. Align the rootfs with what the other arch are using, to run the tests
> on Qemu/TCG (and proper HW at some point!). RISC-V does not have
> Debian Stable support, and would probably need Ubuntu or Debian Sid
> snapshot. Manu outlines some issues here:
> https://github.com/libbpf/ci/pull/83
Yeah, the issue below is fixed now and I think we can move forward:
https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/commit/?id=443574b033876c85a35de4c65c14f7fe092222b2
>
> Having to manually run BPF tests ("non-official RISC-V BPF CI") is a
> mess!
>
>
> Björn
On 2024/3/26 1:15, Puranjay Mohan wrote:
> Björn Töpel <[email protected]> writes:
>
>> Puranjay Mohan <[email protected]> writes:
>>
>>> Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW]
>>> instructions. They are similar to PROBE_MEM instructions with the
>>> following differences:
>>> - PROBE_MEM32 supports store.
>>> - PROBE_MEM32 relies on the verifier to clear upper 32-bit of the
>>> src/dst register
>>> - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in S7
>>> in the prologue). Due to bpf_arena constructions such S7 + reg +
>>> off16 access is guaranteed to be within arena virtual range, so no
>>> address check at run-time.
>>> - S7 is a free callee-saved register, so it is used to store kern_vm_start
>>> - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When
>>> LDX faults the destination register is zeroed.
>>>
>>> To support these on riscv, we do tmp = S7 + src/dst reg and then use
>>> tmp2 as the new src/dst register. This allows us to reuse most of the
>>> code for normal [LDX | STX | ST].
>>
>> Cool to see the RV BPF JIT keeping up with x86 features! ;-) Nice work!
>
> It is my self proclaimed duty to make sure that all 64-bit JITs have
> feature parity. :D
>
>>
>> A couple of minor comments below.
>>
>>> Signed-off-by: Puranjay Mohan <[email protected]>
>>> ---
>>> arch/riscv/net/bpf_jit.h | 1 +
>>> arch/riscv/net/bpf_jit_comp64.c | 193 +++++++++++++++++++++++++++++++-
>>> arch/riscv/net/bpf_jit_core.c | 1 +
>>> 3 files changed, 192 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
>>> index f4b6b3b9edda..8a47da08dd9c 100644
>>> --- a/arch/riscv/net/bpf_jit.h
>>> +++ b/arch/riscv/net/bpf_jit.h
>>> @@ -81,6 +81,7 @@ struct rv_jit_context {
>>> int nexentries;
>>> unsigned long flags;
>>> int stack_size;
>>> + u64 arena_vm_start;
>>> };
>>>
>>> /* Convert from ninsns to bytes. */
>>> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
>>> index 1adf2f39ce59..0c0588e327af 100644
>>> --- a/arch/riscv/net/bpf_jit_comp64.c
>>> +++ b/arch/riscv/net/bpf_jit_comp64.c
>>> @@ -255,6 +255,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
>>> emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx);
>>> store_offset -= 8;
>>> }
>>> + if (ctx->arena_vm_start) {
>>> + emit_ld(RV_REG_S7, store_offset, RV_REG_SP, ctx);
>>> + store_offset -= 8;
>>> + }
As RV_REG_S7 is only used for bpf arena, how about defining this register
as below, like RV_REG_TCC:
#define RV_REG_ARENA RV_REG_S7
>>>
>>> emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx);
>>> /* Set return value. */
>>> @@ -548,6 +552,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
>>>
>>> #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0)
>>> #define BPF_FIXUP_REG_MASK GENMASK(31, 27)
>>> +#define DONT_CLEAR 17 /* RV_REG_A7 unused in pt_regmap */
>>
>> Hmm, so this is just a a sentinel node, right? Isn't it more robust to
>> use, say REG_ZERO which will never be used? Maybe REG_DONT_CLEAR_MARKER
>> or smth, so it's obvious how it's used?
>
> Yes, I agree, RV_REG_ZERO would be the best thing to use here.
>
>>
>>
>>> bool ex_handler_bpf(const struct exception_table_entry *ex,
>>> struct pt_regs *regs)
>>> @@ -555,7 +560,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
>>> off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup);
>>> int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup);
>>>
>>> - *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
>>> + if (regs_offset != DONT_CLEAR)
>>> + *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
>>> regs->epc = (unsigned long)&ex->fixup - offset;
>>>
>>> return true;
>>> @@ -572,7 +578,8 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>> off_t fixup_offset;
>>>
>>> if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable ||
>>> - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX))
>>> + (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX &&
>>> + BPF_MODE(insn->code) != BPF_PROBE_MEM32))
>>> return 0;
>>>
>>> if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries))
>>> @@ -622,6 +629,9 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>>
>>> ex->insn = ins_offset;
>>>
>>> + if (BPF_CLASS(insn->code) != BPF_LDX)
>>> + dst_reg = DONT_CLEAR;
>>> +
>>
>> Instead of having a side-effect, and passing a dummy dst_reg for the
>> probe_mem32, just explicitly add DONT_CLEAR when calling
>> add_exception_handler(). It's more obvious to me at least.
>
> Sure, will do that in the next version.
>
>>
>>> ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) |
>>> FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg);
>>> ex->type = EX_TYPE_BPF;
>>> @@ -1063,7 +1073,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>>> BPF_CLASS(insn->code) == BPF_JMP;
>>> int s, e, rvoff, ret, i = insn - ctx->prog->insnsi;
>>> struct bpf_prog_aux *aux = ctx->prog->aux;
>>> - u8 rd = -1, rs = -1, code = insn->code;
>>> + u8 rd = -1, rs = -1, code = insn->code, reg_arena_vm_start = RV_REG_S7;
>>> s16 off = insn->off;
>>> s32 imm = insn->imm;
>>>
>>> @@ -1539,6 +1549,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>>> case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
>>> case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
>>> case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
>>> + /* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + S7 + off)*/
>>> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
>>> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
>>> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
>>> + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
>>> {
>>> int insn_len, insns_start;
>>> bool sign_ext;
>>> @@ -1546,6 +1561,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>>> sign_ext = BPF_MODE(insn->code) == BPF_MEMSX ||
>>> BPF_MODE(insn->code) == BPF_PROBE_MEMSX;
>>>
>>> + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) {
>>> + emit_add(RV_REG_T2, rs, reg_arena_vm_start, ctx);
>>> + rs = RV_REG_T2;
>>> + }
>>> +
>>> switch (BPF_SIZE(code)) {
>>> case BPF_B:
>>> if (is_12b_int(off)) {
>>> @@ -1682,6 +1702,87 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>>> emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
>>> break;
>>>
>>> + case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
>>> + case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
>>> + case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
>>> + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
>>> + {
>>> + int insn_len, insns_start;
>>> +
>>> + emit_add(RV_REG_T3, rd, reg_arena_vm_start, ctx);
>>> + rd = RV_REG_T3;
>>> +
>>> + /* Load imm to a register then store it */
>>> + emit_imm(RV_REG_T1, imm, ctx);
>>> +
>>> + switch (BPF_SIZE(code)) {
>>> + case BPF_B:
>>> + if (is_12b_int(off)) {
>>> + insns_start = ctx->ninsns;
>>> + emit(rv_sb(rd, off, RV_REG_T1), ctx);
>>> + insn_len = ctx->ninsns - insns_start;
>>> + break;
>>> + }
>>> +
>>> + emit_imm(RV_REG_T2, off, ctx);
>>> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>>> + insns_start = ctx->ninsns;
>>> + emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
>>> + insn_len = ctx->ninsns - insns_start;
>>> +
>>> + break;
>>> +
>>> + case BPF_H:
>>> + if (is_12b_int(off)) {
>>> + insns_start = ctx->ninsns;
>>> + emit(rv_sh(rd, off, RV_REG_T1), ctx);
>>> + insn_len = ctx->ninsns - insns_start;
>>> + break;
>>> + }
>>> +
>>> + emit_imm(RV_REG_T2, off, ctx);
>>> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>>> + insns_start = ctx->ninsns;
>>> + emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
>>> + insn_len = ctx->ninsns - insns_start;
>>> + break;
>>> + case BPF_W:
>>> + if (is_12b_int(off)) {
>>> + insns_start = ctx->ninsns;
>>> + emit_sw(rd, off, RV_REG_T1, ctx);
>>> + insn_len = ctx->ninsns - insns_start;
>>> + break;
>>> + }
>>> +
>>> + emit_imm(RV_REG_T2, off, ctx);
>>> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>>> + insns_start = ctx->ninsns;
>>> + emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx);
>>> + insn_len = ctx->ninsns - insns_start;
>>> + break;
>>> + case BPF_DW:
>>> + if (is_12b_int(off)) {
>>> + insns_start = ctx->ninsns;
>>> + emit_sd(rd, off, RV_REG_T1, ctx);
>>> + insn_len = ctx->ninsns - insns_start;
>>> + break;
>>> + }
>>> +
>>> + emit_imm(RV_REG_T2, off, ctx);
>>> + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>>> + insns_start = ctx->ninsns;
>>> + emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
>>> + insn_len = ctx->ninsns - insns_start;
>>> + break;
>>> + }
>>
>> A lot of similar code, with emits of different sizes. Possible to move
>> it out to a function, and wrap the emits? The main loop is hard to read
>> already!
>
> I thought about this as well. My plan is to refactor the whole thing in a
> separate patch. I did not do it with this feature as it will cause a lot
> of unrelated code churn.
Yeah, I think we could do that factoring out for LDX, ST, and STX; I had
done it before for another riscv bpf arena attempt. But looking forward
to your implementation. :)
>
> Thanks,
> Puranjay