State of a register doesn't matter if it wasn't read in reaching an exit;
a write screens off all reads downstream of it from all explored_states
upstream of it.
This allows us to prune many more branches; here are some processed insn
counts for some Cilium programs:
Program before after
bpf_lb_opt_-DLB_L3.o 6515 3361
bpf_lb_opt_-DLB_L4.o 8976 5176
bpf_lb_opt_-DUNKNOWN.o 2960 1137
bpf_lxc_opt_-DDROP_ALL.o 95412 48537
bpf_lxc_opt_-DUNKNOWN.o 141706 78718
bpf_netdev.o 24251 17995
bpf_overlay.o 10999 9385
The runtime is also improved; here are 'time' results in ms:
Program before after
bpf_lb_opt_-DLB_L3.o 24 6
bpf_lb_opt_-DLB_L4.o 26 11
bpf_lb_opt_-DUNKNOWN.o 11 2
bpf_lxc_opt_-DDROP_ALL.o 1288 139
bpf_lxc_opt_-DUNKNOWN.o 1768 234
bpf_netdev.o 62 31
bpf_overlay.o 15 13
Signed-off-by: Edward Cree <[email protected]>
---
v3: cleaner code around marking caller-saved regs, again spotted by Daniel
Borkmann. Should have no functional effect.
v2: update liveness in LD_ABS|IND, as pointed out by Daniel Borkmann. The
numbers are mostly unchanged; bpf_lxc_opt_-DUNKNOWN.o dropped about 300
insns and 20ms, while bpf_lxc_opt_-DDROP_ALL.o (despite not changing its
#insns) also dropped 13ms.
include/linux/bpf_verifier.h | 11 ++-
kernel/bpf/verifier.c | 189 +++++++++++++++++++++++++++++++++----------
2 files changed, 156 insertions(+), 44 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c61c3033..91d07ef 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -21,6 +21,12 @@
*/
#define BPF_MAX_VAR_SIZ INT_MAX
+/* Liveness of a register (or spilled stack slot) within the current branch,
+ * used for state pruning: if a register was never read in reaching an exit,
+ * its value in an explored state cannot matter, and a write screens off all
+ * downstream reads from upstream states.
+ */
+enum bpf_reg_liveness {
+ REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
+ REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
+ REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
+};
+
struct bpf_reg_state {
enum bpf_reg_type type;
union {
@@ -40,7 +46,7 @@ struct bpf_reg_state {
* came from, when one is tested for != NULL.
*/
u32 id;
- /* These five fields must be last. See states_equal() */
+ /* Ordering of fields matters. See states_equal() */
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
* the actual value.
* For pointer types, this represents the variable part of the offset
@@ -57,6 +63,8 @@ struct bpf_reg_state {
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
+ /* This field must be last, for states_equal() reasons. */
+ enum bpf_reg_liveness live;
};
enum bpf_stack_slot_type {
@@ -74,6 +82,7 @@ struct bpf_verifier_state {
struct bpf_reg_state regs[MAX_BPF_REG];
u8 stack_slot_type[MAX_BPF_STACK];
struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
+ struct bpf_verifier_state *parent;
};
/* linked list of verifier states used to prune search */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ecc590e..7dd96d0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -629,8 +629,10 @@ static void init_reg_state(struct bpf_reg_state *regs)
{
int i;
- for (i = 0; i < MAX_BPF_REG; i++)
+ for (i = 0; i < MAX_BPF_REG; i++) {
mark_reg_not_init(regs, i);
+ regs[i].live = REG_LIVE_NONE;
+ }
/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK;
@@ -647,9 +649,26 @@ enum reg_arg_type {
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
+/* Record a read of register @regno in @state: walk up the parentage chain,
+ * setting REG_LIVE_READ on each ancestor whose value could reach this read.
+ * The walk stops at the first state that wrote the register, since a write
+ * (REG_LIVE_WRITTEN) screens off all ancestors above it.
+ */
+static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
+{
+ struct bpf_verifier_state *parent = state->parent;
+
+ while (parent) {
+ /* if read wasn't screened by an earlier write ... */
+ if (state->regs[regno].live & REG_LIVE_WRITTEN)
+ break;
+ /* ... then we depend on parent's value */
+ parent->regs[regno].live |= REG_LIVE_READ;
+ state = parent;
+ parent = state->parent;
+ }
+}
+
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+
if (regno >= MAX_BPF_REG) {
verbose("R%d is invalid\n", regno);
return -EINVAL;
@@ -661,12 +680,14 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
+ mark_reg_read(&env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose("frame pointer is read only\n");
return -EACCES;
}
+ regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
mark_reg_unknown(regs, regno);
}
@@ -695,7 +716,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
static int check_stack_write(struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
- int i;
+ int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
@@ -710,15 +731,14 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
}
/* save register state */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- state->regs[value_regno];
+ state->spilled_regs[spi] = state->regs[value_regno];
+ state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
} else {
/* regular write of data into stack */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct bpf_reg_state) {};
+ state->spilled_regs[spi] = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -726,11 +746,26 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
return 0;
}
+/* Stack-slot counterpart of mark_reg_read(): record a read of spilled
+ * register slot @slot, propagating REG_LIVE_READ up the parentage chain
+ * until screened off by a state that wrote the slot (REG_LIVE_WRITTEN).
+ * @slot indexes spilled_regs[], i.e. (MAX_BPF_STACK + off) / BPF_REG_SIZE.
+ */
+static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot)
+{
+ struct bpf_verifier_state *parent = state->parent;
+
+ while (parent) {
+ /* if read wasn't screened by an earlier write ... */
+ if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+ break;
+ /* ... then we depend on parent's value */
+ parent->spilled_regs[slot].live |= REG_LIVE_READ;
+ state = parent;
+ parent = state->parent;
+ }
+}
+
static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
- int i;
+ int i, spi;
slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
@@ -746,10 +781,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
}
- if (value_regno >= 0)
+ spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+
+ if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] =
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ state->regs[value_regno] = state->spilled_regs[spi];
+ mark_stack_slot_read(state, spi);
+ }
return 0;
} else {
for (i = 0; i < size; i++) {
@@ -1167,7 +1205,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
@@ -1177,12 +1214,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -1297,10 +1334,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_DONTCARE)
return 0;
- if (type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
- return -EACCES;
- }
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
@@ -1639,10 +1675,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
}
/* reset caller saved regs */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
- /* update return register */
+ /* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
mark_reg_unknown(regs, BPF_REG_0);
@@ -2250,7 +2288,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2261,7 +2299,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
@@ -2274,7 +2312,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
@@ -2285,7 +2323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
@@ -2328,7 +2366,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
@@ -2339,7 +2377,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2360,7 +2398,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
@@ -2717,7 +2755,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
@@ -2734,7 +2772,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2851,7 +2889,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
@@ -2917,7 +2955,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_6, SRC_OP);
if (err)
return err;
@@ -2928,17 +2966,20 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (mode == BPF_IND) {
/* check explicit source operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
}
/* reset caller saved regs to unreadable */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
/* mark destination R0 register as readable, since it contains
- * the value fetched from the packet
+ * the value fetched from the packet.
+ * Already marked as written above.
*/
mark_reg_unknown(regs, BPF_REG_0);
return 0;
@@ -3194,7 +3235,11 @@ static bool regsafe(struct bpf_reg_state *rold,
struct bpf_reg_state *rcur,
bool varlen_map_access, struct idpair *idmap)
{
- if (memcmp(rold, rcur, sizeof(*rold)) == 0)
+ if (!(rold->live & REG_LIVE_READ))
+ /* explored state didn't use this */
+ return true;
+
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0)
return true;
if (rold->type == NOT_INIT)
@@ -3372,10 +3417,56 @@ static bool states_equal(struct bpf_verifier_env *env,
return ret;
}
+/* Copy REG_LIVE_READ marks from @state's registers and spilled stack slots
+ * into @parent, one step.  Returns true if any new mark was set, so the
+ * caller (propagate_liveness()) knows the change must be fed further up the
+ * parentage chain.  A slot is only propagated when it is a spill in both
+ * states, since otherwise the reg state isn't comparable anyway.
+ */
+static bool do_propagate_liveness(const struct bpf_verifier_state *state,
+				  struct bpf_verifier_state *parent)
+{
+	bool touched = false; /* any changes made? */
+	int i;
+
+	if (!parent)
+		return touched;
+	/* Propagate read liveness of registers... */
+	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+	/* We don't need to worry about FP liveness because it's read-only */
+	for (i = 0; i < BPF_REG_FP; i++) {
+		if (parent->regs[i].live & REG_LIVE_READ)
+			continue;
+		if (state->regs[i].live == REG_LIVE_READ) {
+			parent->regs[i].live |= REG_LIVE_READ;
+			touched = true;
+		}
+	}
+	/* ... and stack slots */
+	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
+		if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+			continue;
+		if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+			continue;
+		if (parent->spilled_regs[i].live & REG_LIVE_READ)
+			continue;
+		if (state->spilled_regs[i].live == REG_LIVE_READ) {
+			/* Fix: mark the parent's spilled slot, not its register
+			 * file.  Writing parent->regs[i] here was both the wrong
+			 * object and an out-of-bounds store, since this loop runs
+			 * i up to MAX_BPF_STACK / BPF_REG_SIZE (64) while regs[]
+			 * has only MAX_BPF_REG (11) entries.
+			 */
+			parent->spilled_regs[i].live |= REG_LIVE_READ;
+			touched = true;
+		}
+	}
+	return touched;
+}
+
+/* Propagate read liveness from @state up through the parentage chain.
+ * NOTE: @parent here means "a state whose output registers are @state's
+ * inputs", which on the first iteration need not be state->parent: when
+ * pruning, is_state_visited() passes the matched explored state as @state
+ * and the (mutable) cur_state as @parent, since the explored state's
+ * continuations' reads must also count as reads by the current path.
+ * Subsequent iterations then walk the real state->parent links until
+ * do_propagate_liveness() reports no further changes.
+ */
+static void propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ while (do_propagate_liveness(state, parent)) {
+ /* Something changed, so we need to feed those changes onward */
+ state = parent;
+ parent = state->parent;
+ }
+}
+
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
+ int i;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3385,11 +3476,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state))
+ if (states_equal(env, &sl->state, &env->cur_state)) {
/* reached equivalent register/stack state,
- * prune the search
+ * prune the search.
+ * Registers read by the continuation are read by us.
*/
+ propagate_liveness(&sl->state, &env->cur_state);
return 1;
+ }
sl = sl->next;
}
@@ -3407,6 +3501,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
+ /* connect new state to parentage chain */
+ env->cur_state.parent = &new_sl->state;
+ /* clear liveness marks in current state */
+ for (i = 0; i < BPF_REG_FP; i++)
+ env->cur_state.regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
+ if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
+ env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
return 0;
}
@@ -3430,6 +3532,7 @@ static int do_check(struct bpf_verifier_env *env)
bool do_print_state = false;
init_reg_state(regs);
+ state->parent = NULL;
insn_idx = 0;
env->varlen_map_value_access = false;
for (;;) {
@@ -3500,11 +3603,11 @@ static int do_check(struct bpf_verifier_env *env)
/* check for reserved fields is already done */
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
@@ -3554,11 +3657,11 @@ static int do_check(struct bpf_verifier_env *env)
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -3589,7 +3692,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -3643,7 +3746,7 @@ static int do_check(struct bpf_verifier_env *env)
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
if (err)
return err;
On 08/15/2017 09:34 PM, Edward Cree wrote:
> State of a register doesn't matter if it wasn't read in reaching an exit;
> a write screens off all reads downstream of it from all explored_states
> upstream of it.
> This allows us to prune many more branches; here are some processed insn
> counts for some Cilium programs:
> Program before after
> bpf_lb_opt_-DLB_L3.o 6515 3361
> bpf_lb_opt_-DLB_L4.o 8976 5176
> bpf_lb_opt_-DUNKNOWN.o 2960 1137
> bpf_lxc_opt_-DDROP_ALL.o 95412 48537
> bpf_lxc_opt_-DUNKNOWN.o 141706 78718
> bpf_netdev.o 24251 17995
> bpf_overlay.o 10999 9385
>
> The runtime is also improved; here are 'time' results in ms:
> Program before after
> bpf_lb_opt_-DLB_L3.o 24 6
> bpf_lb_opt_-DLB_L4.o 26 11
> bpf_lb_opt_-DUNKNOWN.o 11 2
> bpf_lxc_opt_-DDROP_ALL.o 1288 139
> bpf_lxc_opt_-DUNKNOWN.o 1768 234
> bpf_netdev.o 62 31
> bpf_overlay.o 15 13
>
> Signed-off-by: Edward Cree <[email protected]>
Acked-by: Daniel Borkmann <[email protected]>
From: Daniel Borkmann <[email protected]>
Date: Wed, 16 Aug 2017 00:12:58 +0200
> On 08/15/2017 09:34 PM, Edward Cree wrote:
>> State of a register doesn't matter if it wasn't read in reaching an
>> exit;
>> a write screens off all reads downstream of it from all
>> explored_states
>> upstream of it.
>> This allows us to prune many more branches; here are some processed
>> insn
>> counts for some Cilium programs:
>> Program before after
>> bpf_lb_opt_-DLB_L3.o 6515 3361
>> bpf_lb_opt_-DLB_L4.o 8976 5176
>> bpf_lb_opt_-DUNKNOWN.o 2960 1137
>> bpf_lxc_opt_-DDROP_ALL.o 95412 48537
>> bpf_lxc_opt_-DUNKNOWN.o 141706 78718
>> bpf_netdev.o 24251 17995
>> bpf_overlay.o 10999 9385
>>
>> The runtime is also improved; here are 'time' results in ms:
>> Program before after
>> bpf_lb_opt_-DLB_L3.o 24 6
>> bpf_lb_opt_-DLB_L4.o 26 11
>> bpf_lb_opt_-DUNKNOWN.o 11 2
>> bpf_lxc_opt_-DDROP_ALL.o 1288 139
>> bpf_lxc_opt_-DUNKNOWN.o 1768 234
>> bpf_netdev.o 62 31
>> bpf_overlay.o 15 13
>>
>> Signed-off-by: Edward Cree <[email protected]>
>
> Acked-by: Daniel Borkmann <[email protected]>
Applied, nice work Edward.
On 8/15/17 12:34 PM, Edward Cree wrote:
> State of a register doesn't matter if it wasn't read in reaching an exit;
> a write screens off all reads downstream of it from all explored_states
> upstream of it.
> This allows us to prune many more branches; here are some processed insn
> counts for some Cilium programs:
> Program before after
> bpf_lb_opt_-DLB_L3.o 6515 3361
> bpf_lb_opt_-DLB_L4.o 8976 5176
> bpf_lb_opt_-DUNKNOWN.o 2960 1137
> bpf_lxc_opt_-DDROP_ALL.o 95412 48537
> bpf_lxc_opt_-DUNKNOWN.o 141706 78718
> bpf_netdev.o 24251 17995
> bpf_overlay.o 10999 9385
>
> The runtime is also improved; here are 'time' results in ms:
> Program before after
> bpf_lb_opt_-DLB_L3.o 24 6
> bpf_lb_opt_-DLB_L4.o 26 11
> bpf_lb_opt_-DUNKNOWN.o 11 2
> bpf_lxc_opt_-DDROP_ALL.o 1288 139
> bpf_lxc_opt_-DUNKNOWN.o 1768 234
> bpf_netdev.o 62 31
> bpf_overlay.o 15 13
>
> Signed-off-by: Edward Cree <[email protected]>
this is one ingenious hack. Love it!
It took me a whole day to understand most of it, but I still have
a few questions:
> +
> +static void propagate_liveness(const struct bpf_verifier_state *state,
> + struct bpf_verifier_state *parent)
here the name 'parent' is very confusing, since for the first
iteration of the loop below it transfers lives from 'neighbor'
state to the current state and only then traverses the link
of parents in the current.
Would be good to document it, since I was struggling the most
with this name until I realized that the way you build parent link list
in is_state_visited() is actual sequence of roughly basic blocks and
the name 'parent' applies there, but not for the first iteration
of this function.
> @@ -3407,6 +3501,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
> memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
> new_sl->next = env->explored_states[insn_idx];
> env->explored_states[insn_idx] = new_sl;
> + /* connect new state to parentage chain */
> + env->cur_state.parent = &new_sl->state;
> + /* clear liveness marks in current state */
> + for (i = 0; i < BPF_REG_FP; i++)
> + env->cur_state.regs[i].live = REG_LIVE_NONE;
> + for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
> + if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
> + env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
and this part I don't get at all.
It seems you're trying to sort-of do per-fake-basic block liveness
analysis, but our state_list_marks are not correct if we go with
canonical basic block definition, since we mark the jump insn and
not insn after the branch and not every basic block boundary is
properly detected.
So if algorithm should only work for basic blocks (for sequences of
instructions without control flow changes) then it's broken.
If it should work with control flow insns then it should also work
for the whole chain of insns from the first one till bpf_exit...
So I tried removing two above clearing loops and results are much
better:
before after
bpf_lb-DLB_L3.o 2604 1120
bpf_lb-DLB_L4.o 11159 1371
bpf_lb-DUNKNOWN.o 1116 485
bpf_lxc-DDROP_ALL.o 34566 12758
bpf_lxc-DUNKNOWN.o 53267 18337
bpf_netdev.o 17843 10564
bpf_overlay.o 8672 5513
but it feels too good to be true and probably not correct.
So either way we need to fix something it seems.
On 18/08/17 04:21, Alexei Starovoitov wrote:
> On 8/15/17 12:34 PM, Edward Cree wrote:
>> State of a register doesn't matter if it wasn't read in reaching an exit;
>> a write screens off all reads downstream of it from all explored_states
>> upstream of it.
>> This allows us to prune many more branches; here are some processed insn
>> counts for some Cilium programs:
>> Program before after
>> bpf_lb_opt_-DLB_L3.o 6515 3361
>> bpf_lb_opt_-DLB_L4.o 8976 5176
>> bpf_lb_opt_-DUNKNOWN.o 2960 1137
>> bpf_lxc_opt_-DDROP_ALL.o 95412 48537
>> bpf_lxc_opt_-DUNKNOWN.o 141706 78718
>> bpf_netdev.o 24251 17995
>> bpf_overlay.o 10999 9385
>>
>> The runtime is also improved; here are 'time' results in ms:
>> Program before after
>> bpf_lb_opt_-DLB_L3.o 24 6
>> bpf_lb_opt_-DLB_L4.o 26 11
>> bpf_lb_opt_-DUNKNOWN.o 11 2
>> bpf_lxc_opt_-DDROP_ALL.o 1288 139
>> bpf_lxc_opt_-DUNKNOWN.o 1768 234
>> bpf_netdev.o 62 31
>> bpf_overlay.o 15 13
>>
>> Signed-off-by: Edward Cree <[email protected]>
>
> this is one ingenious hack. Love it!
> I took me whole day to understand most of it, but I still have
> few questions:
>
>> +
>> +static void propagate_liveness(const struct bpf_verifier_state *state,
>> + struct bpf_verifier_state *parent)
>
> here the name 'parent' is very confusing, since for the first
> iteration of the loop below it transfers lives from 'neighbor'
> state to the current state and only then traverses the link
> of parents in the current.
> Would be good to document it, since I was struggling the most
> with this name until I realized that the way you build parent link list
> in is_state_visited() is actual sequence of roughly basic blocks and
> the name 'parent' applies there, but not for the first iteration
> of this function.
In the way I think about it, it really is a parent, by which I mean "a
block whose output registers are our inputs". When we call
propagate_liveness(), we're saying "here's another block whose outputs
could be our inputs". So while the 'state->parent' datastructure is a
linked list, the 'parentage' relationship is really a DAG.
While 'state->parent' always has to point at an explored_state (which are
roughly immutable), the 'parent' we pass into propagate_liveness is just
env->cur_state which is mutable. The weird "it's not actually
state->parent" comes from (a) there's only room for one state->parent and
(b) we didn't create a new_sl for it because we're pruning.
I agree this is not at all explained in the code or comments, except for
the glib "Registers read by the continuation are read by us". I will try
to write some comments and/or documentation explaining how and why the
liveness tracking works, because it _is_ subtle and a week from now _I_
probably won't understand it either.
>> @@ -3407,6 +3501,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
>> memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
>> new_sl->next = env->explored_states[insn_idx];
>> env->explored_states[insn_idx] = new_sl;
>> + /* connect new state to parentage chain */
>> + env->cur_state.parent = &new_sl->state;
>> + /* clear liveness marks in current state */
>> + for (i = 0; i < BPF_REG_FP; i++)
>> + env->cur_state.regs[i].live = REG_LIVE_NONE;
>> + for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
>> + if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
>> + env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
>
> and this part I don't get at all.
The idea behind the liveness marks is that 'writes go down and reads go up
and up'. That is, each state 'tells' its parent state 'I read from you'
(which can then end up recursing), but it remembers for itself that it
wrote to a register and therefore should swallow subsequent reads rather
than forwarding them to its parent.
While a block is being walked, its liveness marks _only_ record writes, and
then only writes _made in that block_. The read marks go to the parent,
which is "some block that has been walked", but whose continuations haven't
all been walked yet so (by looplessness) won't show up as a pruning
opportunity. An explored_state's liveness marks record the writes _done in
reaching that state_ from its parent, but the reads _done by the state's
children_. A cur_state's liveness marks do the same, but it doesn't have
any children yet so it never gets read-marks (until it gets turned into an
explored_state, of course).
We clear our liveness marks because the writes our parent block did are not
writes we did, so they don't screen off our reads from our parent; they
only screen off our (or our parent's) reads from our grandparent.
> It seems you're trying to sort-of do per-fake-basic block liveness
> analysis, but our state_list_marks are not correct if we go with
> canonical basic block definition, since we mark the jump insn and
> not insn after the branch and not every basic block boundary is
> properly detected.
I think the reason this works is that jump insns can't do writes.
Whenever we pop a branch and restore its register state, we _also_ restore
its parentage state. Then if we decide to prune, we're saying that
whatever it takes to get from sl->state to an exit, we must also do.
A read mark on sl->state says that in the process of getting to an exit, the
register was written before it was read. A write mark on sl->state says
that the straight-line code resulting in sl->state wrote to the register,
so reads shouldn't propagate to its ->parent. But by the time we're using
sl->state in is_state_visited(), its continuations have all been probed, so
it can never gain more reads, so its write marks are irrelevant; they
mustn't stop the state's reads propagating to new 'pruning parents' because
those didn't arrive at the state through the straight-line code.
So maybe the first iteration through propagate_liveness() really _is_
special, and that if it were done differently (ignoring write marks) then
it really wouldn't matter at all where our state_list_marks were in
relation to the basic blocks. But I _think_ that, because we mark jump
insns as well as their destinations (and a popped branch is always a
destination, rather than the 'insn after the branch'), the sl->state will
never have any write marks and it'll all just work.
But I should really test that!
> So if algorithm should only work for basic blocks (for sequences of
> instructions without control flow changes) then it's broken.
> If it should work with control flow insns then it should also work
> for the whole chain of insns from the first one till bpf_exit...
> So I tried removing two above clearing loops and results are much
> better:
> before after
> bpf_lb-DLB_L3.o 2604 1120
> bpf_lb-DLB_L4.o 11159 1371
> bpf_lb-DUNKNOWN.o 1116 485
> bpf_lxc-DDROP_ALL.o 34566 12758
> bpf_lxc-DUNKNOWN.o 53267 18337
> bpf_netdev.o 17843 10564
> bpf_overlay.o 8672 5513
>
> but it feels too good to be true and probably not correct.
> So either way we need to fix something it seems.
Without that clearing, write marks will never be cleared, meaning that once
a register has been written to it will _never again_ forward a read. Since
every register (except ctx and fp) must be written before it can be read,
no register (except ctx) will ever be marked as read, and all the branches
will be pruned away.
Reads are a property of a state - roughly, "what do we read in getting from
this state to a BPF_EXIT?" - whereas writes are a property of a block,
"what do we write in getting from the parent to here?"
Thus, reads may (indeed must) propagate (upwards), but writes must _never_
spread beyond the 'end-state' of their block.
I hope this answers your questions, because I'm not entirely sure _I_
understand it either. Or rather, when I think about it for an hour, it
makes sense for about fifteen seconds, and then it's gone again and all I
can remember is "write down, read up".
-Ed
On 8/18/17 7:16 AM, Edward Cree wrote:
> On 18/08/17 04:21, Alexei Starovoitov wrote:
>> On 8/15/17 12:34 PM, Edward Cree wrote:
>>> State of a register doesn't matter if it wasn't read in reaching an exit;
>>> a write screens off all reads downstream of it from all explored_states
>>> upstream of it.
>>> This allows us to prune many more branches; here are some processed insn
>>> counts for some Cilium programs:
>>> Program before after
>>> bpf_lb_opt_-DLB_L3.o 6515 3361
>>> bpf_lb_opt_-DLB_L4.o 8976 5176
>>> bpf_lb_opt_-DUNKNOWN.o 2960 1137
>>> bpf_lxc_opt_-DDROP_ALL.o 95412 48537
>>> bpf_lxc_opt_-DUNKNOWN.o 141706 78718
>>> bpf_netdev.o 24251 17995
>>> bpf_overlay.o 10999 9385
>>>
>>> The runtime is also improved; here are 'time' results in ms:
>>> Program before after
>>> bpf_lb_opt_-DLB_L3.o 24 6
>>> bpf_lb_opt_-DLB_L4.o 26 11
>>> bpf_lb_opt_-DUNKNOWN.o 11 2
>>> bpf_lxc_opt_-DDROP_ALL.o 1288 139
>>> bpf_lxc_opt_-DUNKNOWN.o 1768 234
>>> bpf_netdev.o 62 31
>>> bpf_overlay.o 15 13
>>>
>>> Signed-off-by: Edward Cree <[email protected]>
>>
>> this is one ingenious hack. Love it!
>> I took me whole day to understand most of it, but I still have
>> few questions:
>>
>>> +
>>> +static void propagate_liveness(const struct bpf_verifier_state *state,
>>> + struct bpf_verifier_state *parent)
>>
>> here the name 'parent' is very confusing, since for the first
>> iteration of the loop below it transfers lives from 'neighbor'
>> state to the current state and only then traverses the link
>> of parents in the current.
>> Would be good to document it, since I was struggling the most
>> with this name until I realized that the way you build parent link list
>> in is_state_visited() is actual sequence of roughly basic blocks and
>> the name 'parent' applies there, but not for the first iteration
>> of this function.
> In the way I think about it, it really is a parent, by which I mean "a
> block whose output registers are our inputs". When we call
> propagate_liveness(), we're saying "here's another block whose outputs
> could be our inputs". So while the 'state->parent' datastructure is a
> linked list, the 'parentage' relationship is really a DAG.
> While 'state->parent' always has to point at an explored_state (which are
> roughly immutable), the 'parent' we pass into propagate_liveness is just
> env->cur_state which is mutable. The weird "it's not actually
> state->parent" comes from (a) there's only room for one state->parent and
> (b) we didn't create a new_sl for it because we're pruning.
> I agree this is not at all explained in the code or comments, except for
> the glib "Registers read by the continuation are read by us". I will try
> to write some comments and/or documentation explaining how and why the
> liveness tracking works, because it _is_ subtle and a week from now _I_
> probably won't understand it either.
a bit twisted, but yeah agree with this angle of thinking about it :)
>
>>> @@ -3407,6 +3501,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
>>> memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
>>> new_sl->next = env->explored_states[insn_idx];
>>> env->explored_states[insn_idx] = new_sl;
>>> + /* connect new state to parentage chain */
>>> + env->cur_state.parent = &new_sl->state;
>>> + /* clear liveness marks in current state */
>>> + for (i = 0; i < BPF_REG_FP; i++)
>>> + env->cur_state.regs[i].live = REG_LIVE_NONE;
>>> + for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
>>> + if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
>>> + env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
>>
>> and this part I don't get at all.
> The idea behind the liveness marks is that 'writes go down and reads go up
> and up'. That is, each state 'tells' its parent state 'I read from you'
> (which can then end up recursing), but it remembers for itself that it
> wrote to a register and therefore should swallow subsequent reads rather
> than forwarding them to its parent.
> While a block is being walked, its liveness marks _only_ record writes, and
> then only writes _made in that block_. The read marks go to the parent,
> which is "some block that has been walked", but whose continuations haven't
> all been walked yet so (by looplessness) won't show up as a pruning
> opportunity. An explored_state's liveness marks record the writes _done in
> reaching that state_ from its parent, but the reads _done by the state's
> children_. A cur_state's liveness marks do the same, but it doesn't have
> any children yet so it never gets read-marks (until it gets turned into an
> explored_state, of course).
> We clear our liveness marks because the writes our parent block did are not
> writes we did, so they don't screen off our reads from our parent;
got it. makes sense. would be good to document it.
>> It seems you're trying to sort-of do per-fake-basic block liveness
>> analysis, but our state_list_marks are not correct if we go with
>> canonical basic block definition, since we mark the jump insn and
>> not insn after the branch and not every basic block boundary is
>> properly detected.
> I think the reason this works is that jump insns can't do writes.
> Whenever we pop a branch and restore its register state, we _also_ restore
> its parentage state. Then if we decide to prune, we're saying that
> whatever it takes to get from sl->state to an exit, we must also do.
> A read mark on sl->state says that in the process of getting to an exit, the
> register was written before it was read. A write mark on sl->state says
> that the straight-line code resulting in sl->state wrote to the register,
> so reads shouldn't propagate to its ->parent. But by the time we're using
> sl->state in is_state_visited(), its continuations have all been probed, so
> it can never gain more reads, so its write marks are irrelevant; they
> mustn't stop the state's reads propagating to new 'pruning parents' because
> those didn't arrive at the state through the straight-line code.
> So maybe the first iteration through propagate_liveness() really _is_
> special, and that if it were done differently (ignoring write marks) then
> it really wouldn't matter at all where our state_list_marks were in
> relation to the basic blocks. But I _think_ that, because we mark jump
> insns as well as their destinations (and a popped branch is always a
> destination, rather than the 'insn after the branch'), the sl->state will
> never have any write marks and it'll all just work.
> But I should really test that!
also makes sense.
>> So if algorithm should only work for basic blocks (for sequences of
>> instructions without control flow changes) then it's broken.
>> If it should work with control flow insns then it should also work
>> for the whole chain of insns from the first one till bpf_exit...
>> So I tried removing two above clearing loops and results are much
>> better:
>> before after
>> bpf_lb-DLB_L3.o 2604 1120
>> bpf_lb-DLB_L4.o 11159 1371
>> bpf_lb-DUNKNOWN.o 1116 485
>> bpf_lxc-DDROP_ALL.o 34566 12758
>> bpf_lxc-DUNKNOWN.o 53267 18337
>> bpf_netdev.o 17843 10564
>> bpf_overlay.o 8672 5513
>>
>> but it feels too good to be true and probably not correct.
>> So either way we need to fix something it seems.
> Without that clearing, write marks will never be cleared, meaning that once
> a register has been written to it will _never again_ forward a read. Since
> every register (except ctx and fp) must be written before it can be read,
> no register (except ctx) will ever be marked as read, and all the branches
> will be pruned away.
>
> Reads are a property of a state - roughly, "what do we read in getting from
> this state to a BPF_EXIT?" - whereas writes are a property of a block,
> "what do we write in getting from the parent to here?"
> Thus, reads may (indeed must) propagate (upwards), but writes must _never_
> spread beyond the 'end-state' of their block.
>
> I hope this answers your questions, because I'm not entirely sure _I_
> understand it either. Or rather, when I think about it for an hour, it
> makes sense for about fifteen seconds, and then it's gone again and all I
> can remember is "write down, read up".
yes. thanks a lot for the explanation.
I've been playing with the algo for the whole day and yes indeed it
magically doesn't depend on proper basic block boundaries.
So having the cfg jump into or out of a block of instructions actually
doesn't break the algo. My understanding is that this is correct, since
we only create such blocks at state_list_marks and we also check
for state equality at the same state_list_marks, so if control flow
jumps into the middle of the block, there is no equal state to find
and it will continue as normal.
I've tried moving state_list_mark all over the place. Made them
control flow accurate and it improved cilium prog load stats,
but not a lot, so I will hold on such patch for now.
Then I tried to add state_list_marks for every odd insn and
unfortunately it uncovered a bug in this liveness logic.
The following test can reproduce it with existing code (no extra hacks)
{
"grr",
.insns = {
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES),
BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1),
BPF_MOV32_IMM(BPF_REG_1, 0),
BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 2),
BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
BPF_JMP_IMM(BPF_JA, 0, 0, 0),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0,
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
.fixup_map2 = { 3 },
.errstr_unpriv = "R0 leaks addr",
.errstr = "R0 unbounded memory access",
.result_unpriv = REJECT,
.result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
},
the trick here is the extra BPF_JMP_IMM(BPF_JA, 0, 0, 0)
which is a nop, but it forces our state_list_mark logic to
mark subsequent ST_MEM(DW) insn and liveness logic finds old
state at this insn and declares it equivalent.
The output looks like:
0: (7a) *(u64 *)(r10 -8) = 0
1: (bf) r2 = r10
2: (07) r2 += -8
3: (18) r1 = 0xffff880139d45200
5: (85) call bpf_map_lookup_elem#1
6: (15) if r0 == 0x0 goto pc+8
R0=map_value(id=0,off=0,ks=8,vs=48,imm=0) R10=fp0
7: (79) r1 = *(u64 *)(r0 +0)
R0=map_value(id=0,off=0,ks=8,vs=48,imm=0) R10=fp0
8: (b4) (u32) r2 = (u32) 11
9: (6d) if r2 s> r1 goto pc+1
R0=map_value(id=0,off=0,ks=8,vs=48,imm=0)
R1=inv(id=0,umin_value=11,umax_value=9223372036854775807,var_off=(0x0;
0x7fffffffffffffff)) R2=inv11 R10=fp0
10: (b4) (u32) r1 = (u32) 0
11: (64) (u32) r1 <<= (u32) 2
12: (0f) r0 += r1
13: (05) goto pc+0
14: (7a) *(u64 *)(r0 +0) = 4
R0=map_value(id=0,off=0,ks=8,vs=48,imm=0) R1=inv0 R2=inv11 R10=fp0
15: (95) exit
from 9 to 11: R0=map_value(id=0,off=0,ks=8,vs=48,imm=0)
R1=inv(id=0,smax_value=10) R2=inv11 R10=fp0
11: (64) (u32) r1 <<= (u32) 2
12: (0f) r0 += r1
13: (05) goto pc+0
14: safe
from 6 to 15: R0=inv0 R10=fp0
15: (95) exit
that '14: safe' above is not correct.
Disabling liveness as:
@@ -3282,7 +3288,7 @@ static bool regsafe(struct bpf_reg_state *rold,
struct bpf_reg_state *rcur,
bool varlen_map_access, struct idpair *idmap)
{
- if (!(rold->live & REG_LIVE_READ))
+ if (0 && !(rold->live & REG_LIVE_READ))
makes the test work properly and proper verifier output is:
from 9 to 11: R0=map_value(id=0,off=0,ks=8,vs=48,imm=0)
R1=inv(id=0,smax_value=10) R2=inv11 R10=fp0
11: (64) (u32) r1 <<= (u32) 2
12: (0f) r0 += r1
13: (05) goto pc+0
14: (7a) *(u64 *)(r0 +0) = 4
R0=map_value(id=0,off=0,ks=8,vs=48,umax_value=17179869180,var_off=(0x0;
0x3fffffffc)) R1=inv(id=0,umax_value=17179869180,var_off=(0x0;
0x3fffffffc)) R2=inv11 R10=fp0
R0 unbounded memory access, make sure to bounds check any array access
into a map
I don't yet understand the underlying reason. R0 should have been
marked as LIVE_READ by ST_MEM...
Please help debug it further.
On 19/08/17 00:37, Alexei Starovoitov wrote:
> that '14: safe' above is not correct.
>
> Disabling liveness as:
> @@ -3282,7 +3288,7 @@ static bool regsafe(struct bpf_reg_state *rold,
> struct bpf_reg_state *rcur,
> bool varlen_map_access, struct idpair *idmap)
> {
> - if (!(rold->live & REG_LIVE_READ))
> + if (0 && !(rold->live & REG_LIVE_READ))
>
> makes the test work properly and proper verifier output is:
> from 9 to 11: R0=map_value(id=0,off=0,ks=8,vs=48,imm=0) R1=inv(id=0,smax_value=10) R2=inv11 R10=fp0
> 11: (64) (u32) r1 <<= (u32) 2
> 12: (0f) r0 += r1
> 13: (05) goto pc+0
> 14: (7a) *(u64 *)(r0 +0) = 4
>
> R0=map_value(id=0,off=0,ks=8,vs=48,umax_value=17179869180,var_off=(0x0; 0x3fffffffc)) R1=inv(id=0,umax_value=17179869180,var_off=(0x0; 0x3fffffffc)) R2=inv11 R10=fp0
> R0 unbounded memory access, make sure to bounds check any array access into a map
>
> I don't yet understand the underlying reason. R0 should have been
> marked as LIVE_READ by ST_MEM...
> Please help debug it further.
>
Having added a bunch of debugging, I found out that indeed R0 _had_ been
marked as LIVE_READ. The problem was that env->varlen_map_value_access
wasn't set, because the access was at a constant offset (imm=0), but then
when we compare register states we just say "oh yeah, it's a map_value,
we don't need to look at the var_off".
This probably results from my unifying PTR_TO_MAP_VALUE with
PTR_TO_MAP_VALUE_ADJ; before that the old and new R0 would have different
reg->type so wouldn't match.
I'm tempted to just rip out env->varlen_map_value_access and always check
the whole thing, because honestly I don't know what it was meant to do
originally or how it can ever do any useful pruning. While drastic, it
does cause your test case to pass.
I'm not quite sure why your test passed when you disabled liveness, though;
that I can't explain.
-Ed
On 18/08/17 15:16, Edward Cree wrote:
> On 18/08/17 04:21, Alexei Starovoitov wrote:
>> It seems you're trying to sort-of do per-fake-basic block liveness
>> analysis, but our state_list_marks are not correct if we go with
>> canonical basic block definition, since we mark the jump insn and
>> not insn after the branch and not every basic block boundary is
>> properly detected.
> I think the reason this works is that jump insns can't do writes.
> [snip]
> the sl->state will never have any write marks and it'll all just work.
> But I should really test that!
I tested this, and found that, no, sl->state can have write marks, and the
algorithm will get the wrong answer in that case. So I've got a patch to
make the first iteration ignore write marks, as part of a series which I
will post shortly. When I do so, please re-do your tests with adding
state_list_marks in strange and exciting places; it should work wherever
you put them. Like you say, it "magically doesn't depend on proper basic
block boundaries", and that's because really pruning is just a kind of
checkpointing that just happens to be most effective when done just after
a jump (pop_stack).
Can I have a SOB for your "grr" test program, so I can include it in the
series?
-Ed
On 08/21/2017 08:36 PM, Edward Cree wrote:
> On 19/08/17 00:37, Alexei Starovoitov wrote:
[...]
> I'm tempted to just rip out env->varlen_map_value_access and always check
> the whole thing, because honestly I don't know what it was meant to do
> originally or how it can ever do any useful pruning. While drastic, it
> does cause your test case to pass.
Original intention from 484611357c19 ("bpf: allow access into map
value arrays") was that it wouldn't potentially make pruning worse
if PTR_TO_MAP_VALUE_ADJ was not used, meaning that we wouldn't need
to take reg state's min_value and max_value into account for state
checking; this was basically due to min_value / max_value being
adjusted/tracked on every alu/jmp op for involved regs (e.g.
adjust_reg_min_max_vals() and others that mangle them) even if we
have the case that no actual dynamic map access is used throughout
the program. To give an example on net tree, the bpf_lxc.o prog's
section increases from 36,386 to 68,226 when env->varlen_map_value_access
is always true, so it does have an effect. Did you do some checks
on this on net-next?
On 21/08/17 21:27, Daniel Borkmann wrote:
> On 08/21/2017 08:36 PM, Edward Cree wrote:
>> On 19/08/17 00:37, Alexei Starovoitov wrote:
> [...]
>> I'm tempted to just rip out env->varlen_map_value_access and always check
>> the whole thing, because honestly I don't know what it was meant to do
>> originally or how it can ever do any useful pruning. While drastic, it
>> does cause your test case to pass.
>
> Original intention from 484611357c19 ("bpf: allow access into map
> value arrays") was that it wouldn't potentially make pruning worse
> if PTR_TO_MAP_VALUE_ADJ was not used, meaning that we wouldn't need
> to take reg state's min_value and max_value into account for state
> checking; this was basically due to min_value / max_value is being
> adjusted/tracked on every alu/jmp ops for involved regs (e.g.
> adjust_reg_min_max_vals() and others that mangle them) even if we
> have the case that no actual dynamic map access is used throughout
> the program. To give an example on net tree, the bpf_lxc.o prog's
> section increases from 36,386 to 68,226 when env->varlen_map_value_access
> is always true, so it does have an effect. Did you do some checks
> on this on net-next?
I tested with the cilium progs and saw no change in insn count. I
suspect that for the normal case I already killed this optimisation
when I did my unification patch, it was previously about ignoring
min/max values on all regs (including scalars), whereas on net-next
it only ignores them on map_value pointers; in practice this is
useless because we tend to still have the offset scalar sitting in
a register somewhere. (Come to think of it, this may have been
behind a large chunk of the #insn increase that my patches caused.)
Since we use umax_value in find_good_pkt_pointers() now (to check
against MAX_PACKET_OFF and ensure our reg->range is really ok), we
can't just stop caring about all min/max values just because we
haven't done any variable map accesses.
I don't see a way around this.
-Ed
On 08/21/2017 10:44 PM, Edward Cree wrote:
> On 21/08/17 21:27, Daniel Borkmann wrote:
>> On 08/21/2017 08:36 PM, Edward Cree wrote:
>>> On 19/08/17 00:37, Alexei Starovoitov wrote:
>> [...]
>>> I'm tempted to just rip out env->varlen_map_value_access and always check
>>> the whole thing, because honestly I don't know what it was meant to do
>>> originally or how it can ever do any useful pruning. While drastic, it
>>> does cause your test case to pass.
>>
>> Original intention from 484611357c19 ("bpf: allow access into map
>> value arrays") was that it wouldn't potentially make pruning worse
>> if PTR_TO_MAP_VALUE_ADJ was not used, meaning that we wouldn't need
>> to take reg state's min_value and max_value into account for state
>> checking; this was basically due to min_value / max_value is being
>> adjusted/tracked on every alu/jmp ops for involved regs (e.g.
>> adjust_reg_min_max_vals() and others that mangle them) even if we
>> have the case that no actual dynamic map access is used throughout
>> the program. To give an example on net tree, the bpf_lxc.o prog's
>> section increases from 36,386 to 68,226 when env->varlen_map_value_access
>> is always true, so it does have an effect. Did you do some checks
>> on this on net-next?
> I tested with the cilium progs and saw no change in insn count. I
> suspect that for the normal case I already killed this optimisation
> when I did my unification patch, it was previously about ignoring
> min/max values on all regs (including scalars), whereas on net-next
> it only ignores them on map_value pointers; in practice this is
> useless because we tend to still have the offset scalar sitting in
> a register somewhere. (Come to think of it, this may have been
> behind a large chunk of the #insn increase that my patches caused.)
Yeah, this would seem plausible.
> Since we use umax_value in find_good_pkt_pointers() now (to check
> against MAX_PACKET_OFF and ensure our reg->range is really ok), we
> can't just stop caring about all min/max values just because we
> haven't done any variable map accesses.
> I don't see a way around this.
Agree, was thinking the same. If there's not really a regression in
terms of complexity, then lets kill the flag.
On 8/21/17 1:24 PM, Edward Cree wrote:
> On 18/08/17 15:16, Edward Cree wrote:
>> On 18/08/17 04:21, Alexei Starovoitov wrote:
>>> It seems you're trying to sort-of do per-fake-basic block liveness
>>> analysis, but our state_list_marks are not correct if we go with
>>> canonical basic block definition, since we mark the jump insn and
>>> not insn after the branch and not every basic block boundary is
>>> properly detected.
>> I think the reason this works is that jump insns can't do writes.
>> [snip]
>> the sl->state will never have any write marks and it'll all just work.
>> But I should really test that!
> I tested this, and found that, no, sl->state can have write marks, and the
> algorithm will get the wrong answer in that case. So I've got a patch to
> make the first iteration ignore write marks, as part of a series which I
> will post shortly. When I do so, please re-do your tests with adding
> state_list_marks in strange and exciting places; it should work wherever
> you put them. Like you say, it "magically doesn't depend on proper basic
> block boundaries", and that's because really pruning is just a kind of
> checkpointing that just happens to be most effective when done just after
> a jump (pop_stack).
>
> Can I have a SOB for your "grr" test program, so I can include it in the
> series?
yes. of course. just give the test some reasonable name :)
On 8/21/17 2:00 PM, Daniel Borkmann wrote:
> On 08/21/2017 10:44 PM, Edward Cree wrote:
>> On 21/08/17 21:27, Daniel Borkmann wrote:
>>> On 08/21/2017 08:36 PM, Edward Cree wrote:
>>>> On 19/08/17 00:37, Alexei Starovoitov wrote:
>>> [...]
>>>> I'm tempted to just rip out env->varlen_map_value_access and always
>>>> check
>>>> the whole thing, because honestly I don't know what it was meant
>>>> to do
>>>> originally or how it can ever do any useful pruning. While
>>>> drastic, it
>>>> does cause your test case to pass.
>>>
>>> Original intention from 484611357c19 ("bpf: allow access into map
>>> value arrays") was that it wouldn't potentially make pruning worse
>>> if PTR_TO_MAP_VALUE_ADJ was not used, meaning that we wouldn't need
>>> to take reg state's min_value and max_value into account for state
>>> checking; this was basically due to min_value / max_value is being
>>> adjusted/tracked on every alu/jmp ops for involved regs (e.g.
>>> adjust_reg_min_max_vals() and others that mangle them) even if we
>>> have the case that no actual dynamic map access is used throughout
>>> the program. To give an example on net tree, the bpf_lxc.o prog's
>>> section increases from 36,386 to 68,226 when
>>> env->varlen_map_value_access
>>> is always true, so it does have an effect. Did you do some checks
>>> on this on net-next?
>> I tested with the cilium progs and saw no change in insn count. I
>> suspect that for the normal case I already killed this optimisation
>> when I did my unification patch, it was previously about ignoring
>> min/max values on all regs (including scalars), whereas on net-next
>> it only ignores them on map_value pointers; in practice this is
>> useless because we tend to still have the offset scalar sitting in
>> a register somewhere. (Come to think of it, this may have been
>> behind a large chunk of the #insn increase that my patches caused.)
>
> Yeah, this would seem plausible.
>
>> Since we use umax_value in find_good_pkt_pointers() now (to check
>> against MAX_PACKET_OFF and ensure our reg->range is really ok), we
>> can't just stop caring about all min/max values just because we
>> haven't done any variable map accesses.
>> I don't see a way around this.
>
> Agree, was thinking the same. If there's not really a regression in
> terms of complexity, then lets kill the flag.
+1
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2489e67b65f6..908d13b2a2aa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3582,7 +3582,7 @@ static int do_check(struct bpf_verifier_env *env)
init_reg_state(regs);
state->parent = NULL;
insn_idx = 0;
- env->varlen_map_value_access = false;
+ env->varlen_map_value_access = true;
makes _zero_ difference on cilium*.o tests, so let's just kill
that workaround.