This patch set is based on https://lkml.org/lkml/2015/4/30/264
By using bpf 'config' section like this:
char _config2[] SEC("config") = "generic_perform_write=generic_perform_write+122 file->f_mapping->a_ops bytes offset";
SEC("generic_perform_write")
int NODE_generic_perform_write (struct pt_regs *ctx, void *a_ops, void *bytes, void* offset) {
char fmt[] = "NODE_generic_perform_write, a_ops=%p, bytes=%p, offset=%p\n";
bpf_trace_printk(fmt, sizeof(fmt), a_ops, bytes, offset);
return 1;
}
In this example, 'bytes' and 'offset' are local variables, a_ops is in
the structure field of file parameter, and we probe in the body of the
generic_perform_write() function.
Perf can fetch and convert all the arguments and then we translate them
into bpf bytecode as a prologue before calling bpf body functions. In
the prologue, we fetch arguments from bpf context register and place
them according to bpf calling conventions so the body function can
access them as formal parameters.
The perf command is as following:
$ perf bpf -v bpf_bytecode.o
...
bpf_prologue: insn num=26
(bf) r6 = r1
(79) r3 = *(u64 *)(r6 +112)
(07) r3 += 248
(b7) r1 = 0
(7b) *(u64 *)(r10 -8) = r1
(bf) r1 = r10
(07) r1 += -8
(b7) r2 = 8
(85) call 4
(79) r3 = *(u64 *)(r10 -8)
(07) r3 += 104
(b7) r1 = 0
(7b) *(u64 *)(r10 -8) = r1
(bf) r1 = r10
(07) r1 += -8
(b7) r2 = 8
(85) call 4
(79) r3 = *(u64 *)(r10 -8)
(bf) r7 = r3
(79) r3 = *(u64 *)(r6 +24)
(bf) r8 = r3
(79) r3 = *(u64 *)(r6 +88)
(bf) r9 = r3
(bf) r2 = r7
(bf) r3 = r8
(bf) r4 = r9
...
Added new event:
Writing event: p:perf_bpf_probe/generic_perform_write _stext+1257282 a_ops=+104(+248(%di)):u64 bytes=%r12:u64 offset=%cx:u64
perf_bpf_probe:generic_perform_write (on generic_perform_write+122 with a_ops=file->f_mapping->a_ops bytes offset)
The trace output:
sh-1260 [000] d... 112592.463169: : NODE_generic_perform_write, a_ops=ffffffff81a20160, bytes=0000000000000017, offset=000000000000031d
sh-1260 [000] d... 112593.155105: : NODE_generic_perform_write, a_ops=ffffffff81a20160, bytes=000000000000000a, offset=0000000000000334
sh-1260 [000] d... 112599.015993: : NODE_generic_perform_write, a_ops=ffffffff81a20160, bytes=0000000000000017, offset=000000000000033e
sh-1260 [000] d... 112600.790977: : NODE_generic_perform_write, a_ops=ffffffff81a20160, bytes=000000000000000a, offset=0000000000000355
He Kuang (6):
perf bpf: Add headers for generate bpf bytecode
perf bpf: Add pt_regs convert table for x86
perf bpf: Save pt_regs info from debuginfo
perf bpf: Convert arglist to bpf prologue
perf bpf: Process debuginfo for generating bpf prologue
perf bpf: Generate bpf prologue for arguments
tools/perf/arch/x86/util/dwarf-regs.c | 31 ++++++
tools/perf/util/bpf-loader.c | 66 ++++++++++++
tools/perf/util/bpf-loader.h | 188 ++++++++++++++++++++++++++++++++++
tools/perf/util/include/dwarf-regs.h | 2 +
tools/perf/util/probe-event.c | 121 ++++++++++++++++++++++
tools/perf/util/probe-event.h | 12 +++
tools/perf/util/probe-finder.c | 101 ++++++++++++++++++
tools/perf/util/probe-finder.h | 4 +
8 files changed, 525 insertions(+)
--
1.8.5.2
Including bpf instruction macros and register alias.
Signed-off-by: He Kuang <[email protected]>
---
tools/perf/util/bpf-loader.h | 188 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 188 insertions(+)
diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h
index baa4404..840f96d 100644
--- a/tools/perf/util/bpf-loader.h
+++ b/tools/perf/util/bpf-loader.h
@@ -70,4 +70,192 @@ struct bpf_obj {
} elf;
};
#define obj_elf_valid(o) ((o)->elf.fd >= 0)
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
+#define BPF_LD_IMM64(DST, IMM) \
+ BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
+ (((struct bpf_insn) { \
+ .code = BPF_LD | BPF_DW | BPF_IMM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = (__u32) (IMM) }), \
+ ((struct bpf_insn) { \
+ .code = 0, /* zero is reserved opcode */ \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = ((__u64) (IMM)) >> 32 }))
+
+#ifndef BPF_PSEUDO_MAP_FD
+# define BPF_PSEUDO_MAP_FD 1
+#endif
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Conditional jumps against registers,
+ if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Conditional jumps against immediates,
+ if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = CODE, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN() \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_EXIT, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = 0 })
+
+/* Function call */
+
+#define BPF_EMIT_CALL(FUNC) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_CALL, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = FUNC })
+
+/* ArgX, context and stack frame pointer register positions. Note,
+ * Arg1, Arg2, Arg3, etc are used as argument mappings of function
+ * calls in BPF_CALL instruction.
+ */
+#define BPF_REG_ARG1 BPF_REG_1
+#define BPF_REG_ARG2 BPF_REG_2
+#define BPF_REG_ARG3 BPF_REG_3
+#define BPF_REG_ARG4 BPF_REG_4
+#define BPF_REG_ARG5 BPF_REG_5
+#define BPF_REG_CTX BPF_REG_6
+#define BPF_REG_FP BPF_REG_10
+
#endif
--
1.8.5.2
Convert register number in debuginfo to its index in pt_regs.
Signed-off-by: He Kuang <[email protected]>
---
tools/perf/arch/x86/util/dwarf-regs.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c
index be22dd4..e586a47 100644
--- a/tools/perf/arch/x86/util/dwarf-regs.c
+++ b/tools/perf/arch/x86/util/dwarf-regs.c
@@ -59,10 +59,31 @@ const char *x86_64_regs_table[X86_64_MAX_REGS] = {
"%r15",
};
+const char x86_64_pt_regs_table[X86_64_MAX_REGS] = {
+ 10,
+ 12,
+ 11,
+ 5,
+ 13,
+ 14,
+ 4,
+ 19,
+ 9,
+ 8,
+ 7,
+ 6,
+ 3,
+ 2,
+ 1,
+ 0,
+};
+
/* TODO: switching by dwarf address size */
#ifdef __x86_64__
#define ARCH_MAX_REGS X86_64_MAX_REGS
#define arch_regs_table x86_64_regs_table
+#define arch_pt_regs_table x86_64_pt_regs_table
+#define arch_pt_regs_type unsigned long
#else
#define ARCH_MAX_REGS X86_32_MAX_REGS
#define arch_regs_table x86_32_regs_table
@@ -73,3 +94,13 @@ const char *get_arch_regstr(unsigned int n)
{
return (n <= ARCH_MAX_REGS) ? arch_regs_table[n] : NULL;
}
+
+char get_arch_pt_regs(unsigned int n)
+{
+ return (n <= ARCH_MAX_REGS) ? arch_pt_regs_table[n] : -1;
+}
+
+int get_arch_pt_regs_size(void)
+{
+ return sizeof(arch_pt_regs_type);
+}
--
1.8.5.2
Save pt_regs number and size information in function
convert_variable_location() instead of the register string name, so we
can fetch the target register from bpf context register later.
Signed-off-by: He Kuang <[email protected]>
---
tools/perf/util/include/dwarf-regs.h | 2 ++
tools/perf/util/probe-event.h | 4 ++++
tools/perf/util/probe-finder.c | 11 +++++++++++
3 files changed, 17 insertions(+)
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index 8f14965..166bec2 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -3,6 +3,8 @@
#ifdef HAVE_DWARF_SUPPORT
const char *get_arch_regstr(unsigned int n);
+char get_arch_pt_regs(unsigned int n);
+int get_arch_pt_regs_size(void);
#endif
#endif
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index d6b7834..842d6c4 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -29,6 +29,10 @@ struct probe_trace_arg {
char *value; /* Base value */
char *type; /* Type name */
struct probe_trace_arg_ref *ref; /* Referencing offset */
+ unsigned int regn;
+ unsigned int reg_size;
+ char *insns;
+ int insns_cnt;
};
/* kprobe-tracer and uprobe-tracer tracing event (point + arg) */
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 2a76e14..249e6cb 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -159,6 +159,15 @@ static struct probe_trace_arg_ref *alloc_trace_arg_ref(long offs)
return ref;
}
+char __attribute__ ((weak)) get_arch_pt_regs(unsigned int n) {
+ n = n;
+ return -1;
+}
+
+int __attribute__ ((weak)) get_arch_pt_regs_size(void) {
+ return -1;
+}
+
/*
* Convert a location into trace_arg.
* If tvar == NULL, this just checks variable can be converted.
@@ -260,6 +269,8 @@ static_var:
return -ERANGE;
}
+ tvar->regn = get_arch_pt_regs(regn);
+ tvar->reg_size = get_arch_pt_regs_size();
tvar->value = strdup(regs);
if (tvar->value == NULL)
return -ENOMEM;
--
1.8.5.2
When all arguments in bpf config section are collected in register and
offset form, this patch will fetch them from bpf context register and
place them as bpf input parameters.
Bpf prologue is generated as the following steps:
1. alloc dst address in stack -> r1
2. set size -> r2
3. fetch base register and offset -> r3
4. call BPF_FUNC_probe_read
5. loop 1
6. save intermediate result and process next arg
7. restore intermediate result to arg2~5
Signed-off-by: He Kuang <[email protected]>
---
tools/perf/util/probe-event.c | 94 +++++++++++++++++++++++++++++++++++++++++++
tools/perf/util/probe-event.h | 6 +++
2 files changed, 100 insertions(+)
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index d05b77c..c307cd7 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -46,6 +46,7 @@
#include "probe-event.h"
#include "probe-finder.h"
#include "session.h"
+#include "bpf-loader.h"
#define MAX_CMDLEN 256
#define PERFPROBE_GROUP "probe"
@@ -1684,6 +1685,99 @@ static int synthesize_probe_trace_arg(struct probe_trace_arg *arg,
return buf - tmp;
}
+#define BPF_STRBUF_ADD(buf, statement) \
+ strbuf_add(buf, \
+ &statement, \
+ sizeof(struct bpf_insn))
+
+#define BPF_STRBUF_ADD_BUF(statement) BPF_STRBUF_ADD(buf, statement)
+
+int synthesize_probe_trace_arg_bpf_begin(struct strbuf *buf)
+{
+ /* save arg1 to ctx */
+ BPF_STRBUF_ADD_BUF(BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1));
+ return 0;
+}
+
+int synthesize_probe_trace_arg_bpf_end(struct probe_trace_arg *arg,
+ struct strbuf *buf, int num)
+{
+ int i;
+
+ for (i = 0; i < num; i++) {
+ /* restore r7~10 to arg2~5*/
+ BPF_STRBUF_ADD_BUF(
+ BPF_MOV64_REG(BPF_REG_ARG2 + i, BPF_REG_7 + i));
+ }
+
+ /* assign result to tvar */
+ arg->insns = buf->buf;
+ arg->insns_cnt = (int)(buf->len/sizeof(struct bpf_insn));
+
+ return 0;
+}
+
+int synthesize_probe_trace_arg_bpf(struct probe_trace_arg *arg,
+ struct strbuf *buf, int index)
+{
+ struct probe_trace_arg_ref *ref = arg->ref;
+ int depth = 0;
+ int size = arg->reg_size;
+
+ if (size < 0)
+ return -EINVAL;
+
+ /* Special case: @XXX */
+ if (arg->value[0] == '@' && arg->ref)
+ ref = ref->next;
+
+ /* Print argument value */
+ if (arg->value[0] == '@' && arg->ref)
+ /* TODO parse other arguments */
+ pr_debug("%d%+ld", arg->regn,
+ arg->ref->offset);
+ else {
+ /* load ctx to r3 */
+ BPF_STRBUF_ADD_BUF(
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_CTX,
+ arg->regn * size));
+ }
+
+ /* parse and traslate to bpf bytecode */
+ while (ref) {
+ /* r3 offset */
+ BPF_STRBUF_ADD_BUF(
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, ref->offset));
+
+ /* clear stack space */
+ BPF_STRBUF_ADD_BUF(BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 0));
+ BPF_STRBUF_ADD_BUF(
+ BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, -size));
+
+ /* set stack space to r1 */
+ BPF_STRBUF_ADD_BUF(BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP));
+ BPF_STRBUF_ADD_BUF(BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -size));
+
+ /* r2 for size */
+ BPF_STRBUF_ADD_BUF(BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, size));
+
+ /* call BPF_FUNC_probe_read */
+ BPF_STRBUF_ADD_BUF(BPF_EMIT_CALL(BPF_FUNC_probe_read));
+
+ /* result in stack, move to r3 */
+ BPF_STRBUF_ADD_BUF(
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_FP, -size));
+
+ depth++;
+ ref = ref->next;
+ };
+
+ /* store intermediate results to r7~10 */
+ BPF_STRBUF_ADD_BUF(BPF_MOV64_REG(BPF_REG_7 + index, BPF_REG_3));
+
+ return 0;
+}
+
char *synthesize_probe_trace_command(struct probe_trace_event *tev)
{
struct probe_trace_point *tp = &tev->point;
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 842d6c4..3b6c284 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -109,6 +109,12 @@ extern char *synthesize_perf_probe_command(struct perf_probe_event *pev);
extern char *synthesize_probe_trace_command(struct probe_trace_event *tev);
extern int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf,
size_t len);
+/* generate bpf prologue from debuginfo */
+extern int synthesize_probe_trace_arg_bpf(struct probe_trace_arg *arg,
+ struct strbuf *buf, int index);
+extern int synthesize_probe_trace_arg_bpf_begin(struct strbuf *buf);
+extern int synthesize_probe_trace_arg_bpf_end(struct probe_trace_arg *arg,
+ struct strbuf *buf, int num);
/* Check the perf_probe_event needs debuginfo */
extern bool perf_probe_event_need_dwarf(struct perf_probe_event *pev);
--
1.8.5.2
Process debuginfo for bpf prologue, the process function is copied and
modified from debuginfo__find_trace_events(), but use a different
callback function for generating bpf prologue bytecode.
Signed-off-by: He Kuang <[email protected]>
---
tools/perf/util/probe-event.c | 27 +++++++++++++
tools/perf/util/probe-event.h | 2 +
tools/perf/util/probe-finder.c | 90 ++++++++++++++++++++++++++++++++++++++++++
tools/perf/util/probe-finder.h | 4 ++
4 files changed, 123 insertions(+)
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index c307cd7..fbdda4d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -906,6 +906,33 @@ out:
return ret;
}
+int get_bpf_prologue(struct perf_probe_event *pev, char **result, int *count)
+{
+ int ret;
+ struct debuginfo *dinfo;
+ bool need_dwarf;
+
+ ret = init_symbol_maps(false);
+ if (ret < 0)
+ return ret;
+
+ need_dwarf = perf_probe_event_need_dwarf(pev);
+
+ dinfo = open_debuginfo(NULL, !need_dwarf);
+
+ if (!dinfo) {
+ if (need_dwarf)
+ return -ENOENT;
+ pr_debug("Could not open debuginfo. Try to use symbols.\n");
+ return 0;
+ }
+
+ pr_debug("Try to generate bpf prologue from debuginfo.\n");
+
+ ret = debuginfo__find_bpf_prologue(dinfo, pev, result, count);
+
+ return ret;
+}
#else /* !HAVE_DWARF_SUPPORT */
static int
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 3b6c284..5a6b86e 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -115,6 +115,8 @@ extern int synthesize_probe_trace_arg_bpf(struct probe_trace_arg *arg,
extern int synthesize_probe_trace_arg_bpf_begin(struct strbuf *buf);
extern int synthesize_probe_trace_arg_bpf_end(struct probe_trace_arg *arg,
struct strbuf *buf, int num);
+extern int get_bpf_prologue(struct perf_probe_event *pev,
+ char **result, int *count);
/* Check the perf_probe_event needs debuginfo */
extern bool perf_probe_event_need_dwarf(struct perf_probe_event *pev);
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 249e6cb..bfb7625 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1164,6 +1164,71 @@ static int expand_probe_args(Dwarf_Die *sc_die, struct probe_finder *pf,
return n;
}
+#define BPF_PROLOGUE_MAX_LEN (1024)
+static int generate_bpf_prologue(Dwarf_Die *sc_die, struct probe_finder *pf)
+{
+ struct perf_probe_arg *args;
+ int ret = 0, i;
+ Dwarf_Die vr_die;
+ struct strbuf buf;
+ int narg;
+
+ if (pf->tvar)
+ return -ERANGE;
+
+ pf->tvar = (struct probe_trace_arg *)
+ malloc(sizeof(struct probe_trace_arg));
+ if (pf->tvar == NULL)
+ return -ENOMEM;
+
+ /* Expand special probe argument if exist */
+ args = zalloc(sizeof(struct perf_probe_arg) * MAX_PROBE_ARGS);
+ if (args == NULL)
+ return -ENOMEM;
+
+ narg = expand_probe_args(sc_die, pf, args);
+ if (narg <= 0)
+ goto end;
+
+ strbuf_init(&buf, BPF_PROLOGUE_MAX_LEN);
+ synthesize_probe_trace_arg_bpf_begin(&buf);
+
+ /* Find each argument */
+ for (i = 0; i < narg; i++) {
+ pf->pvar = &args[i];
+
+ memset(pf->tvar, 0, sizeof(struct probe_trace_arg));
+ /* Search child die for local variables and parameters. */
+ if (!die_find_variable_at(sc_die, pf->pvar->var,
+ pf->addr, &vr_die)) {
+ /* Search again in global variables */
+ if (!die_find_variable_at(&pf->cu_die, pf->pvar->var,
+ 0, &vr_die)) {
+ pr_warning("Failed to find '%s' in this function.\n",
+ pf->pvar->var);
+ ret = -ENOENT;
+ }
+ }
+
+ if (ret >= 0)
+ ret = convert_variable(&vr_die, pf);
+ if (ret != 0)
+ break;
+
+ ret = synthesize_probe_trace_arg_bpf(pf->tvar, &buf, i);
+ if (ret)
+ goto end;
+ }
+
+ synthesize_probe_trace_arg_bpf_end(pf->tvar, &buf, narg);
+
+ pr_debug("bpf_prologue: insn num=%d\n", pf->tvar->insns_cnt);
+
+end:
+ free(args);
+ return ret;
+}
+
/* Add a found probe point into trace event list */
static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf)
{
@@ -1248,6 +1313,31 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
return (ret < 0) ? ret : tf.ntevs;
}
+int debuginfo__find_bpf_prologue(struct debuginfo *dbg,
+ struct perf_probe_event *pev,
+ char **result, int *count)
+{
+ struct probe_finder pf = {.pev = pev,
+ .callback = generate_bpf_prologue};
+ int ret;
+
+ pf.tvar = NULL;
+ ret = debuginfo__find_probes(dbg, &pf);
+ if (ret == -ERANGE)
+ ret = 0;
+
+ if (pf.tvar) {
+ *result = pf.tvar->insns;
+ *count = pf.tvar->insns_cnt;
+ free(pf.tvar);
+ } else {
+ *result = NULL;
+ *count = 0;
+ }
+
+ return ret;
+}
+
#define MAX_VAR_LEN 64
/* Collect available variables in this scope */
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index ebf8c8c..28437978 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -40,6 +40,10 @@ extern int debuginfo__find_trace_events(struct debuginfo *dbg,
struct probe_trace_event **tevs,
int max_tevs);
+extern int debuginfo__find_bpf_prologue(struct debuginfo *dbg,
+ struct perf_probe_event *pev,
+ char **result, int *count);
+
/* Find a perf_probe_point from debuginfo */
extern int debuginfo__find_probe_point(struct debuginfo *dbg,
unsigned long addr,
--
1.8.5.2
Generate bpf prologue before attaching bpf progs to perf probe event. If
prologue is generated, it will be pasted in front of the original bpf
prog.
Signed-off-by: He Kuang <[email protected]>
---
tools/perf/util/bpf-loader.c | 66 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 66 insertions(+)
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index c646ca4..8c746ef 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -900,6 +900,69 @@ bpf_perf_prog_load(struct bpf_obj *obj, struct bpf_perf_prog *prog)
return 0;
}
+#ifdef HAVE_DWARF_SUPPORT
+#if 0
+static void show_bpf_insn(char *buf, int len)
+{
+ int i, j;
+ char *ptr = buf;
+
+ for (i = 0; i < len; i++) {
+ pr_debug("[%03d] ", i);
+ for (j = 0; j < 8; j++)
+ pr_debug("0x%02x, ", (*ptr++) & 0xff);
+ pr_debug("\\\n");
+ }
+}
+#endif
+
+static int bpf_obj_prologue(struct bpf_obj *obj)
+{
+ struct bpf_perf_prog *prog;
+ int ret = -1;
+
+ list_for_each_entry(prog, &obj->progs_list, list) {
+ int count;
+ char *result;
+ int new_count;
+
+ if (!prog->pev)
+ continue;
+
+ ret = get_bpf_prologue(prog->pev, &result, &count);
+ if (ret)
+ return ret;
+
+ if (count > 0) {
+ bpf_dump_prog((struct bpf_insn *)result, count);
+
+ new_count = prog->insns_cnt + count;
+ result = realloc(result,
+ new_count * sizeof(struct bpf_insn));
+ if (!result)
+ return -ENOMEM;
+
+ memcpy(result + count * sizeof(struct bpf_insn),
+ prog->insns,
+ prog->insns_cnt * sizeof(struct bpf_insn));
+
+ free(prog->insns);
+ prog->insns = (struct bpf_insn *)result;
+ prog->insns_cnt = new_count;
+ } else {
+ pr_debug("bpf: no prologue generated\n");
+ }
+ }
+
+ return ret;
+}
+#else
+static int bpf_progs_prologue(struct bpf_obj *obj __maybe_unused)
+{
+ return 0;
+}
+#endif
+
static int
bpf_obj_load_progs(struct bpf_obj *obj)
{
@@ -946,6 +1009,9 @@ int bpf__load(const char *path)
goto out;
if ((err = bpf_obj_relocate(obj)))
goto out;
+ err = bpf_obj_prologue(obj);
+ if (err)
+ goto out;
if ((err = bpf_obj_load_progs(obj)))
goto out;
--
1.8.5.2
On 5/5/15 3:10 AM, He Kuang wrote:
> This patch set is based on https://lkml.org/lkml/2015/4/30/264
>
> By using bpf 'config' section like this:
>
> char _config2[] SEC("config") = "generic_perform_write=generic_perform_write+122 file->f_mapping->a_ops bytes offset";
> SEC("generic_perform_write")
> int NODE_generic_perform_write (struct pt_regs *ctx, void *a_ops, void *bytes, void* offset) {
> char fmt[] = "NODE_generic_perform_write, a_ops=%p, bytes=%p, offset=%p\n";
> bpf_trace_printk(fmt, sizeof(fmt), a_ops, bytes, offset);
> return 1;
> }
>
> In this example, 'bytes' and 'offset' are local variables, a_ops is in
> the structure field of file parameter, and we probe in the body of the
> generic_perform_write() function.
>
> Perf can fetch and convert all the arguments and then we translate them
> into bpf bytecode as a prologue before calling bpf body functions. In
> the prologue, we fetch arguments from bpf context register and place
> them according to bpf calling conventions so the body function can
> access them as formal parameters.
great idea! Like it a lot.
Looking at current implementation I think the limit is <=3 arguments,
which I think is perfectly fine for now. Just worth mentioning in
the doc.
Two high level comments:
- can you collapse SEC("config") with SEC("func_name") ?
It seems that "func_name" is only used as reference inside "config".
I understand that you're proposing one "config" section where multiple
descriptions are strcat together, but why? Something like:
SEC("kprobe/generic_perform_write+122(file->f_mapping->a_ops, bytes,
offset)")
int func(...) { ... }
should be enough and more concise.
- please support it without debug info when kprobe is placed at the top
of the function. Like:
SEC("kprobe/generic_perform_write(void*, void*, long long)")
even without debug info it will copy ctx->di into r2, ctx->si into r3
and ctx->dx into r4.
Not everyone has access to debug info when writing programs.
Your prologue generator removes headache of figuring out cpu calling
convention and can even work for 32-bit architectures.
Also I think it belongs in tools/lib/bpf/ together with section walking
and loading bits. As we discussed in the other thread it makes to create
a proper library out of samples/bpf/libbpf*, bpf_load* and use it
everywhere. Once it's there I'd like to rewrite samples/bpf/*_kern.c
using this simplified access to function arguments.
Other comments per patch...
On 5/5/15 3:10 AM, He Kuang wrote:
> Including bpf instruction macros and register alias.
>
> Signed-off-by: He Kuang <[email protected]>
> ---
> tools/perf/util/bpf-loader.h | 188 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 188 insertions(+)
>
> diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h
> index baa4404..840f96d 100644
> --- a/tools/perf/util/bpf-loader.h
> +++ b/tools/perf/util/bpf-loader.h
> @@ -70,4 +70,192 @@ struct bpf_obj {
> } elf;
> };
> #define obj_elf_valid(o) ((o)->elf.fd >= 0)
> +
> +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
> +
> +#define BPF_ALU64_REG(OP, DST, SRC) \
> + ((struct bpf_insn) { \
> + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
> + .dst_reg = DST, \
> + .src_reg = SRC, \
> + .off = 0, \
> + .imm = 0 })
this is massive copy-paste from samples/bpf/libbpf.h
let's create a proper library out of it and place into tools/lib/bpf/
On 5/5/15 3:10 AM, He Kuang wrote:
> Convert register number in debuginfo to its index in pt_regs.
>
> Signed-off-by: He Kuang <[email protected]>
> ---
> tools/perf/arch/x86/util/dwarf-regs.c | 31 +++++++++++++++++++++++++++++++
> 1 file changed, 31 insertions(+)
>
> diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c
> index be22dd4..e586a47 100644
> --- a/tools/perf/arch/x86/util/dwarf-regs.c
> +++ b/tools/perf/arch/x86/util/dwarf-regs.c
> @@ -59,10 +59,31 @@ const char *x86_64_regs_table[X86_64_MAX_REGS] = {
> "%r15",
> };
>
> +const char x86_64_pt_regs_table[X86_64_MAX_REGS] = {
> + 10,
> + 12,
> + 11,
> + 5,
> + 13,
> + 14,
> + 4,
> + 19,
> + 9,
> + 8,
> + 7,
> + 6,
> + 3,
> + 2,
> + 1,
> + 0,
> +};
magic numbers need a comment in .c (not only in commit log)
Also I would combine them into
struct { const char *regname, int index } and
merge with x86_64_regs_table array
and would use something like offsetof(struct pt_regs, di) / 8
instead of hard coding.
In general patch 2 and 3 are in wrong order.
Generic 3 should come first. Otherwise patch 2 introduces things
that no one is using.
On 5/5/15 3:10 AM, He Kuang wrote:
> When all arguments in bpf config section are collected in register and
> offset form, this patch will fetch them from bpf context register and
> place them as bpf input parameters.
>
> Bpf prologue is generated as the following steps:
> 1. alloc dst address in stack -> r1
> 2. set size -> r2
> 3. fetch base register and offset -> r3
> 4. call BPF_FUNC_probe_read
> 5. loop 1
> 6. save intermediate result and process next arg
> 7. restore intermediate result to arg2~5
>
> Signed-off-by: He Kuang <[email protected]>
...
>
> +#define BPF_STRBUF_ADD(buf, statement) \
> + strbuf_add(buf, \
> + &statement, \
> + sizeof(struct bpf_insn))
> +
> +#define BPF_STRBUF_ADD_BUF(statement) BPF_STRBUF_ADD(buf, statement)
> +
> +int synthesize_probe_trace_arg_bpf_begin(struct strbuf *buf)
> +{
> + /* save arg1 to ctx */
> + BPF_STRBUF_ADD_BUF(BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1));
> + return 0;
> +}
the macro approach looks a bit ugly.
Why not to do it similar to net/core/filter.c style:
*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
Looks more readable to me.
> + for (i = 0; i < num; i++) {
> + /* restore r7~10 to arg2~5*/
> + BPF_STRBUF_ADD_BUF(
> + BPF_MOV64_REG(BPF_REG_ARG2 + i, BPF_REG_7 + i));
> + }
comment and/or logic is wrong. r10 is read only stack. you cannot use
it as callee-saved.
> + /* store intermediate results to r7~10 */
> + BPF_STRBUF_ADD_BUF(BPF_MOV64_REG(BPF_REG_7 + index, BPF_REG_3));
should be r7-r9.
Also is there a check somewhere that accepts only 3 debuginfo-backed
args?
On 2015/05/05 19:10, He Kuang wrote:
> This patch set is based on https://lkml.org/lkml/2015/4/30/264
>
> By using bpf 'config' section like this:
>
> char _config2[] SEC("config") = "generic_perform_write=generic_perform_write+122 file->f_mapping->a_ops bytes offset";
> SEC("generic_perform_write")
> int NODE_generic_perform_write (struct pt_regs *ctx, void *a_ops, void *bytes, void* offset) {
> char fmt[] = "NODE_generic_perform_write, a_ops=%p, bytes=%p, offset=%p\n";
> bpf_trace_printk(fmt, sizeof(fmt), a_ops, bytes, offset);
> return 1;
> }
>
> In this example, 'bytes' and 'offset' are local variables, a_ops is in
> the structure field of file parameter, and we probe in the body of the
> generic_perform_write() function.
>
> Perf can fetch and convert all the arguments and then we translate them
> into bpf bytecode as a prologue before calling bpf body functions. In
> the prologue, we fetch arguments from bpf context register and place
> them according to bpf calling conventions so the body function can
> access them as formal parameters.
>
> The perf command is as following:
>
> $ perf bpf -v bpf_bytecode.o
> ...
> bpf_prologue: insn num=26
> (bf) r6 = r1
> (79) r3 = *(u64 *)(r6 +112)
> (07) r3 += 248
> (b7) r1 = 0
> (7b) *(u64 *)(r10 -8) = r1
> (bf) r1 = r10
> (07) r1 += -8
> (b7) r2 = 8
> (85) call 4
> (79) r3 = *(u64 *)(r10 -8)
> (07) r3 += 104
> (b7) r1 = 0
> (7b) *(u64 *)(r10 -8) = r1
> (bf) r1 = r10
> (07) r1 += -8
> (b7) r2 = 8
> (85) call 4
> (79) r3 = *(u64 *)(r10 -8)
> (bf) r7 = r3
> (79) r3 = *(u64 *)(r6 +24)
> (bf) r8 = r3
> (79) r3 = *(u64 *)(r6 +88)
> (bf) r9 = r3
> (bf) r2 = r7
> (bf) r3 = r8
> (bf) r4 = r9
Hmm, this idea looks good to me, I need to learn what the bpf needs for review.
BTW, if it is so easily to compose the byte code in perf, can we do it in the
kernel too? I'd like to check the possibility of replacing the old fetch_arg
functions with these byte code for optimizing performance and reduce redundant
functionality. :)
Thank you,
--
Masami HIRAMATSU
Linux Technology Research Center, System Productivity Research Dept.
Center for Technology Innovation - Systems Engineering
Hitachi, Ltd., Research & Development Group
E-mail: [email protected]
Em Wed, May 06, 2015 at 10:37:37AM +0900, Masami Hiramatsu escreveu:
> On 2015/05/05 19:10, He Kuang wrote:
> > (79) r3 = *(u64 *)(r6 +88)
> > (bf) r9 = r3
> > (bf) r2 = r7
> > (bf) r3 = r8
> > (bf) r4 = r9
> Hmm, this idea looks good to me, I need to learn what the bpf needs for review.
> BTW, if it is so easily to compose the byte code in perf, can we do it in the
> kernel too? I'd like to check the possibility of replacing the old fetch_arg
> functions with these byte code for optimizing performance and reduce redundant
> functionality. :)
Excellent idea! ;-)
- Arnaldo
On 2015/5/6 6:31, Alexei Starovoitov wrote:
> On 5/5/15 3:10 AM, He Kuang wrote:
>> This patch set is based on https://lkml.org/lkml/2015/4/30/264
>>
>> By using bpf 'config' section like this:
>>
>> char _config2[] SEC("config") = "generic_perform_write=generic_perform_write+122 file->f_mapping->a_ops bytes offset";
>> SEC("generic_perform_write")
>> int NODE_generic_perform_write (struct pt_regs *ctx, void *a_ops, void *bytes, void* offset) {
>> char fmt[] = "NODE_generic_perform_write, a_ops=%p, bytes=%p, offset=%p\n";
>> bpf_trace_printk(fmt, sizeof(fmt), a_ops, bytes, offset);
>> return 1;
>> }
>>
>> In this example, 'bytes' and 'offset' are local variables, a_ops is in
>> the structure field of file parameter, and we probe in the body of the
>> generic_perform_write() function.
>>
>> Perf can fetch and convert all the arguments and then we translate them
>> into bpf bytecode as a prologue before calling bpf body functions. In
>> the prologue, we fetch arguments from bpf context register and place
>> them according to bpf calling conventions so the body function can
>> access them as formal parameters.
>
> great idea! Like it a lot.
> Looking at current implementation I think the limit is <=3 arguments,
> which I think is perfectly fine for now. Just worth mentioning in
> the doc.
>
> Two high level comments:
> - can you collapse SEC("config") with SEC("func_name") ?
> It seems that "func_name" is only used as reference inside "config".
> I understand that you're proposing one "config" section where multiple
> descriptions are strcat together, but why? Something like:
> SEC("kprobe/generic_perform_write+122(file->f_mapping->a_ops, bytes, offset)")
> int func(...) { ... }
> should be enough and more concise.
>
Is it possible to use such a long section name? I introduce 'config' section
since it contains C strings so I can put things to it freely. By using macro trick,
we can still use not very complex code to describe probing position like this:
#define PROBE(name, config) \
SEC("config") char name##_config[] = #name config ; \
SEC(#name)
PROBE(generic_perform_write, "kprobe: +122(file->f_mapping->a_ops, bytes, offset)")
On 5/5/15 8:58 PM, Wang Nan wrote:
>> Two high level comments:
>> - can you collapse SEC("config") with SEC("func_name") ?
>> It seems that "func_name" is only used as reference inside "config".
>> I understand that you're proposing one "config" section where multiple
>> descriptions are strcat together, but why? Something like:
>> SEC("kprobe/generic_perform_write+122(file->f_mapping->a_ops, bytes, offset)")
>> int func(...) { ... }
>> should be enough and more concise.
>>
>
> Is it possible to use such a long section name? I introduce 'config' section
yes. of course. I don't know what is the limit, but it's definitely
above 512 characters. It can contains spaces and special chars too.
> since it contains C strings so I can put things to it freely. By using macro trick,
> we can still use not very complex code to describe probing position like this:
>
> #define PROBE(name, config) \
> SEC("config") char name##_config[] = #name config ; \
> SEC(#name)
> PROBE(generic_perform_write, "kprobe: +122(file->f_mapping->a_ops, bytes, offset)")
that's even more obscure :( why hide it behind macros?
I think single 'SEC' macro is already not very clean, but I couldn't
come up with better alternative. Elf sections are free that why I used
them in samples/bpf/ examples, but let's not overuse them.
On 2015/5/6 12:10, Alexei Starovoitov wrote:
> On 5/5/15 8:58 PM, Wang Nan wrote:
>>> Two high level comments:
>>> - can you collapse SEC("config") with SEC("func_name") ?
>>> It seems that "func_name" is only used as reference inside "config".
>>> I understand that you're proposing one "config" section where multiple
>>> descriptions are strcat together, but why? Something like:
>>> SEC("kprobe/generic_perform_write+122(file->f_mapping->a_ops, bytes, offset)")
>>> int func(...) { ... }
>>> should be enough and more concise.
>>>
>>
>> Is it possible to use such a long section name? I introduce 'config' section
>
> yes. of course. I don't know what is the limit, but it's definitely
> above 512 characters. It can contains spaces and special chars too.
>
Good. Let's get rid of config section.
On 2015/05/05 19:10, He Kuang wrote:
> Convert register number in debuginfo to its index in pt_regs.
>
This seems better introduced after 3/6, since 3/6 is generic
framework. Without 3/6, this can't do anything.
> Signed-off-by: He Kuang <[email protected]>
> ---
> tools/perf/arch/x86/util/dwarf-regs.c | 31 +++++++++++++++++++++++++++++++
> 1 file changed, 31 insertions(+)
>
> diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c
> index be22dd4..e586a47 100644
> --- a/tools/perf/arch/x86/util/dwarf-regs.c
> +++ b/tools/perf/arch/x86/util/dwarf-regs.c
> @@ -59,10 +59,31 @@ const char *x86_64_regs_table[X86_64_MAX_REGS] = {
> "%r15",
> };
>
> +const char x86_64_pt_regs_table[X86_64_MAX_REGS] = {
> + 10,
> + 12,
> + 11,
> + 5,
> + 13,
> + 14,
> + 4,
> + 19,
> + 9,
> + 8,
> + 7,
> + 6,
> + 3,
> + 2,
> + 1,
> + 0,
> +};
> +
> /* TODO: switching by dwarf address size */
> #ifdef __x86_64__
> #define ARCH_MAX_REGS X86_64_MAX_REGS
> #define arch_regs_table x86_64_regs_table
> +#define arch_pt_regs_table x86_64_pt_regs_table
> +#define arch_pt_regs_type unsigned long
If you need to use this as a type, please use typedef in arch header.
> #else
> #define ARCH_MAX_REGS X86_32_MAX_REGS
> #define arch_regs_table x86_32_regs_table
> @@ -73,3 +94,13 @@ const char *get_arch_regstr(unsigned int n)
> {
> return (n <= ARCH_MAX_REGS) ? arch_regs_table[n] : NULL;
> }
> +
> +char get_arch_pt_regs(unsigned int n)
Returning char is meaningless. Also this seems returning an index or
offset of pt_regs. If so, get_arch_regoffs() will be better name.
> +{
> + return (n <= ARCH_MAX_REGS) ? arch_pt_regs_table[n] : -1;
> +}
> +
> +int get_arch_pt_regs_size(void)
> +{
> + return sizeof(arch_pt_regs_type);
> +}
No, don't do that, since this always returns same value.
Thank you,
--
Masami HIRAMATSU
Linux Technology Research Center, System Productivity Research Dept.
Center for Technology Innovation - Systems Engineering
Hitachi, Ltd., Research & Development Group
E-mail: [email protected]