Previous patch v2 url:
https://lkml.org/lkml/2015/7/22/104
This patchset allows users to read PMU events in the following way:
1. Open the PMU using perf_event_open() (for each CPU or for
each process he/she'd like to watch);
2. Create a BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map;
3. Insert FDs into the map with some key-value mapping scheme
(i.e. cpuid -> event on that CPU);
4. Load and attach eBPF programs as usual;
5. In eBPF program, get the perf_event_map_fd and index (i.e.
cpuid obtained from bpf_get_smp_processor_id()) then use
bpf_perf_event_read() to read from it.
6. Do anything he/she wants.
changes in V3:
- collapse V2 patches 1-3 into one;
- drop the function map->ops->map_traverse_elem() and release
the struct perf_event in map_free;
- only allow to access bpf_perf_event_read() from programs;
- update the perf_event_array_map elem via xchg();
- pass index directly to bpf_perf_event_read() instead of
MAP_KEY;
changes in V2:
- put atomic_long_inc_not_zero() between fdget() and fdput();
- limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
- Only read the event counter on current CPU or on current
process;
- add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the
pointer to the struct perf_event;
- according to the perf_event_map_fd and key, the function
bpf_perf_event_read() can get the Hardware PMU counter value;
Patch 3/3 is a simple example and shows how to use this new eBPF
program ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace(trace_pipe).(the cycles PMU
value when 'kprobe/sys_write' sampling)
$ cat /sys/kernel/debug/tracing/trace_pipe
$ ./tracex6
...
cat-674 [000] d..1 146.413405: : CPU-0 2558223
<...>-699 [003] d..1 146.413441: : CPU-3 2663985
cat-674 [000] d..1 146.413480: : CPU-0 2659705
<...>-699 [003] d..1 146.413516: : CPU-3 2765199
cat-674 [000] d..1 146.413555: : CPU-0 2761277
<...>-699 [003] d..1 146.413600: : CPU-3 2877051
cat-674 [000] d..1 146.413651: : CPU-0 2889668
<...>-699 [003] d..1 146.413696: : CPU-3 3006447
cat-674 [000] d..1 146.413749: : CPU-0 3021285
<...>-699 [003] d..1 146.413787: : CPU-3 3131459
cat-674 [000] d..1 146.413838: : CPU-0 3141959
<...>-699 [003] d..1 146.413886: : CPU-3 3264188
cat-674 [000] d..1 146.413930: : CPU-0 3266461
<...>-699 [003] d..1 146.413973: : CPU-3 3381038
cat-674 [000] d..1 146.414025: : CPU-0 3393730
<...>-699 [003] d..1 146.414071: : CPU-3 3514676
...
The details of the patches are as follows:
Patch 1/3 introduces a new bpf map type. This map only stores the
pointer to struct perf_event;
Patch 2/3 implements function bpf_perf_event_read() that gets the selected
hardware PMU counter;
Patch 3/3 gives a simple example.
Kaixu Xia (3):
bpf: Add new bpf map type to store the pointer to struct perf_event
bpf: Implement function bpf_perf_event_read() that gets the selected
hardware PMU counter
samples/bpf: example of get selected PMU counter value
include/linux/bpf.h | 3 ++
include/linux/perf_event.h | 5 +-
include/uapi/linux/bpf.h | 2 +
kernel/bpf/arraymap.c | 113 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/helpers.c | 36 +++++++++++++++
kernel/bpf/verifier.c | 15 ++++++
kernel/events/core.c | 27 ++++++++++-
kernel/trace/bpf_trace.c | 2 +
samples/bpf/Makefile | 4 ++
samples/bpf/bpf_helpers.h | 2 +
samples/bpf/tracex6_kern.c | 26 +++++++++++
samples/bpf/tracex6_user.c | 67 +++++++++++++++++++++++++++
12 files changed, 299 insertions(+), 3 deletions(-)
create mode 100644 samples/bpf/tracex6_kern.c
create mode 100644 samples/bpf/tracex6_user.c
--
1.8.3.4
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores the pointer to struct perf_event. The
user space event FDs from perf_event_open() syscall are converted
to the pointer to struct perf_event and stored in map.
Signed-off-by: Kaixu Xia <[email protected]>
---
include/linux/bpf.h | 2 +
include/linux/perf_event.h | 2 +
include/uapi/linux/bpf.h | 1 +
kernel/bpf/arraymap.c | 113 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/verifier.c | 15 ++++++
kernel/events/core.c | 23 +++++++++
6 files changed, 156 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..9cf74c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
#include <uapi/linux/bpf.h>
#include <linux/workqueue.h>
#include <linux/file.h>
+#include <linux/perf_event.h>
struct bpf_map;
@@ -143,6 +144,7 @@ struct bpf_array {
union {
char value[0] __aligned(8);
struct bpf_prog *prog[0] __aligned(8);
+ struct perf_event *events[0] __aligned(8);
};
};
#define MAX_TAIL_CALL_CNT 32
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..2ea4067 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,7 @@ extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
@@ -979,6 +980,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
+static struct perf_event *perf_event_get(unsigned int fd) { return NULL; }
static inline void perf_event_print_debug(void) { }
static inline int perf_event_task_disable(void) { return -EINVAL; }
static inline int perf_event_task_enable(void) { return -EINVAL; }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..69a1f6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..e97efbc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -255,3 +255,116 @@ static int __init register_prog_array_map(void)
return 0;
}
late_initcall(register_prog_array_map);
+
+static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
+{
+ /* only the pointer to struct perf_event can be stored in
+ * perf_event_array map
+ */
+ if (attr->value_size != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+
+ return array_map_alloc(attr);
+}
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+ int i;
+
+ synchronize_rcu();
+
+ /* release the struct perf_event in perf_event_array_map */
+ for(i = 0; i < array->map.max_entries; i++) {
+ event = array->events[i];
+ if (event)
+ perf_event_release_kernel(event);
+ }
+ kvfree(array);
+}
+
+static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static struct perf_event *convert_map_with_perf_event(void *value)
+{
+ struct perf_event *event;
+ u32 fd;
+
+ fd = *(u32 *)value;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return NULL;
+
+ /* limit the event type to PERF_TYPE_RAW
+ * and PERF_TYPE_HARDWARE.
+ */
+ if (event->attr.type != PERF_TYPE_RAW &&
+ event->attr.type != PERF_TYPE_HARDWARE)
+ return NULL;
+
+ return event;
+}
+
+/* only called from syscall */
+static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+ u32 index = *(u32 *)key;
+
+ if (map_flags != BPF_ANY)
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ /* check if the value is already stored */
+ if (array->events[index])
+ return -EINVAL;
+
+ /* convert the fd to the pointer to struct perf_event */
+ event = convert_map_with_perf_event(value);
+ if (!event)
+ return -EBADF;
+
+ xchg(array->events + index, event);
+ return 0;
+}
+
+static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = perf_event_array_map_alloc,
+ .map_free = perf_event_array_map_free,
+ .map_get_next_key = perf_event_array_map_get_next_key,
+ .map_lookup_elem = perf_event_array_map_lookup_elem,
+ .map_update_elem = perf_event_array_map_update_elem,
+ .map_delete_elem = perf_event_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866..c70f7e7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -924,6 +924,21 @@ static int check_call(struct verifier_env *env, int func_id)
*/
return -EINVAL;
+ if (map && map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+ func_id != BPF_FUNC_perf_event_read)
+ /* perf_event_array map type needs extra care:
+ * only allow to pass it into bpf_perf_event_read() for now.
+ * bpf_map_update/delete_elem() must only be done via syscall
+ */
+ return -EINVAL;
+
+ if (func_id == BPF_FUNC_perf_event_read &&
+ map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ /* don't allow any other map type to be passed into
+ * bpf_perf_event_read()
+ */
+ return -EINVAL;
+
return 0;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae34..08cb467 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8574,6 +8574,29 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
+struct perf_event *perf_event_get(unsigned int fd)
+{
+ struct perf_event *event;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ event = f.file->private_data;
+
+ atomic_long_inc(&event->refcount);
+ fdput(f);
+
+ return event;
+}
+
/*
* inherit a event from parent task to child task:
*/
--
1.8.3.4
According to the perf_event_map_fd and index, the function
bpf_perf_event_read() can convert the corresponding map
value to the pointer to struct perf_event and return the
Hardware PMU counter value.
Signed-off-by: Kaixu Xia <[email protected]>
---
include/linux/bpf.h | 1 +
include/linux/perf_event.h | 3 ++-
include/uapi/linux/bpf.h | 1 +
kernel/bpf/helpers.c | 36 ++++++++++++++++++++++++++++++++++++
kernel/events/core.c | 4 ++--
kernel/trace/bpf_trace.c | 2 ++
6 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9cf74c0..0954b8f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
extern const struct bpf_func_proto bpf_map_delete_elem_proto;
+extern const struct bpf_func_proto bpf_perf_event_read_proto;
extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2ea4067..899abcb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -662,7 +662,8 @@ extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
-
+extern void __perf_event_read(void *info);
+extern u64 perf_event_count(struct perf_event *event);
struct perf_sample_data {
/*
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69a1f6b..b9b13ce 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -250,6 +250,7 @@ enum bpf_func_id {
* Return: 0 on success
*/
BPF_FUNC_get_current_comm,
+ BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */
__BPF_FUNC_MAX_ID,
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec0..aab219d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -182,3 +182,39 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE,
};
+
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ event = array->events[index];
+ if (!event)
+ return -EBADF;
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return -ENOENT;
+
+ if (event->oncpu != raw_smp_processor_id() &&
+ event->ctx->task != current)
+ return -EINVAL;
+
+ if (event->attr.inherit)
+ return -EINVAL;
+
+ __perf_event_read(event);
+
+ return perf_event_count(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+ .func = bpf_perf_event_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 08cb467..c59d9c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3177,7 +3177,7 @@ void perf_event_exec(void)
/*
* Cross CPU call to read the hardware event
*/
-static void __perf_event_read(void *info)
+void __perf_event_read(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -3204,7 +3204,7 @@ static void __perf_event_read(void *info)
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+u64 perf_event_count(struct perf_event *event)
{
if (event->pmu->count)
return event->pmu->count(event);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041a..9cf094f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -183,6 +183,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
default:
return NULL;
}
--
1.8.3.4
This is a simple example and shows how to use the new ability
to get the selected Hardware PMU counter value.
Signed-off-by: Kaixu Xia <[email protected]>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_helpers.h | 2 ++
samples/bpf/tracex6_kern.c | 26 ++++++++++++++++++
samples/bpf/tracex6_user.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 99 insertions(+)
create mode 100644 samples/bpf/tracex6_kern.c
create mode 100644 samples/bpf/tracex6_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4450fed..63e7d50 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex2
hostprogs-y += tracex3
hostprogs-y += tracex4
hostprogs-y += tracex5
+hostprogs-y += tracex6
hostprogs-y += lathist
test_verifier-objs := test_verifier.o libbpf.o
@@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
+tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
lathist-objs := bpf_load.o libbpf.o lathist_user.o
# Tell kbuild to always build the programs
@@ -37,6 +39,7 @@ always += tracex2_kern.o
always += tracex3_kern.o
always += tracex4_kern.o
always += tracex5_kern.o
+always += tracex6_kern.o
always += tcbpf1_kern.o
always += lathist_kern.o
@@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
HOSTLOADLIBES_tracex4 += -lelf -lrt
HOSTLOADLIBES_tracex5 += -lelf
+HOSTLOADLIBES_tracex6 += -lelf
HOSTLOADLIBES_lathist += -lelf
# point this to your LLVM backend with bpf support
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index bdf1c16..c8a3594 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_perf_event_read)(void *map, int index) =
+ (void *) BPF_FUNC_perf_event_read;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 0000000..d213161
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,26 @@
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(unsigned long),
+ .max_entries = 32,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ u64 count;
+ u32 key = bpf_get_smp_processor_id();
+ char fmt[] = "CPU-%d %llu\n";
+
+ count = bpf_perf_event_read(&my_map, &key);
+ bpf_trace_printk(fmt, sizeof(fmt), key, count);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 0000000..30307c9
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,67 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static void test_bpf_perf_event(void)
+{
+ int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ int *pmu_fd = malloc(nr_cpus * sizeof(int));
+ unsigned long value;
+ int i;
+
+ struct perf_event_attr attr_insn_pmu = {
+ .freq = 0,
+ .sample_period = 0x7fffffffffffffffULL,
+ .inherit = 0,
+ .type = PERF_TYPE_HARDWARE,
+ .read_format = 0,
+ .sample_type = 0,
+ .config = 0,/* PMU: cycles */
+ };
+
+ for(i = 0; i < nr_cpus; i++) {
+ pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+ if (pmu_fd[i] < 0)
+ printf("event syscall failed ****\n");
+
+ bpf_update_elem(map_fd[0], &i, (pmu_fd + i), BPF_ANY);
+
+ ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+ }
+
+ system("ls");
+ system("pwd");
+ system("sleep 2");
+
+ for(i = 0; i < nr_cpus; i++)
+ close(pmu_fd[i]);
+
+ close(map_fd);
+
+ free(pmu_fd);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ test_bpf_perf_event();
+
+ return 0;
+}
--
1.8.3.4
On 7/23/15 2:42 AM, Kaixu Xia wrote:
> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
> This map only stores the pointer to struct perf_event. The
> user space event FDs from perf_event_open() syscall are converted
> to the pointer to struct perf_event and stored in map.
...
> +static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
> +{
> + /* only the pointer to struct perf_event can be stored in
> + * perf_event_array map
> + */
> + if (attr->value_size != sizeof(u32))
> + return ERR_PTR(-EINVAL);
> +
> + return array_map_alloc(attr);
> +}
since it's exactly the same as prog_array_map_alloc(),
just rename it to something like 'fd_array_map_alloc'
and use for both types.
> +static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
> + void *next_key)
> +{
> + return -EINVAL;
> +}
> +
> +static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> + return NULL;
> +}
same for the above two.
rename prog_array_map_* into fd_array_map_* and use for both map types.
> +static struct perf_event *convert_map_with_perf_event(void *value)
> +{
> + struct perf_event *event;
> + u32 fd;
> +
> + fd = *(u32 *)value;
> +
> + event = perf_event_get(fd);
> + if (IS_ERR(event))
> + return NULL;
don't lose error code, do 'return event' instead.
> +
> + /* limit the event type to PERF_TYPE_RAW
> + * and PERF_TYPE_HARDWARE.
> + */
> + if (event->attr.type != PERF_TYPE_RAW &&
> + event->attr.type != PERF_TYPE_HARDWARE)
> + return NULL;
perf_event refcnt leak? need to do put_event.
and return ERR_PTR(-EINVAL)
> +
> + return event;
> +}
> +
> +/* only called from syscall */
> +static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
> + void *value, u64 map_flags)
> +{
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + struct perf_event *event;
> + u32 index = *(u32 *)key;
> +
> + if (map_flags != BPF_ANY)
> + return -EINVAL;
> +
> + if (index >= array->map.max_entries)
> + return -E2BIG;
> +
> + /* check if the value is already stored */
> + if (array->events[index])
> + return -EINVAL;
> +
> + /* convert the fd to the pointer to struct perf_event */
> + event = convert_map_with_perf_event(value);
imo helper name is misleading and it's too short to be separate
function. Just inline it and you can reuse 'index' variable.
> + if (!event)
> + return -EBADF;
> +
> + xchg(array->events + index, event);
refcnt leak of old event! Please think it through.
This type of bugs I shouldn't be finding.
> +static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
> +{
> + return -EINVAL;
> +}
no way to dec refcnt of perf_event from user space?
why not to do the same as prog_array_delete?
On 7/23/15 2:42 AM, Kaixu Xia wrote:
> According to the perf_event_map_fd and index, the function
> bpf_perf_event_read() can convert the corresponding map
> value to the pointer to struct perf_event and return the
> Hardware PMU counter value.
>
> Signed-off-by: Kaixu Xia <[email protected]>
...
> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
> +{
> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + struct perf_event *event;
> +
> + if (index >= array->map.max_entries)
> + return -E2BIG;
> +
> + event = array->events[index];
> + if (!event)
> + return -EBADF;
probably ENOENT makes more sense here.
> +
> + if (event->state != PERF_EVENT_STATE_ACTIVE)
> + return -ENOENT;
and -EINVAL here?
On 7/23/15 2:42 AM, Kaixu Xia wrote:
> This is a simple example and shows how to use the new ability
> to get the selected Hardware PMU counter value.
>
> Signed-off-by: Kaixu Xia <[email protected]>
...
> +struct bpf_map_def SEC("maps") my_map = {
> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> + .key_size = sizeof(int),
> + .value_size = sizeof(unsigned long),
> + .max_entries = 32,
> +};
wait. how did it work here? value_size should be u32.
于 2015/7/24 6:59, Alexei Starovoitov 写道:
> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>> This is a simple example and shows how to use the new ability
>> to get the selected Hardware PMU counter value.
>>
>> Signed-off-by: Kaixu Xia <[email protected]>
> ...
>> +struct bpf_map_def SEC("maps") my_map = {
>> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>> + .key_size = sizeof(int),
>> + .value_size = sizeof(unsigned long),
>> + .max_entries = 32,
>> +};
>
> wait. how did it work here? value_size should be u32.
I tested the whole thing on an ARM board. You are right, it
should be u32.
When creating the array map, we choose array->elem_size as
round_up(attr->value_size, 8) — why 8?
Thanks!
>
>
>
于 2015/7/24 6:56, Alexei Starovoitov 写道:
> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>> According to the perf_event_map_fd and index, the function
>> bpf_perf_event_read() can convert the corresponding map
>> value to the pointer to struct perf_event and return the
>> Hardware PMU counter value.
>>
>> Signed-off-by: Kaixu Xia <[email protected]>
> ...
>> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
>> +{
>> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
>> + struct bpf_array *array = container_of(map, struct bpf_array, map);
>> + struct perf_event *event;
>> +
>> + if (index >= array->map.max_entries)
>> + return -E2BIG;
>> +
>> + event = array->events[index];
>> + if (!event)
>> + return -EBADF;
>
> probably ENOENT makes more sense here.
>
>> +
>> + if (event->state != PERF_EVENT_STATE_ACTIVE)
>> + return -ENOENT;
>
> and -EINVAL here?
Yeah, the errno is better.
Thanks!
>
>
> .
>
于 2015/7/24 6:54, Alexei Starovoitov 写道:
> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
>> This map only stores the pointer to struct perf_event. The
>> user space event FDs from perf_event_open() syscall are converted
>> to the pointer to struct perf_event and stored in map.
> ...
>> +static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
>> +{
>> + /* only the pointer to struct perf_event can be stored in
>> + * perf_event_array map
>> + */
>> + if (attr->value_size != sizeof(u32))
>> + return ERR_PTR(-EINVAL);
>> +
>> + return array_map_alloc(attr);
>> +}
>
> since it's exactly the same as prog_array_map_alloc(),
> just rename it to something like 'fd_array_map_alloc'
> and use for both types.
>
>> +static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
>> + void *next_key)
>> +{
>> + return -EINVAL;
>> +}
>> +
>> +static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
>> +{
>> + return NULL;
>> +}
>
> same for the above two.
> rename prog_array_map_* into fd_array_map_* and use for both map types.
>
>> +static struct perf_event *convert_map_with_perf_event(void *value)
>> +{
>> + struct perf_event *event;
>> + u32 fd;
>> +
>> + fd = *(u32 *)value;
>> +
>> + event = perf_event_get(fd);
>> + if (IS_ERR(event))
>> + return NULL;
>
> don't lose error code, do 'return event' instead.
>
>> +
>> + /* limit the event type to PERF_TYPE_RAW
>> + * and PERF_TYPE_HARDWARE.
>> + */
>> + if (event->attr.type != PERF_TYPE_RAW &&
>> + event->attr.type != PERF_TYPE_HARDWARE)
>> + return NULL;
>
> perf_event refcnt leak? need to do put_event.
> and return ERR_PTR(-EINVAL)
>
>> +
>> + return event;
>> +}
>> +
>> +/* only called from syscall */
>> +static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
>> + void *value, u64 map_flags)
>> +{
>> + struct bpf_array *array = container_of(map, struct bpf_array, map);
>> + struct perf_event *event;
>> + u32 index = *(u32 *)key;
>> +
>> + if (map_flags != BPF_ANY)
>> + return -EINVAL;
>> +
>> + if (index >= array->map.max_entries)
>> + return -E2BIG;
>> +
>> + /* check if the value is already stored */
>> + if (array->events[index])
>> + return -EINVAL;
>> +
>> + /* convert the fd to the pointer to struct perf_event */
>> + event = convert_map_with_perf_event(value);
>
> imo helper name is misleading and it's too short to be separate
> function. Just inline it and you can reuse 'index' variable.
>
>> + if (!event)
>> + return -EBADF;
>> +
>> + xchg(array->events + index, event);
>
> refcnt leak of old event! Please think it through.
> This type of bugs I shouldn't be finding.
Maybe the commit message is not elaborate enough. Here I prevent
user space from updating an existing event, so the return
value of xchg() is NULL and there is no refcnt leak of the old event.
I will do the same as prog_array in the next version.
>
>> +static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
>> +{
>> + return -EINVAL;
>> +}
>
> no way to dec refcnt of perf_event from user space?
> why not to do the same as prog_array_delete?
Will follow them in V4.
>
>
> .
>
On 7/23/15 6:54 PM, xiakaixu wrote:
> 于 2015/7/24 6:59, Alexei Starovoitov 写道:
>> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>>> This is a simple example and shows how to use the new ability
>>> to get the selected Hardware PMU counter value.
>>>
>>> Signed-off-by: Kaixu Xia <[email protected]>
>> ...
>>> +struct bpf_map_def SEC("maps") my_map = {
>>> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>>> + .key_size = sizeof(int),
>>> + .value_size = sizeof(unsigned long),
>>> + .max_entries = 32,
>>> +};
>>
>> wait. how did it work here? value_size should be u32.
>
> I tested the whole thing on ARM board. You are ringt, it
> should be u32.
> When create the array map, we choose the array->elem_size as
> round_up(attr->value_size, 8), why 8?
because from user space point of view we're storing FDs
which are u32, but kernel stores pointers.
but round_up(attr->value_size, 8) is done because there
can be 8 byte fields in there and we have 8-byte load/store insns.
So whether pointer is 32 or 64-bit they still fit.
On 7/23/15 7:22 PM, xiakaixu wrote:
>>> + /* check if the value is already stored */
>>> >>+ if (array->events[index])
>>> >>+ return -EINVAL;
>>> >>+
>>> >>+ /* convert the fd to the pointer to struct perf_event */
>>> >>+ event = convert_map_with_perf_event(value);
>> >
>> >imo helper name is misleading and it's too short to be separate
>> >function. Just inline it and you can reuse 'index' variable.
>> >
>>> >>+ if (!event)
>>> >>+ return -EBADF;
>>> >>+
>>> >>+ xchg(array->events + index, event);
>> >
>> >refcnt leak of old event! Please think it through.
>> >This type of bugs I shouldn't be finding.
> Maybe the commit message is not elaborate. Here I prevent
> user space from updating the existed event, so the return
> value of xchg() is NULL and no refcnt leak of old event.
> I will do the same as prog_array in next version.
I see then it's even worse.
You think that above check:
+ if (array->events[index])
+ return -EINVAL;
will protect the double insert?
It won't, since there are no locks here.
You can have two processes both seeing empty slot and
racing to do xchg.
On Thu, Jul 23, 2015 at 09:42:41AM +0000, Kaixu Xia wrote:
> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
> +{
> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + struct perf_event *event;
> +
> + if (index >= array->map.max_entries)
> + return -E2BIG;
> +
> + event = array->events[index];
> + if (!event)
> + return -EBADF;
> +
> + if (event->state != PERF_EVENT_STATE_ACTIVE)
> + return -ENOENT;
> +
> + if (event->oncpu != raw_smp_processor_id() &&
> + event->ctx->task != current)
> + return -EINVAL;
> +
> + if (event->attr.inherit)
> + return -EINVAL;
> +
> + __perf_event_read(event);
> +
> + return perf_event_count(event);
> +}
Please no poking of event internal state outside of perf code.
On Thu, Jul 23, 2015 at 09:42:40AM +0000, Kaixu Xia wrote:
> +static struct perf_event *convert_map_with_perf_event(void *value)
> +{
> + struct perf_event *event;
> + u32 fd;
> +
> + fd = *(u32 *)value;
> +
> + event = perf_event_get(fd);
> + if (IS_ERR(event))
> + return NULL;
> +
> + /* limit the event type to PERF_TYPE_RAW
> + * and PERF_TYPE_HARDWARE.
> + */
> + if (event->attr.type != PERF_TYPE_RAW &&
> + event->attr.type != PERF_TYPE_HARDWARE)
> + return NULL;
Aside from the ref-leak already mentioned; please introduce something
like:
const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
To avoid having to poke inside of the event outside of perf code.
> +
> + return event;
> +}
于 2015/8/3 17:34, Peter Zijlstra 写道:
> On Thu, Jul 23, 2015 at 09:42:41AM +0000, Kaixu Xia wrote:
>> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
>> +{
>> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
>> + struct bpf_array *array = container_of(map, struct bpf_array, map);
>> + struct perf_event *event;
>> +
>> + if (index >= array->map.max_entries)
>> + return -E2BIG;
>> +
>> + event = array->events[index];
>> + if (!event)
>> + return -EBADF;
>> +
>> + if (event->state != PERF_EVENT_STATE_ACTIVE)
>> + return -ENOENT;
>> +
>> + if (event->oncpu != raw_smp_processor_id() &&
>> + event->ctx->task != current)
>> + return -EINVAL;
>> +
>> + if (event->attr.inherit)
>> + return -EINVAL;
>> +
>> + __perf_event_read(event);
>> +
>> + return perf_event_count(event);
>> +}
>
> Please no poking of event internal state outside of perf code.
Thanks for your review. I will move it to kernel/events/core.c.
>
> .
>