Previous patch v1 url:
https://lkml.org/lkml/2015/7/17/287
This patchset allows users to read PMU events from eBPF programs in
the following way:
1. Open the PMU using perf_event_open() (once for each CPU or for
each process he/she'd like to watch);
2. Create a BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map;
3. Insert the FDs into the map with some key-value mapping scheme
(i.e. cpuid -> event on that CPU);
4. Load and attach eBPF programs as usual;
5. In the eBPF program, get the perf_event_map_fd and key (i.e. the
cpuid obtained from bpf_get_smp_processor_id()), then use
bpf_perf_event_read() to read from it;
6. Do whatever he/she wants with the value.
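A minimal user-space sketch of steps 1-3, illustrative only: it uses the
bpf_create_map()/bpf_update_elem() wrappers from samples/bpf/libbpf and a
raw perf_event_open syscall, error handling is elided, and patch 5/5 below
carries the complete tracex6 example:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/bpf.h>
#include "libbpf.h"	/* samples/bpf wrappers: bpf_create_map(), bpf_update_elem() */

int main(void)
{
	int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
	struct perf_event_attr attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
	};
	int map_fd, key;

	/* step 2: value_size must be pointer-sized for this map type */
	map_fd = bpf_create_map(BPF_MAP_TYPE_PERF_EVENT_ARRAY,
				sizeof(int), sizeof(void *), nr_cpus);

	for (key = 0; key < nr_cpus; key++) {
		/* step 1: one hardware cycles counter per CPU */
		unsigned long value = syscall(__NR_perf_event_open, &attr,
					      -1 /* pid */, key /* cpu */,
					      -1 /* group_fd */, 0);

		/* step 3: cpuid -> event on that CPU */
		bpf_update_elem(map_fd, &key, &value, BPF_ANY);
	}

	/* steps 4-6: load the program and read the counters, as in tracex6 */
	return 0;
}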
Changes in V2:
- put atomic_long_inc_not_zero() between fdget() and fdput();
- limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
- only read the event counter on the current CPU or for the current
process;
- add the new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store
pointers to struct perf_event;
- add the function bpf_perf_event_read(), which takes the
perf_event_map_fd and a key and returns the hardware PMU counter
value.
Patch 5/5 is a simple example that shows how to use this new eBPF
program ability. The PMU counter data can be read from
/sys/kernel/debug/tracing/trace (or trace_pipe); below is the cycles
PMU value sampled while 'kprobe/sys_write' fires:
$ cat /sys/kernel/debug/tracing/trace_pipe
$ ./tracex6
...
cat-677 [002] d..1 210.299270: : bpf count: CPU-2 5316659
cat-677 [002] d..1 210.299316: : bpf count: CPU-2 5378639
cat-677 [002] d..1 210.299362: : bpf count: CPU-2 5440654
cat-677 [002] d..1 210.299408: : bpf count: CPU-2 5503211
cat-677 [002] d..1 210.299454: : bpf count: CPU-2 5565438
cat-677 [002] d..1 210.299500: : bpf count: CPU-2 5627433
cat-677 [002] d..1 210.299547: : bpf count: CPU-2 5690033
cat-677 [002] d..1 210.299593: : bpf count: CPU-2 5752184
cat-677 [002] d..1 210.299639: : bpf count: CPU-2 5814543
<...>-548 [009] d..1 210.299667: : bpf count: CPU-9 605418074
<...>-548 [009] d..1 210.299692: : bpf count: CPU-9 605452692
cat-677 [002] d..1 210.299700: : bpf count: CPU-2 5896319
<...>-548 [009] d..1 210.299710: : bpf count: CPU-9 605477824
<...>-548 [009] d..1 210.299728: : bpf count: CPU-9 605501726
<...>-548 [009] d..1 210.299745: : bpf count: CPU-9 605525279
<...>-548 [009] d..1 210.299762: : bpf count: CPU-9 605547817
<...>-548 [009] d..1 210.299778: : bpf count: CPU-9 605570433
<...>-548 [009] d..1 210.299795: : bpf count: CPU-9 605592743
...
The patches in detail:
Patch 1/5 introduces a new bpf map type. This map only stores the
pointer to struct perf_event;
Patch 2/5 introduces a map_traverse_elem() function for further use;
Patch 3/5 converts perf event file descriptors into struct perf_event
pointers when new elements are added to the map;
Patch 4/5 implements the function bpf_perf_event_read() that reads the
selected hardware PMU counter;
Patch 5/5 gives a simple example.
Kaixu Xia (5):
bpf: Add new bpf map type to store the pointer to struct perf_event
bpf: Add function map->ops->map_traverse_elem() to traverse map elems
bpf: Save the pointer to struct perf_event to map
bpf: Implement function bpf_perf_event_read() that gets the selected
hardware PMU counter
samples/bpf: example of reading the selected PMU counter value
include/linux/bpf.h | 6 +++
include/linux/perf_event.h | 5 ++-
include/uapi/linux/bpf.h | 3 ++
kernel/bpf/arraymap.c | 110 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/helpers.c | 42 +++++++++++++++++
kernel/bpf/syscall.c | 26 +++++++++++
kernel/events/core.c | 30 ++++++++++++-
kernel/trace/bpf_trace.c | 2 +
samples/bpf/Makefile | 4 ++
samples/bpf/bpf_helpers.h | 2 +
samples/bpf/tracex6_kern.c | 27 +++++++++++
samples/bpf/tracex6_user.c | 67 +++++++++++++++++++++++++++
12 files changed, 321 insertions(+), 3 deletions(-)
create mode 100644 samples/bpf/tracex6_kern.c
create mode 100644 samples/bpf/tracex6_user.c
--
1.8.3.4
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map will only store the pointer to struct perf_event.
Signed-off-by: Kaixu Xia <[email protected]>
---
include/linux/bpf.h | 2 ++
include/uapi/linux/bpf.h | 1 +
kernel/bpf/arraymap.c | 40 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 43 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..f6a2442 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -11,6 +11,8 @@
#include <linux/workqueue.h>
#include <linux/file.h>
+#define MAX_PERF_EVENT_ARRAY_ENTRY (2*NR_CPUS)
+
struct bpf_map;
/* map is generic key/value storage optionally accesible by eBPF programs */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..69a1f6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..183c1f7 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -255,3 +255,43 @@ static int __init register_prog_array_map(void)
return 0;
}
late_initcall(register_prog_array_map);
+
+static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
+{
+ /* only the pointer to struct perf_event can be stored in
+ * perf_event_array map
+ */
+ if (attr->value_size != sizeof(void *))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->max_entries > MAX_PERF_EVENT_ARRAY_ENTRY)
+ return ERR_PTR(-EINVAL);
+
+ return array_map_alloc(attr);
+}
+
+static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = perf_event_array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = perf_event_array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);
--
1.8.3.4
We want to traverse the map elements and make use
of the map values one by one, so add a new function,
map->ops->map_traverse_elem(), to traverse the map elements.
Signed-off-by: Kaixu Xia <[email protected]>
---
include/linux/bpf.h | 3 +++
kernel/bpf/arraymap.c | 17 +++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f6a2442..257149c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -15,6 +15,8 @@
struct bpf_map;
+typedef int (*bpf_map_traverse_callback)(void *value);
+
/* map is generic key/value storage optionally accesible by eBPF programs */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
@@ -26,6 +28,7 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
+ int (*map_traverse_elem)(bpf_map_traverse_callback func, struct bpf_map *map);
};
struct bpf_map {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 183c1f7..410bc40 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -276,12 +276,29 @@ static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
+static int perf_event_array_map_traverse_elem(bpf_map_traverse_callback func,
+ struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *value;
+ int i;
+
+ for(i = 0; i < array->map.max_entries; i++) {
+ value = array->value + array->elem_size * i;
+
+ func(value);
+ }
+
+ return 0;
+}
+
static const struct bpf_map_ops perf_event_array_ops = {
.map_alloc = perf_event_array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = perf_event_array_map_get_next_key,
.map_lookup_elem = array_map_lookup_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_traverse_elem = perf_event_array_map_traverse_elem,
};
static struct bpf_map_type_list perf_event_array_type __read_mostly = {
--
1.8.3.4
The user space event FDs from the perf_event_open() syscall are
converted to pointers to struct perf_event and stored in the map.
Signed-off-by: Kaixu Xia <[email protected]>
---
include/linux/perf_event.h | 2 ++
kernel/bpf/arraymap.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 26 +++++++++++++++++++++++
kernel/events/core.c | 26 +++++++++++++++++++++++
4 files changed, 107 insertions(+)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..2ea4067 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,7 @@ extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
@@ -979,6 +980,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
+static struct perf_event *perf_event_get(unsigned int fd) { return NULL; }
static inline void perf_event_print_debug(void) { }
static inline int perf_event_task_disable(void) { return -EINVAL; }
static inline int perf_event_task_enable(void) { return -EINVAL; }
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 410bc40..a7475ae 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
+#include <linux/perf_event.h>
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -276,6 +277,57 @@ static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
+static int replace_map_with_perf_event(void *value)
+{
+ struct perf_event *event;
+ u32 fd;
+
+ fd = *(u32 *)value;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ /* limit the event type to PERF_TYPE_RAW
+ * and PERF_TYPE_HARDWARE
+ */
+ if (event->attr.type != PERF_TYPE_RAW &&
+ event->attr.type != PERF_TYPE_HARDWARE)
+ return -EINVAL;
+
+ memcpy(value, &event, sizeof(struct perf_event *));
+
+ return 0;
+}
+
+static bool check_map_perf_event_stored(struct bpf_map *map, void *key)
+{
+ void *event;
+ bool is_stored = false;
+
+ rcu_read_lock();
+ event = array_map_lookup_elem(map, key);
+ if (event && (*(unsigned long *)event))
+ is_stored = true;
+ rcu_read_unlock();
+
+ return is_stored;
+}
+
+/* only called from syscall */
+static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ /* check if the value is already stored */
+ if (check_map_perf_event_stored(map, key))
+ return -EINVAL;
+
+ if (replace_map_with_perf_event(value))
+ return -EBADF;
+
+ return array_map_update_elem(map, key, value, map_flags);
+}
+
static int perf_event_array_map_traverse_elem(bpf_map_traverse_callback func,
struct bpf_map *map)
{
@@ -297,6 +349,7 @@ static const struct bpf_map_ops perf_event_array_ops = {
.map_free = array_map_free,
.map_get_next_key = perf_event_array_map_get_next_key,
.map_lookup_elem = array_map_lookup_elem,
+ .map_update_elem = perf_event_array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_traverse_elem = perf_event_array_map_traverse_elem,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d1..854f351 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -17,6 +17,7 @@
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
+#include <linux/perf_event.h>
static LIST_HEAD(bpf_map_types);
@@ -64,6 +65,19 @@ void bpf_map_put(struct bpf_map *map)
}
}
+static int bpf_map_perf_event_put(void *value)
+{
+ struct perf_event *event;
+
+ event = (struct perf_event *)(*(unsigned long *)value);
+ if (!event)
+ return -EBADF;
+
+ perf_event_release_kernel(event);
+
+ return 0;
+}
+
static int bpf_map_release(struct inode *inode, struct file *filp)
{
struct bpf_map *map = filp->private_data;
@@ -74,6 +88,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
*/
bpf_prog_array_map_clear(map);
+ if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+ if (!map->ops->map_traverse_elem)
+ return -EPERM;
+
+ rcu_read_lock();
+ if (map->ops->map_traverse_elem(bpf_map_perf_event_put, map) < 0) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ rcu_read_unlock();
+ }
+
bpf_map_put(map);
return 0;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae34..14a9924 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8574,6 +8574,32 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
+struct perf_event *perf_event_get(unsigned int fd)
+{
+ struct perf_event *event;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ event = f.file->private_data;
+
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ fdput(f);
+ return ERR_PTR(-ENOENT);
+ }
+
+ fdput(f);
+ return event;
+}
+
/*
* inherit a event from parent task to child task:
*/
--
1.8.3.4
According to the perf_event_map_fd and key, the function
bpf_perf_event_read() can convert the corresponding map
value to the pointer to struct perf_event and return the
Hardware PMU counter value.
The key can't be passed to bpf_perf_event_read() directly
because the function argument constraint is lacked.
Signed-off-by: Kaixu Xia <[email protected]>
---
include/linux/bpf.h | 1 +
include/linux/perf_event.h | 3 ++-
include/uapi/linux/bpf.h | 2 ++
kernel/bpf/helpers.c | 42 ++++++++++++++++++++++++++++++++++++++++++
kernel/events/core.c | 4 ++--
kernel/trace/bpf_trace.c | 2 ++
6 files changed, 51 insertions(+), 3 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 257149c..0fbff67 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -197,5 +197,6 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
+extern const struct bpf_func_proto bpf_perf_event_read_proto;
#endif /* _LINUX_BPF_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2ea4067..899abcb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -662,7 +662,8 @@ extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
-
+extern void __perf_event_read(void *info);
+extern u64 perf_event_count(struct perf_event *event);
struct perf_sample_data {
/*
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69a1f6b..e3bb181 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -250,6 +250,8 @@ enum bpf_func_id {
* Return: 0 on success
*/
BPF_FUNC_get_current_comm,
+
+ BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, &key) */
__BPF_FUNC_MAX_ID,
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec0..aef2640 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/uidgid.h>
+#include <linux/perf_event.h>
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -182,3 +183,44 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE,
};
+
+static u64 bpf_perf_event_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ struct perf_event *event;
+ void *ptr;
+
+ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ return -EINVAL;
+
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ rcu_read_unlock();
+ if (!ptr || !(*(unsigned long *)ptr))
+ return -EBADF;
+
+ event = (struct perf_event *)(*(unsigned long *)ptr);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return -ENOENT;
+
+ if (event->oncpu != raw_smp_processor_id() &&
+ event->ctx->task != current)
+ return -EINVAL;
+
+ if (event->attr.inherit)
+ return -EINVAL;
+
+ __perf_event_read(event);
+
+ return perf_event_count(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+ .func = bpf_perf_event_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 14a9924..b4bde27 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3177,7 +3177,7 @@ void perf_event_exec(void)
/*
* Cross CPU call to read the hardware event
*/
-static void __perf_event_read(void *info)
+void __perf_event_read(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -3204,7 +3204,7 @@ static void __perf_event_read(void *info)
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+u64 perf_event_count(struct perf_event *event)
{
if (event->pmu->count)
return event->pmu->count(event);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041a..9cf094f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -183,6 +183,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
default:
return NULL;
}
--
1.8.3.4
This is a simple example that shows how to use the new ability
to read the selected hardware PMU counter value.
Signed-off-by: Kaixu Xia <[email protected]>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_helpers.h | 2 ++
samples/bpf/tracex6_kern.c | 27 +++++++++++++++++++
samples/bpf/tracex6_user.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 100 insertions(+)
create mode 100644 samples/bpf/tracex6_kern.c
create mode 100644 samples/bpf/tracex6_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4450fed..63e7d50 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex2
hostprogs-y += tracex3
hostprogs-y += tracex4
hostprogs-y += tracex5
+hostprogs-y += tracex6
hostprogs-y += lathist
test_verifier-objs := test_verifier.o libbpf.o
@@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
+tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
lathist-objs := bpf_load.o libbpf.o lathist_user.o
# Tell kbuild to always build the programs
@@ -37,6 +39,7 @@ always += tracex2_kern.o
always += tracex3_kern.o
always += tracex4_kern.o
always += tracex5_kern.o
+always += tracex6_kern.o
always += tcbpf1_kern.o
always += lathist_kern.o
@@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
HOSTLOADLIBES_tracex4 += -lelf -lrt
HOSTLOADLIBES_tracex5 += -lelf
+HOSTLOADLIBES_tracex6 += -lelf
HOSTLOADLIBES_lathist += -lelf
# point this to your LLVM backend with bpf support
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index bdf1c16..b0037dd 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_perf_event_read)(void *map, void *key) =
+ (void *) BPF_FUNC_perf_event_read;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 0000000..9c3c6b2
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,27 @@
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* the max_entries can't be larger than 2*NR_CPUS */
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(unsigned long),
+ .max_entries = 32,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ u64 count;
+ u32 key = bpf_get_smp_processor_id();
+ char fmt[] = "bpf count: CPU-%d %llu\n";
+
+ count = bpf_perf_event_read(&my_map, &key);
+ bpf_trace_printk(fmt, sizeof(fmt), key, count);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 0000000..30307c9
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,67 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static void test_bpf_perf_event(void)
+{
+ int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ int *pmu_fd = malloc(nr_cpus * sizeof(int));
+ unsigned long value;
+ int i;
+
+ struct perf_event_attr attr_insn_pmu = {
+ .freq = 0,
+ .sample_period = 0x7fffffffffffffffULL,
+ .inherit = 0,
+ .type = PERF_TYPE_HARDWARE,
+ .read_format = 0,
+ .sample_type = 0,
+ .config = 0,/* PMU: cycles */
+ };
+
+ for(i = 0; i < nr_cpus; i++) {
+ pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+ if (pmu_fd[i] < 0)
+ printf("event syscall failed ****\n");
+
+ bpf_update_elem(map_fd[0], &i, (pmu_fd + i), BPF_ANY);
+
+ ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+ }
+
+ system("ls");
+ system("pwd");
+ system("sleep 2");
+
+ for(i = 0; i < nr_cpus; i++)
+ close(pmu_fd[i]);
+
+ close(map_fd);
+
+ free(pmu_fd);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ test_bpf_perf_event();
+
+ return 0;
+}
--
1.8.3.4
On 7/22/15 1:09 AM, Kaixu Xia wrote:
> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
> This map will only store the pointer to struct perf_event.
>
> Signed-off-by: Kaixu Xia <[email protected]>
> ---
> include/linux/bpf.h | 2 ++
> include/uapi/linux/bpf.h | 1 +
> kernel/bpf/arraymap.c | 40 ++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 43 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 4383476..f6a2442 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -11,6 +11,8 @@
> #include <linux/workqueue.h>
> #include <linux/file.h>
>
> +#define MAX_PERF_EVENT_ARRAY_ENTRY (2*NR_CPUS)
why this artificial limit?
Just drop it.
> +static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
> +{
> + /* only the pointer to struct perf_event can be stored in
> + * perf_event_array map
> + */
> + if (attr->value_size != sizeof(void *))
> + return ERR_PTR(-EINVAL);
hmm. that's odd. why reinvent things? please do the same as
prog_array does. Namely below:
> +static const struct bpf_map_ops perf_event_array_ops = {
> + .map_alloc = perf_event_array_map_alloc,
> + .map_free = array_map_free,
> + .map_get_next_key = perf_event_array_map_get_next_key,
> + .map_lookup_elem = array_map_lookup_elem,
> + .map_delete_elem = array_map_delete_elem,
this is broken. you don't want programs to manipulate
'struct perf_event *' pointers.
lookup/update/delete helpers shouldn't be accessible from the programs;
update/delete can then be cleanly implemented and called via syscall.
See how prog_array does it.
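For reference, prog_array keeps programs away from its raw kernel
pointers roughly like this — a sketch from kernel/bpf/arraymap.c, exact
naming may differ:

/* an eBPF program can never read the stored 'struct bpf_prog *':
 * lookup from the program side always yields NULL, while update and
 * delete are reachable only through the bpf(2) syscall
 */
static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}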
Also please collapse patches 1-3 into one. They're logically one piece.
I'll comment on them as well, but it would have been easier for me
and you if they were part of one email thread.
On 7/22/15 1:09 AM, Kaixu Xia wrote:
> We want to traverse the map elements and make use
> of the map values one by one, so add a new function,
> map->ops->map_traverse_elem(), to traverse the map elements.
...
> @@ -26,6 +28,7 @@ struct bpf_map_ops {
> void *(*map_lookup_elem)(struct bpf_map *map, void *key);
> int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
> int (*map_delete_elem)(struct bpf_map *map, void *key);
> + int (*map_traverse_elem)(bpf_map_traverse_callback func, struct bpf_map *map);
I think 'traverse' is too error-prone.
Better to add a 'map_delete_all_elem' callback.
Then convert bpf_prog_array_map_clear() into such a callback
and add a similar callback for perf_event_array.
Then call it from bpf_map_release() and drop the
if (map->map_type == PROG_ARRAY) check there.
Just call map->ops->map_delete_all_elem() unconditionally.
Though looking at patch 3, I don't see why you need that.
prog_array has to do it due to a potential circular dependency
between the prog_fd of a program that uses a prog_array_fd and
stores that prog_fd inside the prog_array.
In case of perf_event_array no such thing is possible.
Just release them as part of map_free.
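Under that scheme the release could live entirely in the map's own free
callback, along these lines — a sketch only, assuming a hypothetical
'struct bpf_perf_event_array' layout with an events[] pointer array
mirroring prog_array:

static void perf_event_array_map_free(struct bpf_map *map)
{
	struct bpf_perf_event_array *array;	/* hypothetical layout */
	int i;

	array = container_of(map, struct bpf_perf_event_array, map);

	/* wait for in-flight programs to finish, as array_map_free() does */
	synchronize_rcu();

	/* every slot is either NULL or a valid, refcounted perf_event */
	for (i = 0; i < array->map.max_entries; i++)
		if (array->events[i])
			perf_event_release_kernel(array->events[i]);

	kvfree(array);
}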
On 7/22/15 1:09 AM, Kaixu Xia wrote:
> The user space event FDs from the perf_event_open() syscall are
> converted to pointers to struct perf_event and stored in the map.
...
> +static int replace_map_with_perf_event(void *value)
> +{
> + struct perf_event *event;
> + u32 fd;
> +
> + fd = *(u32 *)value;
> +
> + event = perf_event_get(fd);
> + if (IS_ERR(event))
> + return PTR_ERR(event);
> +
> + /* limit the event type to PERF_TYPE_RAW
> + * and PERF_TYPE_HARDWARE
> + */
> + if (event->attr.type != PERF_TYPE_RAW &&
> + event->attr.type != PERF_TYPE_HARDWARE)
> + return -EINVAL;
> +
> + memcpy(value, &event, sizeof(struct perf_event *));
> +
> + return 0;
> +}
> +
> +static bool check_map_perf_event_stored(struct bpf_map *map, void *key)
> +{
> + void *event;
> + bool is_stored = false;
> +
> + rcu_read_lock();
> + event = array_map_lookup_elem(map, key);
> + if (event && (*(unsigned long *)event))
> + is_stored = true;
> + rcu_read_unlock();
> +
> + return is_stored;
> +}
> +
> +/* only called from syscall */
> +static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
> + void *value, u64 map_flags)
> +{
> + /* check if the value is already stored */
> + if (check_map_perf_event_stored(map, key))
> + return -EINVAL;
> +
> + if (replace_map_with_perf_event(value))
> + return -EBADF;
the above three functions are broken due to races:
update_elem can be called by different user space processes.
Please see how prog_array solves it via xchg().
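The xchg() pattern from prog_array, transplanted to perf events, would
look roughly like this — a sketch reusing the hypothetical events[]
layout from above and perf_event_get() from patch 3; the event type
checks are elided:

/* only called from syscall */
static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
					    void *value, u64 map_flags)
{
	struct bpf_perf_event_array *array;	/* hypothetical layout */
	struct perf_event *event, *old_event;
	u32 index = *(u32 *)key;

	array = container_of(map, struct bpf_perf_event_array, map);
	if (index >= array->map.max_entries)
		return -E2BIG;

	/* fd -> refcounted 'struct perf_event *' */
	event = perf_event_get(*(u32 *)value);
	if (IS_ERR(event))
		return PTR_ERR(event);

	/* atomic publish: no lookup-then-update window between racing
	 * user space callers
	 */
	old_event = xchg(array->events + index, event);
	if (old_event)
		perf_event_release_kernel(old_event);

	return 0;
}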
> + if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
> + if (!map->ops->map_traverse_elem)
> + return -EPERM;
> +
> + rcu_read_lock();
> + if (map->ops->map_traverse_elem(bpf_map_perf_event_put, map) < 0) {
> + rcu_read_unlock();
> + return -EINVAL;
> + }
> + rcu_read_unlock();
as I mentioned as part of patch 2, the above seems unnecessary.
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index d3dae34..14a9924 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -8574,6 +8574,32 @@ void perf_event_delayed_put(struct task_struct *task)
> WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
> }
>
> +struct perf_event *perf_event_get(unsigned int fd)
> +{
> + struct perf_event *event;
> + struct fd f;
> +
> + f = fdget(fd);
> +
> + if (!f.file)
> + return ERR_PTR(-EBADF);
> +
> + if (f.file->f_op != &perf_fops) {
> + fdput(f);
> + return ERR_PTR(-EINVAL);
> + }
> +
> + event = f.file->private_data;
> +
> + if (!atomic_long_inc_not_zero(&event->refcount)) {
I don't understand the necessity of the '_not_zero' suffix. Why?
How can it possibly race to zero here if we have an fd?
> + fdput(f);
> + return ERR_PTR(-ENOENT);
> + }
> +
> + fdput(f);
> + return event;
> +}
On 2015/7/23 8:48, Alexei Starovoitov wrote:
> On 7/22/15 1:09 AM, Kaixu Xia wrote:
>> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
>> This map will only store the pointer to struct perf_event.
>>
>> Signed-off-by: Kaixu Xia <[email protected]>
>> ---
>> include/linux/bpf.h | 2 ++
>> include/uapi/linux/bpf.h | 1 +
>> kernel/bpf/arraymap.c | 40 ++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 43 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 4383476..f6a2442 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -11,6 +11,8 @@
>> #include <linux/workqueue.h>
>> #include <linux/file.h>
>>
>> +#define MAX_PERF_EVENT_ARRAY_ENTRY (2*NR_CPUS)
>
> why this artificial limit?
> Just drop it.
>
Then we should find another way to prevent users from creating a large
map but only using a small portion of it, since a normal array map
can't report whether a slot is used or not. When releasing the map, we
have to release each perf event, and currently the only way to find
them is to check the map value in each slot.
On 7/22/15 1:09 AM, Kaixu Xia wrote:
> According to the perf_event_map_fd and key, the function
> bpf_perf_event_read() can convert the corresponding map
> value to the pointer to struct perf_event and return the
> Hardware PMU counter value.
>
> The key can't be passed to bpf_perf_event_read() directly
> because the function argument constraint is lacked.
I don't understand above sentence.
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 69a1f6b..e3bb181 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -250,6 +250,8 @@ enum bpf_func_id {
> * Return: 0 on success
> */
> BPF_FUNC_get_current_comm,
> +
> + BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, &key) */
no need for extra empty line.
> +
> +static u64 bpf_perf_event_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
> + void *key = (void *) (unsigned long) r2;
> + struct perf_event *event;
> + void *ptr;
> +
> + if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
> + return -EINVAL;
please check this statically in the verifier instead of at run-time.
> +
> + rcu_read_lock();
unnecessary.
> + ptr = map->ops->map_lookup_elem(map, key);
> + rcu_read_unlock();
> + if (!ptr || !(*(unsigned long *)ptr))
> + return -EBADF;
all these casts can be removed. First cast of 'r1' into
perf_event_array will be enough.
> +const struct bpf_func_proto bpf_perf_event_read_proto = {
> + .func = bpf_perf_event_read,
> + .gpl_only = false,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_CONST_MAP_PTR,
> + .arg2_type = ARG_PTR_TO_MAP_KEY,
make it arg2_type = ARG_ANYTHING; then you'll just index
into the array the way prog_array does, similar to bpf_tail_call.
On 7/22/15 1:09 AM, Kaixu Xia wrote:
> +SEC("kprobe/sys_write")
> +int bpf_prog1(struct pt_regs *ctx)
> +{
> + u64 count;
> + u32 key = bpf_get_smp_processor_id();
> + char fmt[] = "bpf count: CPU-%d %llu\n";
> +
> + count = bpf_perf_event_read(&my_map, &key);
> + bpf_trace_printk(fmt, sizeof(fmt), key, count);
> +
> + return 0;
> +}
overall the whole thing looks much better, but since you can print
anything, print some sensible string here instead of 'bpf count'.
Thanks!
On 7/22/15 6:08 PM, Wangnan (F) wrote:
> Then we should find another way to prevent users from creating a large
> map but only using a small portion of it, since a normal array map
> can't report whether a slot is used or not. When releasing the map, we
> have to release each perf event, and currently the only way to find
> them is to check the map value in each slot.
yeah. In prog_array each element is either a NULL ptr or a valid
'struct bpf_prog *'. Why not do the same for perf_event_array?
On 2015/7/23 9:14, Alexei Starovoitov wrote:
> [...]
>
>> +
>> +static u64 bpf_perf_event_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
>> +{
>> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
>> + void *key = (void *) (unsigned long) r2;
>> + struct perf_event *event;
>> + void *ptr;
>> +
>> + if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
>> + return -EINVAL;
>
> please check this statically in the verifier instead of at run-time.
>
>> +
>> + rcu_read_lock();
>
> unnecessary.
>
>> + ptr = map->ops->map_lookup_elem(map, key);
>> + rcu_read_unlock();
>> + if (!ptr || !(*(unsigned long *)ptr))
>> + return -EBADF;
>
> all these casts can be removed. First cast of 'r1' into
> perf_event_array will be enough.
So you mean like this?
u64 bpf_perf_event_read(u64 r1, u64 index,...)
{
struct bpf_perf_event_array *array = (void *) (long) r1;
struct perf_event *event;
...
event = array->events[index];
...
}
>
>> +const struct bpf_func_proto bpf_perf_event_read_proto = {
>> + .func = bpf_perf_event_read,
>> + .gpl_only = false,
>> + .ret_type = RET_INTEGER,
>> + .arg1_type = ARG_CONST_MAP_PTR,
>> + .arg2_type = ARG_PTR_TO_MAP_KEY,
>
> make it arg2_type = ARG_ANYTHING; then you'll just index
> into the array the way prog_array does, similar to bpf_tail_call.
ARG_ANYTHING means any (initialized) argument is ok, but what we
really want here is a map key. So I'm not sure ARG_ANYTHING is
suitable. You know ARG_ANYTHING is not checked enough in the verifier.
Thanks.
On 7/22/15 7:12 PM, xiakaixu wrote:
> So you mean like this?
>
> u64 bpf_perf_event_read(u64 r1, u64 index,...)
> {
> struct bpf_perf_event_array *array = (void *) (long) r1;
> struct perf_event *event;
> ...
> event = array->events[index];
> ...
> }
yes. the only thing needed is to add:
if (index >= array->map.max_entries)
return -E2BIG;
before accessing array->events[index];
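Put together, the reworked helper might then look roughly like this —
an illustrative sketch only, with 'struct bpf_perf_event_array' and its
events[] member being the hypothetical layout from the discussion above:

static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
{
	struct bpf_perf_event_array *array = (void *) (long) r1;
	struct perf_event *event;

	/* the bound check suggested above */
	if (index >= array->map.max_entries)
		return -E2BIG;

	/* each slot is either NULL or a valid pointer */
	event = array->events[index];
	if (!event)
		return -ENOENT;

	/* same run-time restrictions as in patch 4/5 */
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return -EINVAL;

	if (event->oncpu != raw_smp_processor_id() &&
	    event->ctx->task != current)
		return -EINVAL;

	__perf_event_read(event);

	return perf_event_count(event);
}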
>>
>>> +const struct bpf_func_proto bpf_perf_event_read_proto = {
>>> + .func = bpf_perf_event_read,
>>> + .gpl_only = false,
>>> + .ret_type = RET_INTEGER,
>>> + .arg1_type = ARG_CONST_MAP_PTR,
>>> + .arg2_type = ARG_PTR_TO_MAP_KEY,
>>
>> make it arg2_type = ARG_ANYTHING; then you'll just index
>> into the array the way prog_array does, similar to bpf_tail_call.
> ARG_ANYTHING means any (initialized) argument is ok, but what we
correct.
> really want here is a map key. So I'm not sure ARG_ANYTHING is
> suitable. You know ARG_ANYTHING is not checked enough in the verifier.
why? at perf_event_array creation time we check that key_size == u32,
so we can accept any integer.
ARG_PTR_TO_MAP_KEY forces the program author to use the stack instead
of passing the index directly. A direct index is obviously faster.
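From the program author's side the difference looks like this — an
illustrative eBPF C sketch against my_map from tracex6_kern.c; the two
calls assume the two different helper prototypes, so they would not
coexist in one real program:

SEC("kprobe/sys_write")
int bpf_prog1(struct pt_regs *ctx)
{
	/* ARG_PTR_TO_MAP_KEY: the key has to be spilled to the BPF stack
	 * so that a pointer to it can be passed
	 */
	u32 key = bpf_get_smp_processor_id();
	u64 count = bpf_perf_event_read(&my_map, &key);

	/* ARG_ANYTHING: the index travels by value in a register
	 * (assumes the helper prototype changes to take a plain u64)
	 */
	count = bpf_perf_event_read(&my_map, bpf_get_smp_processor_id());

	return 0;
}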
On 2015/7/23 10:22, Alexei Starovoitov wrote:
> [...]
> ARG_PTR_TO_MAP_KEY forces the program author to use the stack instead
> of passing the index directly. A direct index is obviously faster.
Copy that. We will follow these suggestions in V3.
On 07/22/2015 10:09 AM, Kaixu Xia wrote:
> Previous patch v1 url:
> https://lkml.org/lkml/2015/7/17/287
[ Sorry to chime in late, just noticed this series now as I wasn't in Cc for
the core BPF changes. More below ... ]
> [...]
>
> The patches in detail:
>
> Patch 1/5 introduces a new bpf map type. This map only stores the
> pointer to struct perf_event;
>
> Patch 2/5 introduces a map_traverse_elem() function for further use;
>
> Patch 3/5 converts perf event file descriptors into struct perf_event
> pointers when new elements are added to the map;
So far all the map backends are of a generic nature, knowing absolutely
nothing about a particular consumer/subsystem of eBPF (tc, socket
filters, etc). The tail call is a bit special, but nevertheless generic
for each user and [very] useful, so it makes sense to inherit from the
array map and move the code there.
I don't really like that we start adding new _special_-cased maps here
into the eBPF core code; it seems quite hacky. :( From your rather
terse commit description where you introduce the maps, I failed to see
a detailed elaboration on this, i.e. why can it not be abstracted any
differently?
On 2015/7/24 7:33, Daniel Borkmann wrote:
> On 07/22/2015 10:09 AM, Kaixu Xia wrote:
>> Previous patch v1 url:
>> https://lkml.org/lkml/2015/7/17/287
>
> [ Sorry to chime in late, just noticed this series now as I wasn't in Cc for
> the core BPF changes. More below ... ]
Sorry about that; I will add you to the Cc list. :) Your comments are welcome.
>
>> [...]
>>
>> The patches in detail:
>>
>> Patch 1/5 introduces a new bpf map type. This map only stores the
>> pointer to struct perf_event;
>>
>> Patch 2/5 introduces a map_traverse_elem() function for further use;
>>
>> Patch 3/5 converts perf event file descriptors into struct perf_event
>> pointers when new elements are added to the map;
>
> So far all the map backends are of a generic nature, knowing absolutely
> nothing about a particular consumer/subsystem of eBPF (tc, socket
> filters, etc). The tail call is a bit special, but nevertheless generic
> for each user and [very] useful, so it makes sense to inherit from the
> array map and move the code there.
>
> I don't really like that we start adding new _special_-cased maps here
> into the eBPF core code; it seems quite hacky. :( From your rather
> terse commit description where you introduce the maps, I failed to see
> a detailed elaboration on this, i.e. why can it not be abstracted any
> differently?
It will be very useful to give eBPF programs the ability to access
hardware PMU counters, just as I mentioned in the V1 commit message.
Of course, there is some special code for creating the perf_event type
map in V2, but you will find less special code in the next version
(V3): I have reused most of the prog_array map implementation. We can
make the perf_event array map more generic in the future.
BR.