Hi All,
v1->v2:
renamed flags for MAP_UPDATE_ELEM command to be more concise,
clarified commit logs and improved comments in patches 1,3,7
per discussions with Daniel
Old v1 cover:
this set of patches adds implementation of HASH and ARRAY types of eBPF maps
which were described in manpage in commit b4fc1a460f30("Merge branch 'bpf-next'")
The difference vs previous version of these patches from August:
- added 'flags' attribute to BPF_MAP_UPDATE_ELEM
- in HASH type implementation removed per-map kmem_cache.
I was doing kmem_cache_create() for every map to enable selective slub
debugging to check for overflows and leaks. Now it's not needed, so just
use normal kmalloc() for map elements.
- added ARRAY type which was mentioned in manpage, but wasn't public yet
- added map testsuite and removed temporary bits from test_stubs
Note, eBPF programs cannot be attached to events yet.
It will come in the next set.
Alexei Starovoitov (7):
bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
bpf: add hashtable type of eBPF maps
bpf: add array type of eBPF maps
bpf: fix BPF_MAP_LOOKUP_ELEM command return code
bpf: add a testsuite for eBPF maps
bpf: allow eBPF programs to use maps
bpf: remove test map scaffolding and user proper types
include/linux/bpf.h | 7 +-
include/uapi/linux/bpf.h | 13 +-
kernel/bpf/Makefile | 2 +-
kernel/bpf/arraymap.c | 151 ++++++++++++++++++
kernel/bpf/hashtab.c | 362 +++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/helpers.c | 89 +++++++++++
kernel/bpf/syscall.c | 6 +-
kernel/bpf/test_stub.c | 56 ++-----
samples/bpf/Makefile | 3 +-
samples/bpf/libbpf.c | 3 +-
samples/bpf/libbpf.h | 2 +-
samples/bpf/test_maps.c | 291 ++++++++++++++++++++++++++++++++++
samples/bpf/test_verifier.c | 14 +-
13 files changed, 936 insertions(+), 63 deletions(-)
create mode 100644 kernel/bpf/arraymap.c
create mode 100644 kernel/bpf/hashtab.c
create mode 100644 kernel/bpf/helpers.c
create mode 100644 samples/bpf/test_maps.c
--
1.7.9.5
the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
either update existing map element or create a new one.
Initially the plan was to add a new command to handle the case of
'create new element if it didn't exist', but 'flags' style looks
cleaner and overall diff is much smaller (more code reused), so add 'flags'
attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
#define BPF_ANY 0 /* create new element or update existing */
#define BPF_NOEXIST 1 /* create new element if it didn't exist */
#define BPF_EXIST 2 /* update existing element */
bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST
if element already exists.
bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT
if element doesn't exist.
Userspace will call it as:
int bpf_update_elem(int fd, void *key, void *value, __u64 flags)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
.flags = flags;
};
return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
First two bits of 'flags' are used to encode style of bpf_update_elem() command.
Bits 2-63 are reserved for future use.
Signed-off-by: Alexei Starovoitov <[email protected]>
---
patch 5 of this set includes tests of bpf_update_elem() with these flags
include/linux/bpf.h | 2 +-
include/uapi/linux/bpf.h | 8 +++++++-
kernel/bpf/syscall.c | 4 ++--
3 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3cf91754a957..51e9242e4803 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,7 +22,7 @@ struct bpf_map_ops {
/* funcs callable from userspace and from eBPF programs */
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
- int (*map_update_elem)(struct bpf_map *map, void *key, void *value);
+ int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
};
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d18316f9e9c4..3e9e1b77f29d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -82,7 +82,7 @@ enum bpf_cmd {
/* create or update key/value pair in a given map
* err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
- * Using attr->map_fd, attr->key, attr->value
+ * Using attr->map_fd, attr->key, attr->value, attr->flags
* returns zero or negative error
*/
BPF_MAP_UPDATE_ELEM,
@@ -117,6 +117,11 @@ enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
};
+/* flags for BPF_MAP_UPDATE_ELEM command */
+#define BPF_ANY 0 /* create new element or update existing */
+#define BPF_NOEXIST 1 /* create new element if it didn't exist */
+#define BPF_EXIST 2 /* update existing element */
+
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
@@ -132,6 +137,7 @@ union bpf_attr {
__aligned_u64 value;
__aligned_u64 next_key;
};
+ __u64 flags;
};
struct { /* anonymous struct used by BPF_PROG_LOAD command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..c0d03bf317a2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -190,7 +190,7 @@ err_put:
return err;
}
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
static int map_update_elem(union bpf_attr *attr)
{
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
* therefore all map accessors rely on this fact, so do the same here
*/
rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value);
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
rcu_read_unlock();
free_value:
--
1.7.9.5
fix errno of BPF_MAP_LOOKUP_ELEM command as bpf manpage
described it in commit b4fc1a460f30("Merge branch 'bpf-next'"):
-----
BPF_MAP_LOOKUP_ELEM
int bpf_lookup_elem(int fd, void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
};
return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
bpf() syscall looks up an element with given key in a map fd.
If element is found it returns zero and stores element's value
into value. If element is not found it returns -1 and sets
errno to ENOENT.
and further down in manpage:
ENOENT For BPF_MAP_LOOKUP_ELEM or BPF_MAP_DELETE_ELEM, indicates that
element with given key was not found.
-----
In general all BPF commands return ENOENT when map element is not found
(including BPF_MAP_GET_NEXT_KEY and BPF_MAP_UPDATE_ELEM with
flags == BPF_MAP_UPDATE_ONLY)
Subsequent patch adds a testsuite to check return values for all of
these combinations.
Signed-off-by: Alexei Starovoitov <[email protected]>
---
I don't think this patch is needed for 'net', since 'net' has syscall shell
only. Actual map types and their implementations are being introduced by
this set of patches.
kernel/bpf/syscall.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0d03bf317a2..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
- err = -ESRCH;
+ err = -ENOENT;
rcu_read_lock();
value = map->ops->map_lookup_elem(map, key);
if (!value)
--
1.7.9.5
. check error conditions and sanity of hash and array map APIs
. check large maps (that kernel gracefully switches to vmalloc from kmalloc)
. check multi-process parallel access and stress test
Signed-off-by: Alexei Starovoitov <[email protected]>
---
Eventually it can be moved tools/testing/selftests/bpf/, but for now keep
it in samples/bpf/, since that's where all subsequent samples are coming to.
samples/bpf/Makefile | 3 +-
samples/bpf/libbpf.c | 3 +-
samples/bpf/libbpf.h | 2 +-
samples/bpf/test_maps.c | 291 +++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 296 insertions(+), 3 deletions(-)
create mode 100644 samples/bpf/test_maps.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 634391797856..0718d9ce4619 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -2,9 +2,10 @@
obj- := dummy.o
# List of programs to build
-hostprogs-y := test_verifier
+hostprogs-y := test_verifier test_maps
test_verifier-objs := test_verifier.o libbpf.o
+test_maps-objs := test_maps.o libbpf.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index ff6504420738..17bb520eb57f 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -27,12 +27,13 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
-int bpf_update_elem(int fd, void *key, void *value)
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
+ .flags = flags,
};
return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 8a31babeca5d..f8678e5f48bf 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -6,7 +6,7 @@ struct bpf_insn;
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
int max_entries);
-int bpf_update_elem(int fd, void *key, void *value);
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
int bpf_lookup_elem(int fd, void *key, void *value);
int bpf_delete_elem(int fd, void *key);
int bpf_get_next_key(int fd, void *key, void *next_key);
diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c
new file mode 100644
index 000000000000..e286b42307f3
--- /dev/null
+++ b/samples/bpf/test_maps.c
@@ -0,0 +1,291 @@
+/*
+ * Testsuite for eBPF maps
+ *
+ * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include "libbpf.h"
+
+/* sanity tests for map API */
+static void test_hashmap_sanity(int i, void *data)
+{
+ long long key, next_key, value;
+ int map_fd;
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 2);
+ if (map_fd < 0) {
+ printf("failed to create hashmap '%s'\n", strerror(errno));
+ exit(1);
+ }
+
+ key = 1;
+ value = 1234;
+ /* insert key=1 element */
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0);
+
+ value = 0;
+ /* BPF_NOEXIST means: add new element if it doesn't exist */
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+ /* key=1 already exists */
+ errno == EEXIST);
+
+ assert(bpf_update_elem(map_fd, &key, &value, -1) == -1 && errno == EINVAL);
+
+ /* check that key=1 can be found */
+ assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234);
+
+ key = 2;
+ /* check that key=2 is not found */
+ assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+ /* BPF_EXIST means: update existing element */
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
+ /* key=2 is not there */
+ errno == ENOENT);
+
+ /* insert key=2 element */
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+
+ /* key=1 and key=2 were inserted, check that key=0 cannot be inserted
+ * due to max_entries limit
+ */
+ key = 0;
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+ errno == E2BIG);
+
+ /* check that key = 0 doesn't exist */
+ assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+ /* iterate over two elements */
+ assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 &&
+ next_key == 2);
+ assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 &&
+ next_key == 1);
+ assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 &&
+ errno == ENOENT);
+
+ /* delete both elements */
+ key = 1;
+ assert(bpf_delete_elem(map_fd, &key) == 0);
+ key = 2;
+ assert(bpf_delete_elem(map_fd, &key) == 0);
+ assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+ key = 0;
+ /* check that map is empty */
+ assert(bpf_get_next_key(map_fd, &key, &next_key) == -1 &&
+ errno == ENOENT);
+ close(map_fd);
+}
+
+static void test_arraymap_sanity(int i, void *data)
+{
+ int key, next_key, map_fd;
+ long long value;
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 2);
+ if (map_fd < 0) {
+ printf("failed to create arraymap '%s'\n", strerror(errno));
+ exit(1);
+ }
+
+ key = 1;
+ value = 1234;
+ /* insert key=1 element */
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0);
+
+ value = 0;
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+ errno == EEXIST);
+
+ /* check that key=1 can be found */
+ assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234);
+
+ key = 0;
+ /* check that key=0 is also found and zero initialized */
+ assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0);
+
+
+ /* key=0 and key=1 were inserted, check that key=2 cannot be inserted
+ * due to max_entries limit
+ */
+ key = 2;
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
+ errno == E2BIG);
+
+ /* check that key = 2 doesn't exist */
+ assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+ /* iterate over two elements */
+ assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 &&
+ next_key == 0);
+ assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 &&
+ next_key == 1);
+ assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 &&
+ errno == ENOENT);
+
+ /* delete shouldn't succeed */
+ key = 1;
+ assert(bpf_delete_elem(map_fd, &key) == -1 && errno == EINVAL);
+
+ close(map_fd);
+}
+
+#define MAP_SIZE (32 * 1024)
+static void test_map_large(void)
+{
+ struct bigkey {
+ int a;
+ char b[116];
+ long long c;
+ } key;
+ int map_fd, i, value;
+
+ /* allocate 4Mbyte of memory */
+ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+ MAP_SIZE);
+ if (map_fd < 0) {
+ printf("failed to create large map '%s'\n", strerror(errno));
+ exit(1);
+ }
+
+ for (i = 0; i < MAP_SIZE; i++) {
+ key = (struct bigkey) {.c = i};
+ value = i;
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+ }
+ key.c = -1;
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+ errno == E2BIG);
+
+ /* iterate through all elements */
+ for (i = 0; i < MAP_SIZE; i++)
+ assert(bpf_get_next_key(map_fd, &key, &key) == 0);
+ assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+
+ key.c = 0;
+ assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0);
+ key.a = 1;
+ assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+ close(map_fd);
+}
+
+/* fork N children and wait for them to complete */
+static void run_parallel(int tasks, void (*fn)(int i, void *data), void *data)
+{
+ pid_t pid[tasks];
+ int i;
+
+ for (i = 0; i < tasks; i++) {
+ pid[i] = fork();
+ if (pid[i] == 0) {
+ fn(i, data);
+ exit(0);
+ } else if (pid[i] == -1) {
+ printf("couldn't spawn #%d process\n", i);
+ exit(1);
+ }
+ }
+ for (i = 0; i < tasks; i++) {
+ int status;
+
+ assert(waitpid(pid[i], &status, 0) == pid[i]);
+ assert(status == 0);
+ }
+}
+
+static void test_map_stress(void)
+{
+ run_parallel(100, test_hashmap_sanity, NULL);
+ run_parallel(100, test_arraymap_sanity, NULL);
+}
+
+#define TASKS 1024
+#define DO_UPDATE 1
+#define DO_DELETE 0
+static void do_work(int fn, void *data)
+{
+ int map_fd = ((int *)data)[0];
+ int do_update = ((int *)data)[1];
+ int i;
+ int key, value;
+
+ for (i = fn; i < MAP_SIZE; i += TASKS) {
+ key = value = i;
+ if (do_update)
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+ else
+ assert(bpf_delete_elem(map_fd, &key) == 0);
+ }
+}
+
+static void test_map_parallel(void)
+{
+ int i, map_fd, key = 0, value = 0;
+ int data[2];
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+ MAP_SIZE);
+ if (map_fd < 0) {
+ printf("failed to create map for parallel test '%s'\n",
+ strerror(errno));
+ exit(1);
+ }
+
+ data[0] = map_fd;
+ data[1] = DO_UPDATE;
+ /* use the same map_fd in children to add elements to this map
+ * child_0 adds key=0, key=1024, key=2048, ...
+ * child_1 adds key=1, key=1025, key=2049, ...
+ * child_1023 adds key=1023, ...
+ */
+ run_parallel(TASKS, do_work, data);
+
+ /* check that key=0 is already there */
+ assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+ errno == EEXIST);
+
+ /* check that all elements were inserted */
+ key = -1;
+ for (i = 0; i < MAP_SIZE; i++)
+ assert(bpf_get_next_key(map_fd, &key, &key) == 0);
+ assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+
+ /* another check for all elements */
+ for (i = 0; i < MAP_SIZE; i++) {
+ key = MAP_SIZE - i - 1;
+ assert(bpf_lookup_elem(map_fd, &key, &value) == 0 &&
+ value == key);
+ }
+
+ /* now let's delete all elemenets in parallel */
+ data[1] = DO_DELETE;
+ run_parallel(TASKS, do_work, data);
+
+ /* nothing should be left */
+ key = -1;
+ assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+}
+
+int main(void)
+{
+ test_hashmap_sanity(0, NULL);
+ test_arraymap_sanity(0, NULL);
+ test_map_large();
+ test_map_parallel();
+ test_map_stress();
+ printf("test_maps: OK\n");
+ return 0;
+}
--
1.7.9.5
proper types and function helpers are ready. Use them in verifier testsuite.
Remove temporary stubs
Signed-off-by: Alexei Starovoitov <[email protected]>
---
kernel/bpf/test_stub.c | 56 +++++++------------------------------------
samples/bpf/test_verifier.c | 14 +++++------
2 files changed, 16 insertions(+), 54 deletions(-)
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
u64 arg2;
};
-static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
- return 0;
-}
-
-static struct bpf_func_proto test_funcs[] = {
- [BPF_FUNC_unspec] = {
- .func = test_func,
- .gpl_only = true,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
- },
-};
-
static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
{
- if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ default:
return NULL;
- return &test_funcs[func_id];
+ }
}
static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
.type = BPF_PROG_TYPE_UNSPEC,
};
-static struct bpf_map *test_map_alloc(union bpf_attr *attr)
-{
- struct bpf_map *map;
-
- map = kzalloc(sizeof(*map), GFP_USER);
- if (!map)
- return ERR_PTR(-ENOMEM);
-
- map->key_size = attr->key_size;
- map->value_size = attr->value_size;
- map->max_entries = attr->max_entries;
- return map;
-}
-
-static void test_map_free(struct bpf_map *map)
-{
- kfree(map);
-}
-
-static struct bpf_map_ops test_map_ops = {
- .map_alloc = test_map_alloc,
- .map_free = test_map_free,
-};
-
-static struct bpf_map_type_list tl_map = {
- .ops = &test_map_ops,
- .type = BPF_MAP_TYPE_UNSPEC,
-};
-
static int __init register_test_ops(void)
{
- bpf_register_map_type(&tl_map);
bpf_register_prog_type(&tl_prog);
return 0;
}
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index 63402742345e..b96175e90363 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -261,7 +261,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
.fixup = {2},
@@ -417,7 +417,7 @@ static struct bpf_test tests[] = {
BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
BPF_EXIT_INSN(),
},
.errstr = "fd 0 is not pointing to valid bpf_map",
@@ -430,7 +430,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
BPF_EXIT_INSN(),
},
@@ -445,7 +445,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
BPF_EXIT_INSN(),
@@ -461,7 +461,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
BPF_EXIT_INSN(),
@@ -548,7 +548,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_2, -56, 0),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -56),
BPF_LD_MAP_FD(BPF_REG_1, 0),
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
BPF_EXIT_INSN(),
},
.fixup = {24},
@@ -659,7 +659,7 @@ static int create_map(void)
long long key, value = 0;
int map_fd;
- map_fd = bpf_create_map(BPF_MAP_TYPE_UNSPEC, sizeof(key), sizeof(value), 1024);
+ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
if (map_fd < 0) {
printf("failed to create map '%s'\n", strerror(errno));
}
--
1.7.9.5
expose bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
map accessors to eBPF programs
Signed-off-by: Alexei Starovoitov <[email protected]>
---
Note, these helpers are exposed as '.gpl_only = false', so non-GPL eBPF programs
can use them. That was requested by AndyL and DavidL before.
include/linux/bpf.h | 5 +++
include/uapi/linux/bpf.h | 3 ++
kernel/bpf/Makefile | 2 +-
kernel/bpf/helpers.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 98 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/helpers.c
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 51e9242e4803..75e94eaa228b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -133,4 +133,9 @@ struct bpf_prog *bpf_prog_get(u32 ufd);
/* verify correctness of eBPF program */
int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
+/* verifier prototypes for helper functions called from eBPF programs */
+extern struct bpf_func_proto bpf_map_lookup_elem_proto;
+extern struct bpf_func_proto bpf_map_update_elem_proto;
+extern struct bpf_func_proto bpf_map_delete_elem_proto;
+
#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0d662fe75df5..4a3d0f84f178 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -158,6 +158,9 @@ union bpf_attr {
*/
enum bpf_func_id {
BPF_FUNC_unspec,
+ BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
+ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
+ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
__BPF_FUNC_MAX_ID,
};
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 72ec98ba2d42..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
ifdef CONFIG_TEST_BPF
obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* verifier checked that R1 contains a valid pointer to bpf_map
+ * and R2 points to a program stack and map->key_size bytes were
+ * initialized
+ */
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ value = map->ops->map_lookup_elem(map, key);
+
+ /* lookup() returns either pointer to element value or NULL
+ * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+ */
+ return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value = (void *) (unsigned long) r3;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
--
1.7.9.5
add new map type BPF_MAP_TYPE_ARRAY and its implementation
- optimized for fastest possible lookup()
. in the future verifier/JIT may recognize lookup() with constant key
and optimize it into constant pointer. Can optimize non-constant
key into direct pointer arithmetic as well, since pointers and
value_size are constant for the life of the eBPF program.
In other words array_map_lookup_elem() may be 'inlined' by verifier/JIT
while preserving concurrent access to this map from user space
- two main use cases for array type:
. 'global' eBPF variables: array of 1 element with key=0 and value is a
collection of 'global' variables which programs can use to keep the state
between events
. aggregation of tracing events into fixed set of buckets
- all array elements pre-allocated and zero initialized at init time
- key as an index in array and can only be 4 byte
- map_delete_elem() returns EINVAL, since elements cannot be deleted
- map_update_elem() replaces elements in an non-atomic way
(for atomic updates hashtable type should be used instead)
Signed-off-by: Alexei Starovoitov <[email protected]>
---
Note, from eBPF program and from user space, all map types are accessed
through the same API.
Example of using array type for 'global' variables from eBPF program:
struct globals {
u64 lat_ave;
u64 lat_sum;
u64 missed;
u64 max_lat;
int num_samples;
};
struct bpf_map_def SEC("maps") global_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(struct globals),
.max_entries = 1,
};
int bpf_prog(struct bpf_context *ctx)
{
...
int ind = 0;
struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
if (!g)
return 0;
if (g->lat_ave == 0) {
g->num_samples++;
g->lat_sum += delta;
if (g->num_samples >= 100) {
g->lat_ave = g->lat_sum / g->num_samples;
...
The future verifier/JIT optimization will replace bpf_map_lookup_elem()
call inside eBPF program with const pointer to element value of key=0,
so that eBPF program will have no penalty whatsoever to access such
'global' variables.
At the same time user space can access this 'globals' via common map API.
Full example of both kernel and user side follows in later patches.
The array map is like C array of structures. Nothing protects concurrent access.
It's used in the cases where accuracy is not needed or when there is no
concurrent access. To compute a histogram of events in tracing the array
of integers is used. Every integer is a counter. Program increments it
(may be without using xadd) and user space periodically reads it back.
map_update_elem() is called by userspace once to initialize it if zero-init
is not enough. Programs do lookup() and modify the values.
For array type update() method is used rarely, delete() is never used and
get_next() is needed for completeness to browse maps through common map API.
include/uapi/linux/bpf.h | 1 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/arraymap.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 153 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/arraymap.c
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 03a01fd609aa..0d662fe75df5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_cmd {
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
BPF_MAP_TYPE_HASH,
+ BPF_MAP_TYPE_ARRAY,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 2c0ec7f9da78..72ec98ba2d42 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o
ifdef CONFIG_TEST_BPF
obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..f4f6965f86cb
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,151 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+ struct bpf_map map;
+ u32 elem_size;
+ char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array *array;
+ u32 elem_size;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ elem_size = round_up(attr->value_size, 8);
+
+ /* allocate all map elements and zero-initialize them */
+ array = kzalloc(sizeof(*array) + attr->max_entries * elem_size,
+ GFP_USER | __GFP_NOWARN);
+ if (!array) {
+ array = vzalloc(array->map.max_entries * array->elem_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* copy mandatory map attributes */
+ array->map.key_size = attr->key_size;
+ array->map.value_size = attr->value_size;
+ array->map.max_entries = attr->max_entries;
+
+ array->elem_size = elem_size;
+
+ return &array->map;
+
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return NULL;
+
+ return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (map_flags == BPF_NOEXIST)
+ /* all elemenets already exist */
+ return -EEXIST;
+
+ memcpy(array->value + array->elem_size * index, value, array->elem_size);
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding programs to complete
+ * and free the array
+ */
+ synchronize_rcu();
+
+ kvfree(array);
+}
+
+static struct bpf_map_ops array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+ .ops = &array_ops,
+ .type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+ bpf_register_map_type(&tl);
+ return 0;
+}
+late_initcall(register_array_map);
--
1.7.9.5
add new map type BPF_MAP_TYPE_HASH and its implementation
- maps are created/destroyed by userspace. Both userspace and eBPF programs
can lookup/update/delete elements from the map
- eBPF programs can be called in_irq(), so use spin_lock_irqsave() mechanism
for concurrent updates
- key/value are opaque range of bytes (aligned to 8 bytes)
- user space provides 3 configuration attributes via BPF syscall:
key_size, value_size, max_entries
- map takes care of allocating/freeing key/value pairs
- map_update_elem() must fail to insert new element when max_entries
limit is reached to make sure that eBPF programs cannot exhaust memory
- map_update_elem() replaces elements in an atomic way
- optimized for speed of lookup() which can be called multiple times from
eBPF program which itself is triggered by high volume of events
. in the future JIT compiler may recognize lookup() call and optimize it
further, since key_size is constant for life of eBPF program
Signed-off-by: Alexei Starovoitov <[email protected]>
---
include/uapi/linux/bpf.h | 1 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/hashtab.c | 362 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 364 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/hashtab.c
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3e9e1b77f29d..03a01fd609aa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -111,6 +111,7 @@ enum bpf_cmd {
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
+ BPF_MAP_TYPE_HASH,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..2c0ec7f9da78 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o
ifdef CONFIG_TEST_BPF
obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..d234a012f046
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,362 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct hlist_head *buckets;
+ spinlock_t lock;
+ u32 count; /* number of elements in this hashtable */
+ u32 n_buckets; /* number of hash buckets */
+ u32 elem_size; /* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+ struct hlist_node hash_node;
+ struct rcu_head rcu;
+ u32 hash;
+ char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int err, i;
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ htab->map.key_size = attr->key_size;
+ htab->map.value_size = attr->value_size;
+ htab->map.max_entries = attr->max_entries;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ err = -EINVAL;
+ if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+ htab->map.value_size == 0)
+ goto free_htab;
+
+ /* hash table size must be power of 2 */
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+ err = -E2BIG;
+ if (htab->map.key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ goto free_htab;
+
+ err = -ENOMEM;
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ GFP_USER | __GFP_NOWARN);
+
+ if (!htab->buckets) {
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ if (!htab->buckets)
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->n_buckets; i++)
+ INIT_HLIST_HEAD(&htab->buckets[i]);
+
+ spin_lock_init(&htab->lock);
+ htab->count = 0;
+
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8) +
+ htab->map.value_size;
+ return &htab->map;
+
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+ void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 hash, key_size;
+
+ /* Must be called with rcu_read_lock. */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l)
+ return l->key + round_up(map->key_size, 8);
+
+ return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l, *next_l;
+ u32 hash, key_size;
+ int i;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ /* lookup the key */
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (!l) {
+ i = 0;
+ goto find_first_elem;
+ }
+
+ /* key was found, get next key in the same bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+
+ if (next_l) {
+ /* if next elem in this hash list is non-zero, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* itereated over all buckets and all elements */
+ return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ u32 key_size;
+ int ret;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* allocate new element outside of lock */
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+ if (!l_new)
+ return -ENOMEM;
+
+ key_size = map->key_size;
+
+ memcpy(l_new->key, key, key_size);
+ memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+ l_new->hash = htab_map_hash(l_new->key, key_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, l_new->hash);
+
+ l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+ if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ /* if elem with this 'key' doesn't exist and we've reached
+ * max_entries limit, fail insertion of new elem
+ */
+ ret = -E2BIG;
+ goto err;
+ }
+
+ if (l_old && map_flags == BPF_NOEXIST) {
+ /* elem already exists */
+ ret = -EEXIST;
+ goto err;
+ }
+
+ if (!l_old && map_flags == BPF_EXIST) {
+ /* elem doesn't exist, cannot update it */
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* add new element to the head of the list, so that concurrent
+ * search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_del_rcu(&l_old->hash_node);
+ kfree_rcu(l_old, rcu);
+ } else {
+ htab->count++;
+ }
+ spin_unlock_irqrestore(&htab->lock, flags);
+
+ return 0;
+err:
+ spin_unlock_irqrestore(&htab->lock, flags);
+ kfree(l_new);
+ return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree_rcu(l, rcu);
+ ret = 0;
+ }
+
+ spin_unlock_irqrestore(&htab->lock, flags);
+ return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree(l);
+ }
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of kfree_rcu() callbacks for elements of this map may not have
+ * executed. It's ok. Proceed to free residual elements and map itself
+ */
+ delete_all_elements(htab);
+ kvfree(htab->buckets);
+ kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_map_lookup_elem,
+ .map_update_elem = htab_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+ .ops = &htab_ops,
+ .type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+ bpf_register_map_type(&tl);
+ return 0;
+}
+late_initcall(register_htab_map);
--
1.7.9.5
On Do, 2014-11-13 at 17:36 -0800, Alexei Starovoitov wrote:
> the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
> either update existing map element or create a new one.
> Initially the plan was to add a new command to handle the case of
> 'create new element if it didn't exist', but 'flags' style looks
> cleaner and overall diff is much smaller (more code reused), so add 'flags'
> attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
> #define BPF_ANY 0 /* create new element or update existing */
> #define BPF_NOEXIST 1 /* create new element if it didn't exist */
> #define BPF_EXIST 2 /* update existing element */
Would a cmpxchg-alike function be handy here?
Bye,
Hannes
On Fri, Nov 14, 2014 at 4:11 AM, Hannes Frederic Sowa
<[email protected]> wrote:
> On Do, 2014-11-13 at 17:36 -0800, Alexei Starovoitov wrote:
>> the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
>> either update existing map element or create a new one.
>> Initially the plan was to add a new command to handle the case of
>> 'create new element if it didn't exist', but 'flags' style looks
>> cleaner and overall diff is much smaller (more code reused), so add 'flags'
>> attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
>> #define BPF_ANY 0 /* create new element or update existing */
>> #define BPF_NOEXIST 1 /* create new element if it didn't exist */
>> #define BPF_EXIST 2 /* update existing element */
>
> Would a cmpxchg-alike function be handy here?
you mean cmpxchg command in addition to
update() command ?
May be... it will have an extra 'value' argument
(key, old_value, new_value)
I don't have a use case for it yet though.
On Fr, 2014-11-14 at 07:33 -0800, Alexei Starovoitov wrote:
> On Fri, Nov 14, 2014 at 4:11 AM, Hannes Frederic Sowa
> <[email protected]> wrote:
> > On Do, 2014-11-13 at 17:36 -0800, Alexei Starovoitov wrote:
> >> the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
> >> either update existing map element or create a new one.
> >> Initially the plan was to add a new command to handle the case of
> >> 'create new element if it didn't exist', but 'flags' style looks
> >> cleaner and overall diff is much smaller (more code reused), so add 'flags'
> >> attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
> >> #define BPF_ANY 0 /* create new element or update existing */
> >> #define BPF_NOEXIST 1 /* create new element if it didn't exist */
> >> #define BPF_EXIST 2 /* update existing element */
> >
> > Would a cmpxchg-alike function be handy here?
>
> you mean cmpxchg command in addition to
> update() command ?
> May be... it will have an extra 'value' argument
> (key, old_value, new_value)
> I don't have a use case for it yet though.
I don't neither. ;)
I just wanted to bring this up before user space api might get public
and the additional argument might make problems.
Bye,
Hannes
On Fri, Nov 14, 2014 at 8:06 AM, Hannes Frederic Sowa
<[email protected]> wrote:
> On Fr, 2014-11-14 at 07:33 -0800, Alexei Starovoitov wrote:
>> On Fri, Nov 14, 2014 at 4:11 AM, Hannes Frederic Sowa
>> <[email protected]> wrote:
>> > On Do, 2014-11-13 at 17:36 -0800, Alexei Starovoitov wrote:
>> >> the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
>> >> either update existing map element or create a new one.
>> >> Initially the plan was to add a new command to handle the case of
>> >> 'create new element if it didn't exist', but 'flags' style looks
>> >> cleaner and overall diff is much smaller (more code reused), so add 'flags'
>> >> attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
>> >> #define BPF_ANY 0 /* create new element or update existing */
>> >> #define BPF_NOEXIST 1 /* create new element if it didn't exist */
>> >> #define BPF_EXIST 2 /* update existing element */
>> >
>> > Would a cmpxchg-alike function be handy here?
>>
>> you mean cmpxchg command in addition to
>> update() command ?
>> May be... it will have an extra 'value' argument
>> (key, old_value, new_value)
>> I don't have a use case for it yet though.
>
> I don't neither. ;)
>
> I just wanted to bring this up before user space api might get public
> and the additional argument might make problems.
addition of cmpxchg command won't be a problem obviously.
(just another 'new_value' field in existing struct inside bpf_attr union).
From: Alexei Starovoitov <[email protected]>
Date: Thu, 13 Nov 2014 17:36:49 -0800
> +static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
> + /* verifier checked that R1 contains a valid pointer to bpf_map
> + * and R2 points to a program stack and map->key_size bytes were
> + * initialized
> + */
> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
> + void *key = (void *) (unsigned long) r2;
> + void *value;
> +
> + WARN_ON_ONCE(!rcu_read_lock_held());
> +
> + value = map->ops->map_lookup_elem(map, key);
> +
> + /* lookup() returns either pointer to element value or NULL
> + * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
> + */
> + return (unsigned long) value;
> +}
You should translate this into a true boolean '1' or '0' value so that
kernel pointers don't propagate to the user or his eBPF programs.
On Sun, Nov 16, 2014 at 11:04 AM, David Miller <[email protected]> wrote:
> From: Alexei Starovoitov <[email protected]>
> Date: Thu, 13 Nov 2014 17:36:49 -0800
>
>> +static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
>> +{
>> + /* verifier checked that R1 contains a valid pointer to bpf_map
>> + * and R2 points to a program stack and map->key_size bytes were
>> + * initialized
>> + */
>> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
>> + void *key = (void *) (unsigned long) r2;
>> + void *value;
>> +
>> + WARN_ON_ONCE(!rcu_read_lock_held());
>> +
>> + value = map->ops->map_lookup_elem(map, key);
>> +
>> + /* lookup() returns either pointer to element value or NULL
>> + * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
>> + */
>> + return (unsigned long) value;
>> +}
>
> You should translate this into a true boolean '1' or '0' value so that
> kernel pointers don't propagate to the user or his eBPF programs.
that won't work. eBPF programs have to see all sorts of kernel
pointers. In this case it's a pointer to map element value
or NULL. There are pointers to stack, pointers to map root,
pointers to context, etc. Programs can read pointers from
other data structures. And in the case of tracing they can
pretty much access any kernel memory in read only way.
Just like 'perf probe' filters.
The requirement that _unprivileged_ programs should
not be able to pass all these pointers back to user is
well understood and was discussed in detail several
month back. It's verifier that will prevent leaking of
kernel addresses. Today, the whole thing is for root
only. When the infra is ready for non-root I will add
a pass to verifier, that will kick in only for unprivileged
programs. Verifier already tracks all pointers and
can prevent passing them to user. In this case
verifier knows that register R0 after a call to
bpf_map_lookup_elem() is
"either pointer to element value or NULL",
so it will prevent storing it into any memory or
doing arithmetic on it, so that user space cannot
see the pointer, whereas eBPF program can use
it to access map element value.
From: Alexei Starovoitov <[email protected]>
Date: Sun, 16 Nov 2014 13:24:53 -0800
> The requirement that _unprivileged_ programs should
> not be able to pass all these pointers back to user is
> well understood and was discussed in detail several
> month back. It's verifier that will prevent leaking of
> kernel addresses.
Ok, fair enough.
From: Alexei Starovoitov <[email protected]>
Date: Thu, 13 Nov 2014 17:36:43 -0800
> v1->v2:
> renamed flags for MAP_UPDATE_ELEM command to be more concise,
> clarified commit logs and improved comments in patches 1,3,7
> per discussions with Daniel
>
> Old v1 cover:
>
> this set of patches adds implementation of HASH and ARRAY types of eBPF maps
> which were described in manpage in commit b4fc1a460f30("Merge branch 'bpf-next'")
>
> The difference vs previous version of these patches from August:
> - added 'flags' attribute to BPF_MAP_UPDATE_ELEM
> - in HASH type implementation removed per-map kmem_cache.
> I was doing kmem_cache_create() for every map to enable selective slub
> debugging to check for overflows and leaks. Now it's not needed, so just
> use normal kmalloc() for map elements.
> - added ARRAY type which was mentioned in manpage, but wasn't public yet
> - added map testsuite and removed temporary bits from test_stubs
>
> Note, eBPF programs cannot be attached to events yet.
> It will come in the next set.
Series applied, thanks Alexei.