LinuxLists.cc - [PATCH 0/3] bpf: hash: use per-bucket spinlock

2015-12-26 09:31:45

Subject: [PATCH 0/3] bpf: hash: use per-bucket spinlock

Hi,

This patchset tries to optimize ebpf hash map, and follows
the idea:

Both htab_map_update_elem() and htab_map_delete_elem()
can be called from eBPF program, and they may be in kernel
hot path, it isn't efficient to use a per-hashtable lock
in this two helpers, so this patch converts the lock into
per-bucket spinlock.

With this patchset, looks the performance penalty from eBPF
decreased a lot, see the following test:

1) run 'tools/biolatency' of bcc before running block test;

2) run fio to test block throught over /dev/nullb0,
(randread, 16jobs, libaio, 4k bs) and the test box
is one 24cores(dual sockets) VM server:
- without patchset: 607K IOPS
- with this patchset: 1184K IOPS
- without running eBPF prog: 1492K IOPS

TODO:
- remove the per-hashtable atomic counter

kernel/bpf/hashtab.c | 49 +++++++++++++++++++++++++++++++------------------
1 file changed, 31 insertions(+), 18 deletions(-)

thanks,
Ming

2015-12-26 09:32:03

by Ming Lei

[permalink] [raw]

Subject: [PATCH 1/3] bpf: hash: use atomic count

Preparing for removing global per-hashtable lock, so
the counter need to be defined as aotmic_t first.

Signed-off-by: Ming Lei <[email protected]>
---
kernel/bpf/hashtab.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34777b3..2615388 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,7 +18,7 @@ struct bpf_htab {
struct bpf_map map;
struct hlist_head *buckets;
raw_spinlock_t lock;
- u32 count; /* number of elements in this hashtable */
+ atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
@@ -106,7 +106,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
INIT_HLIST_HEAD(&htab->buckets[i]);

raw_spin_lock_init(&htab->lock);
- htab->count = 0;
+ atomic_set(&htab->count, 0);

return &htab->map;

@@ -256,7 +256,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,

l_old = lookup_elem_raw(head, l_new->hash, key, key_size);

- if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
/* if elem with this 'key' doesn't exist and we've reached
* max_entries limit, fail insertion of new elem
*/
@@ -284,7 +284,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_del_rcu(&l_old->hash_node);
kfree_rcu(l_old, rcu);
} else {
- htab->count++;
+ atomic_inc(&htab->count);
}
raw_spin_unlock_irqrestore(&htab->lock, flags);

@@ -319,7 +319,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)

if (l) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
+ atomic_dec(&htab->count);
kfree_rcu(l, rcu);
ret = 0;
}
@@ -339,7 +339,7 @@ static void delete_all_elements(struct bpf_htab *htab)

hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
+ atomic_dec(&htab->count);
kfree(l);
}
}
--
1.9.1

2015-12-26 09:32:56

by Ming Lei

[permalink] [raw]

Subject: [PATCH 2/3] bpf: hash: move select_bucket() out of htab's spinlock

The spinlock is just used for protecting the per-bucket
hlist, so it isn't needed for selecting bucket.

Signed-off-by: Ming Lei <[email protected]>
---
kernel/bpf/hashtab.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2615388..d857fcb 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -248,12 +248,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);

l_new->hash = htab_map_hash(l_new->key, key_size);
+ head = select_bucket(htab, l_new->hash);

/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&htab->lock, flags);

- head = select_bucket(htab, l_new->hash);
-
l_old = lookup_elem_raw(head, l_new->hash, key, key_size);

if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
@@ -310,11 +309,10 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;

hash = htab_map_hash(key, key_size);
+ head = select_bucket(htab, hash);

raw_spin_lock_irqsave(&htab->lock, flags);

- head = select_bucket(htab, hash);
-
l = lookup_elem_raw(head, hash, key, key_size);

if (l) {
--
1.9.1

2015-12-26 09:33:27

by Ming Lei

[permalink] [raw]

Subject: [PATCH 3/3] bpf: hash: use per-bucket spinlock

From: Ming Lei <[email protected]>

Both htab_map_update_elem() and htab_map_delete_elem() can be
called from eBPF program, and they may be in kernel hot path,
so it isn't efficient to use a per-hashtable lock in this two
helpers.

The per-hashtable spinlock is used just for protecting bucket's
hlist, and per-bucket lock should be enough. This patch converts
the per-hashtable lock into per-bucket spinlock, so that contention
can be decreased a lot.

Signed-off-by: Ming Lei <[email protected]>
---
kernel/bpf/hashtab.c | 35 +++++++++++++++++++++++++----------
1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d857fcb..66bad7a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,9 +14,14 @@
#include <linux/filter.h>
#include <linux/vmalloc.h>

+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
struct bpf_htab {
struct bpf_map map;
- struct hlist_head *buckets;
+ struct bucket *buckets;
raw_spinlock_t lock;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
@@ -88,24 +93,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
goto free_htab;

- htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
htab->elem_size * htab->map.max_entries,
PAGE_SIZE) >> PAGE_SHIFT;

err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
GFP_USER | __GFP_NOWARN);

if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
if (!htab->buckets)
goto free_htab;
}

- for (i = 0; i < htab->n_buckets; i++)
- INIT_HLIST_HEAD(&htab->buckets[i]);
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }

- raw_spin_lock_init(&htab->lock);
atomic_set(&htab->count, 0);

return &htab->map;
@@ -120,11 +126,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
return jhash(key, key_len, 0);
}

-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}

+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
void *key, u32 key_size)
{
@@ -227,6 +238,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old;
struct hlist_head *head;
+ struct bucket *b;
unsigned long flags;
u32 key_size;
int ret;
@@ -248,7 +260,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);

l_new->hash = htab_map_hash(l_new->key, key_size);
- head = select_bucket(htab, l_new->hash);
+ b = __select_bucket(htab, l_new->hash);
+ head = &b->head;

/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&htab->lock, flags);
@@ -299,6 +312,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
+ struct bucket *b;
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
@@ -309,7 +323,8 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;

hash = htab_map_hash(key, key_size);
- head = select_bucket(htab, hash);
+ b = __select_bucket(htab, hash);
+ head = &b->head;

raw_spin_lock_irqsave(&htab->lock, flags);

--
1.9.1

2015-12-28 09:03:42

by Daniel Borkmann

[permalink] [raw]

Subject: Re: [PATCH 1/3] bpf: hash: use atomic count

On 12/26/2015 10:31 AM, Ming Lei wrote:
> Preparing for removing global per-hashtable lock, so
> the counter need to be defined as aotmic_t first.
>
> Signed-off-by: Ming Lei <[email protected]>

Acked-by: Daniel Borkmann <[email protected]>

2015-12-28 09:04:56

by Daniel Borkmann

[permalink] [raw]

Subject: Re: [PATCH 2/3] bpf: hash: move select_bucket() out of htab's spinlock

On 12/26/2015 10:31 AM, Ming Lei wrote:
> The spinlock is just used for protecting the per-bucket
> hlist, so it isn't needed for selecting bucket.
>
> Signed-off-by: Ming Lei <[email protected]>

Acked-by: Daniel Borkmann <[email protected]>

2015-12-28 09:13:12

by Daniel Borkmann

[permalink] [raw]

Subject: Re: [PATCH 3/3] bpf: hash: use per-bucket spinlock

On 12/26/2015 10:31 AM, Ming Lei wrote:
> From: Ming Lei <[email protected]>
>
> Both htab_map_update_elem() and htab_map_delete_elem() can be
> called from eBPF program, and they may be in kernel hot path,
> so it isn't efficient to use a per-hashtable lock in this two
> helpers.
>
> The per-hashtable spinlock is used just for protecting bucket's
> hlist, and per-bucket lock should be enough. This patch converts
> the per-hashtable lock into per-bucket spinlock, so that contention
> can be decreased a lot.
>
> Signed-off-by: Ming Lei <[email protected]>
> ---
> kernel/bpf/hashtab.c | 35 +++++++++++++++++++++++++----------
> 1 file changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index d857fcb..66bad7a 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -14,9 +14,14 @@
> #include <linux/filter.h>
> #include <linux/vmalloc.h>
>
> +struct bucket {
> + struct hlist_head head;
> + raw_spinlock_t lock;
> +};
> +
> struct bpf_htab {
> struct bpf_map map;
> - struct hlist_head *buckets;
> + struct bucket *buckets;
> raw_spinlock_t lock;

Shouldn't the lock member be removed from here?

> atomic_t count; /* number of elements in this hashtable */
> u32 n_buckets; /* number of hash buckets */
> @@ -88,24 +93,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> /* make sure page count doesn't overflow */
> goto free_htab;
>
> - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
> + htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
> htab->elem_size * htab->map.max_entries,
> PAGE_SIZE) >> PAGE_SHIFT;
>
> err = -ENOMEM;
> - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
> + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
> GFP_USER | __GFP_NOWARN);
>
> if (!htab->buckets) {
> - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
> + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
> if (!htab->buckets)
> goto free_htab;
> }
>
> - for (i = 0; i < htab->n_buckets; i++)
> - INIT_HLIST_HEAD(&htab->buckets[i]);
> + for (i = 0; i < htab->n_buckets; i++) {
> + INIT_HLIST_HEAD(&htab->buckets[i].head);
> + raw_spin_lock_init(&htab->buckets[i].lock);
> + }
>
> - raw_spin_lock_init(&htab->lock);
> atomic_set(&htab->count, 0);
>
> return &htab->map;
> @@ -120,11 +126,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
> return jhash(key, key_len, 0);
> }
>
> -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> {
> return &htab->buckets[hash & (htab->n_buckets - 1)];
> }
>
> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> + return &__select_bucket(htab, hash)->head;
> +}
> +
> static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
> void *key, u32 key_size)
> {
> @@ -227,6 +238,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> struct htab_elem *l_new, *l_old;
> struct hlist_head *head;
> + struct bucket *b;
> unsigned long flags;
> u32 key_size;
> int ret;
> @@ -248,7 +260,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
>
> l_new->hash = htab_map_hash(l_new->key, key_size);
> - head = select_bucket(htab, l_new->hash);
> + b = __select_bucket(htab, l_new->hash);
> + head = &b->head;
>
> /* bpf_map_update_elem() can be called in_irq() */
> raw_spin_lock_irqsave(&htab->lock, flags);

I am a bit confused on this one, though.

The raw spin lock is still per hashtable (htab->lock), and has not been converted in
this patch to the per bucket one (as opposed to what the commit message says), so
this patch seems to go into the right direction, but is a bit incomplete? Shouldn't
the above f.e. take b->lock, etc?

> @@ -299,6 +312,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
> {
> struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> struct hlist_head *head;
> + struct bucket *b;
> struct htab_elem *l;
> unsigned long flags;
> u32 hash, key_size;
> @@ -309,7 +323,8 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
> key_size = map->key_size;
>
> hash = htab_map_hash(key, key_size);
> - head = select_bucket(htab, hash);
> + b = __select_bucket(htab, hash);
> + head = &b->head;
>
> raw_spin_lock_irqsave(&htab->lock, flags);
>

Same here?

Thanks,
Daniel

2015-12-28 12:20:37

by Ming Lei

[permalink] [raw]

Subject: Re: [PATCH 3/3] bpf: hash: use per-bucket spinlock

On Mon, Dec 28, 2015 at 5:13 PM, Daniel Borkmann <[email protected]> wrote:
> On 12/26/2015 10:31 AM, Ming Lei wrote:
>>
>> From: Ming Lei <[email protected]>
>>
>> Both htab_map_update_elem() and htab_map_delete_elem() can be
>> called from eBPF program, and they may be in kernel hot path,
>> so it isn't efficient to use a per-hashtable lock in this two
>> helpers.
>>
>> The per-hashtable spinlock is used just for protecting bucket's
>> hlist, and per-bucket lock should be enough. This patch converts
>> the per-hashtable lock into per-bucket spinlock, so that contention
>> can be decreased a lot.
>>
>> Signed-off-by: Ming Lei <[email protected]>
>> ---
>> kernel/bpf/hashtab.c | 35 +++++++++++++++++++++++++----------
>> 1 file changed, 25 insertions(+), 10 deletions(-)
>>
>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>> index d857fcb..66bad7a 100644
>> --- a/kernel/bpf/hashtab.c
>> +++ b/kernel/bpf/hashtab.c
>> @@ -14,9 +14,14 @@
>> #include <linux/filter.h>
>> #include <linux/vmalloc.h>
>>
>> +struct bucket {
>> + struct hlist_head head;
>> + raw_spinlock_t lock;
>> +};
>> +
>> struct bpf_htab {
>> struct bpf_map map;
>> - struct hlist_head *buckets;
>> + struct bucket *buckets;
>> raw_spinlock_t lock;
>
>
> Shouldn't the lock member be removed from here?

Hammm, looks this one is a bad version, sorry for that, and will resend it.

>
>
>> atomic_t count; /* number of elements in this hashtable */
>> u32 n_buckets; /* number of hash buckets */
>> @@ -88,24 +93,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr
>> *attr)
>> /* make sure page count doesn't overflow */
>> goto free_htab;
>>
>> - htab->map.pages = round_up(htab->n_buckets * sizeof(struct
>> hlist_head) +
>> + htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket)
>> +
>> htab->elem_size *
>> htab->map.max_entries,
>> PAGE_SIZE) >> PAGE_SHIFT;
>>
>> err = -ENOMEM;
>> - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct
>> hlist_head),
>> + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct
>> bucket),
>> GFP_USER | __GFP_NOWARN);
>>
>> if (!htab->buckets) {
>> - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct
>> hlist_head));
>> + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct
>> bucket));
>> if (!htab->buckets)
>> goto free_htab;
>> }
>>
>> - for (i = 0; i < htab->n_buckets; i++)
>> - INIT_HLIST_HEAD(&htab->buckets[i]);
>> + for (i = 0; i < htab->n_buckets; i++) {
>> + INIT_HLIST_HEAD(&htab->buckets[i].head);
>> + raw_spin_lock_init(&htab->buckets[i].lock);
>> + }
>>
>> - raw_spin_lock_init(&htab->lock);
>> atomic_set(&htab->count, 0);
>>
>> return &htab->map;
>> @@ -120,11 +126,16 @@ static inline u32 htab_map_hash(const void *key, u32
>> key_len)
>> return jhash(key, key_len, 0);
>> }
>>
>> -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32
>> hash)
>> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32
>> hash)
>> {
>> return &htab->buckets[hash & (htab->n_buckets - 1)];
>> }
>>
>> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32
>> hash)
>> +{
>> + return &__select_bucket(htab, hash)->head;
>> +}
>> +
>> static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32
>> hash,
>> void *key, u32 key_size)
>> {
>> @@ -227,6 +238,7 @@ static int htab_map_update_elem(struct bpf_map *map,
>> void *key, void *value,
>> struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
>> struct htab_elem *l_new, *l_old;
>> struct hlist_head *head;
>> + struct bucket *b;
>> unsigned long flags;
>> u32 key_size;
>> int ret;
>> @@ -248,7 +260,8 @@ static int htab_map_update_elem(struct bpf_map *map,
>> void *key, void *value,
>> memcpy(l_new->key + round_up(key_size, 8), value,
>> map->value_size);
>>
>> l_new->hash = htab_map_hash(l_new->key, key_size);
>> - head = select_bucket(htab, l_new->hash);
>> + b = __select_bucket(htab, l_new->hash);
>> + head = &b->head;
>>
>> /* bpf_map_update_elem() can be called in_irq() */
>> raw_spin_lock_irqsave(&htab->lock, flags);
>
>
> I am a bit confused on this one, though.
>
> The raw spin lock is still per hashtable (htab->lock), and has not been
> converted in
> this patch to the per bucket one (as opposed to what the commit message
> says), so
> this patch seems to go into the right direction, but is a bit incomplete?
> Shouldn't
> the above f.e. take b->lock, etc?
>
>> @@ -299,6 +312,7 @@ static int htab_map_delete_elem(struct bpf_map *map,
>> void *key)
>> {
>> struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
>> struct hlist_head *head;
>> + struct bucket *b;
>> struct htab_elem *l;
>> unsigned long flags;
>> u32 hash, key_size;
>> @@ -309,7 +323,8 @@ static int htab_map_delete_elem(struct bpf_map *map,
>> void *key)
>> key_size = map->key_size;
>>
>> hash = htab_map_hash(key, key_size);
>> - head = select_bucket(htab, hash);
>> + b = __select_bucket(htab, hash);
>> + head = &b->head;
>>
>> raw_spin_lock_irqsave(&htab->lock, flags);
>>
>
> Same here?
>
> Thanks,
> Daniel

--
Ming Lei