From: Sultan Alsawaf <[email protected]>
To prevent a race with itself from creating redundant entries,
mb_cache_entry_create scans through a hash-list of all current entries
to check whether the requested new entry has already been created.
Furthermore, it allocates memory for a new entry before scanning
through this hash-list, so the allocated memory is simply discarded
whenever the requested new entry turns out to already be present.
Speed up cache entry creation by keeping a small linked list of
requested new entries in progress, and scanning through that first
instead of the large hash-list. Additionally, defer allocating memory
for a new entry until it is known that the allocation will actually
be used.
Signed-off-by: Sultan Alsawaf <[email protected]>
---
fs/mbcache.c | 82 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 57 insertions(+), 25 deletions(-)
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 97c54d3a2227..289f3664061e 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -25,9 +25,14 @@
* size hash table is used for fast key lookups.
*/
+struct mb_bucket {
+ struct hlist_bl_head hash;
+ struct list_head req_list;
+};
+
struct mb_cache {
/* Hash table of entries */
- struct hlist_bl_head *c_hash;
+ struct mb_bucket *c_bucket;
/* log2 of hash table size */
int c_bucket_bits;
/* Maximum entries in cache to avoid degrading hash too much */
@@ -42,15 +47,21 @@ struct mb_cache {
struct work_struct c_shrink_work;
};
+struct mb_cache_req {
+ struct list_head lnode;
+ u32 key;
+ u64 value;
+};
+
static struct kmem_cache *mb_entry_cache;
static unsigned long mb_cache_shrink(struct mb_cache *cache,
unsigned long nr_to_scan);
-static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
- u32 key)
+static inline struct mb_bucket *mb_cache_entry_bucket(struct mb_cache *cache,
+ u32 key)
{
- return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
+ return &cache->c_bucket[hash_32(key, cache->c_bucket_bits)];
}
/*
@@ -77,6 +88,8 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
struct mb_cache_entry *entry, *dup;
struct hlist_bl_node *dup_node;
struct hlist_bl_head *head;
+ struct mb_cache_req *tmp_req, req;
+ struct mb_bucket *bucket;
/* Schedule background reclaim if there are too many entries */
if (cache->c_entry_count >= cache->c_max_entries)
@@ -85,9 +98,33 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
if (cache->c_entry_count >= 2*cache->c_max_entries)
mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
+ bucket = mb_cache_entry_bucket(cache, key);
+ head = &bucket->hash;
+ hlist_bl_lock(head);
+ list_for_each_entry(tmp_req, &bucket->req_list, lnode) {
+ if (tmp_req->key == key && tmp_req->value == value) {
+ hlist_bl_unlock(head);
+ return -EBUSY;
+ }
+ }
+ hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
+ if (dup->e_key == key && dup->e_value == value) {
+ hlist_bl_unlock(head);
+ return -EBUSY;
+ }
+ }
+ req.key = key;
+ req.value = value;
+ list_add(&req.lnode, &bucket->req_list);
+ hlist_bl_unlock(head);
+
entry = kmem_cache_alloc(mb_entry_cache, mask);
- if (!entry)
+ if (!entry) {
+ hlist_bl_lock(head);
+ list_del(&req.lnode);
+ hlist_bl_unlock(head);
return -ENOMEM;
+ }
INIT_LIST_HEAD(&entry->e_list);
/* One ref for hash, one ref returned */
@@ -96,15 +133,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
entry->e_value = value;
entry->e_reusable = reusable;
entry->e_referenced = 0;
- head = mb_cache_entry_head(cache, key);
+
hlist_bl_lock(head);
- hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
- if (dup->e_key == key && dup->e_value == value) {
- hlist_bl_unlock(head);
- kmem_cache_free(mb_entry_cache, entry);
- return -EBUSY;
- }
- }
+ list_del(&req.lnode);
hlist_bl_add_head(&entry->e_hash_list, head);
hlist_bl_unlock(head);
@@ -133,7 +164,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
struct hlist_bl_node *node;
struct hlist_bl_head *head;
- head = mb_cache_entry_head(cache, key);
+ head = &mb_cache_entry_bucket(cache, key)->hash;
hlist_bl_lock(head);
if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
node = entry->e_hash_list.next;
@@ -202,7 +233,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
struct hlist_bl_head *head;
struct mb_cache_entry *entry;
- head = mb_cache_entry_head(cache, key);
+ head = &mb_cache_entry_bucket(cache, key)->hash;
hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
if (entry->e_key == key && entry->e_value == value) {
@@ -230,7 +261,7 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
struct hlist_bl_head *head;
struct mb_cache_entry *entry;
- head = mb_cache_entry_head(cache, key);
+ head = &mb_cache_entry_bucket(cache, key)->hash;
hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
if (entry->e_key == key && entry->e_value == value) {
@@ -300,7 +331,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
* from under us.
*/
spin_unlock(&cache->c_list_lock);
- head = mb_cache_entry_head(cache, entry->e_key);
+ head = &mb_cache_entry_bucket(cache, entry->e_key)->hash;
hlist_bl_lock(head);
if (!hlist_bl_unhashed(&entry->e_hash_list)) {
hlist_bl_del_init(&entry->e_hash_list);
@@ -354,21 +385,22 @@ struct mb_cache *mb_cache_create(int bucket_bits)
cache->c_max_entries = bucket_count << 4;
INIT_LIST_HEAD(&cache->c_list);
spin_lock_init(&cache->c_list_lock);
- cache->c_hash = kmalloc_array(bucket_count,
- sizeof(struct hlist_bl_head),
- GFP_KERNEL);
- if (!cache->c_hash) {
+ cache->c_bucket = kmalloc_array(bucket_count, sizeof(*cache->c_bucket),
+ GFP_KERNEL);
+ if (!cache->c_bucket) {
kfree(cache);
goto err_out;
}
- for (i = 0; i < bucket_count; i++)
- INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
+ for (i = 0; i < bucket_count; i++) {
+ INIT_HLIST_BL_HEAD(&cache->c_bucket[i].hash);
+ INIT_LIST_HEAD(&cache->c_bucket[i].req_list);
+ }
cache->c_shrink.count_objects = mb_cache_count;
cache->c_shrink.scan_objects = mb_cache_scan;
cache->c_shrink.seeks = DEFAULT_SEEKS;
if (register_shrinker(&cache->c_shrink)) {
- kfree(cache->c_hash);
+ kfree(cache->c_bucket);
kfree(cache);
goto err_out;
}
@@ -409,7 +441,7 @@ void mb_cache_destroy(struct mb_cache *cache)
WARN_ON(atomic_read(&entry->e_refcnt) != 1);
mb_cache_entry_put(cache, entry);
}
- kfree(cache->c_hash);
+ kfree(cache->c_bucket);
kfree(cache);
}
EXPORT_SYMBOL(mb_cache_destroy);
--
2.22.0
On Jul 22, 2019, at 11:35 PM, Sultan Alsawaf <[email protected]> wrote:
>
> From: Sultan Alsawaf <[email protected]>
>
> In order to prevent redundant entry creation by racing against itself,
> mb_cache_entry_create scans through a hash-list of all current entries
> in order to see if another allocation for the requested new entry has
> been made. Furthermore, it allocates memory for a new entry before
> scanning through this hash-list, which results in that allocated memory
> being discarded when the requested new entry is already present.
>
> Speed up cache entry creation by keeping a small linked list of
> requested new entries in progress, and scanning through that first
> instead of the large hash-list. And don't allocate memory for a new
> entry until it's known that the allocated memory will be used.
Do you have any kind of performance metrics that show this is an actual
improvement in performance? This would be either macro-level benchmarks
(e.g. fio, but this seems unlikely to show any benefit), or micro-level
measurements (e.g. flame graph) that show a net reduction in CPU cycles,
lock contention, etc. in this part of the code.
Cheers, Andreas
> Signed-off-by: Sultan Alsawaf <[email protected]>
> ---
> fs/mbcache.c | 82 ++++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 57 insertions(+), 25 deletions(-)
>
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 97c54d3a2227..289f3664061e 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -25,9 +25,14 @@
> * size hash table is used for fast key lookups.
> */
>
> +struct mb_bucket {
> + struct hlist_bl_head hash;
> + struct list_head req_list;
> +};
> +
> struct mb_cache {
> /* Hash table of entries */
> - struct hlist_bl_head *c_hash;
> + struct mb_bucket *c_bucket;
> /* log2 of hash table size */
> int c_bucket_bits;
> /* Maximum entries in cache to avoid degrading hash too much */
> @@ -42,15 +47,21 @@ struct mb_cache {
> struct work_struct c_shrink_work;
> };
>
> +struct mb_cache_req {
> + struct list_head lnode;
> + u32 key;
> + u64 value;
> +};
> +
> static struct kmem_cache *mb_entry_cache;
>
> static unsigned long mb_cache_shrink(struct mb_cache *cache,
> unsigned long nr_to_scan);
>
> -static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
> - u32 key)
> +static inline struct mb_bucket *mb_cache_entry_bucket(struct mb_cache *cache,
> + u32 key)
> {
> - return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
> + return &cache->c_bucket[hash_32(key, cache->c_bucket_bits)];
> }
>
> /*
> @@ -77,6 +88,8 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> struct mb_cache_entry *entry, *dup;
> struct hlist_bl_node *dup_node;
> struct hlist_bl_head *head;
> + struct mb_cache_req *tmp_req, req;
> + struct mb_bucket *bucket;
>
> /* Schedule background reclaim if there are too many entries */
> if (cache->c_entry_count >= cache->c_max_entries)
> @@ -85,9 +98,33 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> if (cache->c_entry_count >= 2*cache->c_max_entries)
> mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
>
> + bucket = mb_cache_entry_bucket(cache, key);
> + head = &bucket->hash;
> + hlist_bl_lock(head);
> + list_for_each_entry(tmp_req, &bucket->req_list, lnode) {
> + if (tmp_req->key == key && tmp_req->value == value) {
> + hlist_bl_unlock(head);
> + return -EBUSY;
> + }
> + }
> + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
> + if (dup->e_key == key && dup->e_value == value) {
> + hlist_bl_unlock(head);
> + return -EBUSY;
> + }
> + }
> + req.key = key;
> + req.value = value;
> + list_add(&req.lnode, &bucket->req_list);
> + hlist_bl_unlock(head);
> +
> entry = kmem_cache_alloc(mb_entry_cache, mask);
> - if (!entry)
> + if (!entry) {
> + hlist_bl_lock(head);
> + list_del(&req.lnode);
> + hlist_bl_unlock(head);
> return -ENOMEM;
> + }
>
> INIT_LIST_HEAD(&entry->e_list);
> /* One ref for hash, one ref returned */
> @@ -96,15 +133,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> entry->e_value = value;
> entry->e_reusable = reusable;
> entry->e_referenced = 0;
> - head = mb_cache_entry_head(cache, key);
> +
> hlist_bl_lock(head);
> - hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
> - if (dup->e_key == key && dup->e_value == value) {
> - hlist_bl_unlock(head);
> - kmem_cache_free(mb_entry_cache, entry);
> - return -EBUSY;
> - }
> - }
> + list_del(&req.lnode);
> hlist_bl_add_head(&entry->e_hash_list, head);
> hlist_bl_unlock(head);
>
> @@ -133,7 +164,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
> struct hlist_bl_node *node;
> struct hlist_bl_head *head;
>
> - head = mb_cache_entry_head(cache, key);
> + head = &mb_cache_entry_bucket(cache, key)->hash;
> hlist_bl_lock(head);
> if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
> node = entry->e_hash_list.next;
> @@ -202,7 +233,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> struct hlist_bl_head *head;
> struct mb_cache_entry *entry;
>
> - head = mb_cache_entry_head(cache, key);
> + head = &mb_cache_entry_bucket(cache, key)->hash;
> hlist_bl_lock(head);
> hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> if (entry->e_key == key && entry->e_value == value) {
> @@ -230,7 +261,7 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
> struct hlist_bl_head *head;
> struct mb_cache_entry *entry;
>
> - head = mb_cache_entry_head(cache, key);
> + head = &mb_cache_entry_bucket(cache, key)->hash;
> hlist_bl_lock(head);
> hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> if (entry->e_key == key && entry->e_value == value) {
> @@ -300,7 +331,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
> * from under us.
> */
> spin_unlock(&cache->c_list_lock);
> - head = mb_cache_entry_head(cache, entry->e_key);
> + head = &mb_cache_entry_bucket(cache, entry->e_key)->hash;
> hlist_bl_lock(head);
> if (!hlist_bl_unhashed(&entry->e_hash_list)) {
> hlist_bl_del_init(&entry->e_hash_list);
> @@ -354,21 +385,22 @@ struct mb_cache *mb_cache_create(int bucket_bits)
> cache->c_max_entries = bucket_count << 4;
> INIT_LIST_HEAD(&cache->c_list);
> spin_lock_init(&cache->c_list_lock);
> - cache->c_hash = kmalloc_array(bucket_count,
> - sizeof(struct hlist_bl_head),
> - GFP_KERNEL);
> - if (!cache->c_hash) {
> + cache->c_bucket = kmalloc_array(bucket_count, sizeof(*cache->c_bucket),
> + GFP_KERNEL);
> + if (!cache->c_bucket) {
> kfree(cache);
> goto err_out;
> }
> - for (i = 0; i < bucket_count; i++)
> - INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
> + for (i = 0; i < bucket_count; i++) {
> + INIT_HLIST_BL_HEAD(&cache->c_bucket[i].hash);
> + INIT_LIST_HEAD(&cache->c_bucket[i].req_list);
> + }
>
> cache->c_shrink.count_objects = mb_cache_count;
> cache->c_shrink.scan_objects = mb_cache_scan;
> cache->c_shrink.seeks = DEFAULT_SEEKS;
> if (register_shrinker(&cache->c_shrink)) {
> - kfree(cache->c_hash);
> + kfree(cache->c_bucket);
> kfree(cache);
> goto err_out;
> }
> @@ -409,7 +441,7 @@ void mb_cache_destroy(struct mb_cache *cache)
> WARN_ON(atomic_read(&entry->e_refcnt) != 1);
> mb_cache_entry_put(cache, entry);
> }
> - kfree(cache->c_hash);
> + kfree(cache->c_bucket);
> kfree(cache);
> }
> EXPORT_SYMBOL(mb_cache_destroy);
> --
> 2.22.0
>
Cheers, Andreas
On Tue, Jul 23, 2019 at 10:56:05AM -0600, Andreas Dilger wrote:
> Do you have any kind of performance metrics that show this is an actual
> improvement in performance? This would be either macro-level benchmarks
> (e.g. fio, but this seems unlikely to show any benefit), or micro-level
> measurements (e.g. flame graph) that show a net reduction in CPU cycles,
> lock contention, etc. in this part of the code.
Hi Andreas,
Here are some basic micro-benchmark results:
Before:
[ 3.162896] mb_cache_entry_create: AVG cycles: 75
[ 3.054701] mb_cache_entry_create: AVG cycles: 78
[ 3.152321] mb_cache_entry_create: AVG cycles: 77
After:
[ 3.043380] mb_cache_entry_create: AVG cycles: 68
[ 3.194321] mb_cache_entry_create: AVG cycles: 71
[ 3.038100] mb_cache_entry_create: AVG cycles: 69
The performance difference is probably more drastic when free memory is low,
since an unnecessary call to kmem_cache_alloc() can result in a long wait for
pages to be freed.
The micro-benchmark code is attached.
Thanks,
Sultan
---
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 289f3664061e..e0f22ff8fab8 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -82,7 +82,7 @@ static inline struct mb_bucket *mb_cache_entry_bucket(struct mb_cache *cache,
* -EBUSY if entry with the same key and value already exists in cache.
* Otherwise 0 is returned.
*/
-int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
+static int __mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
u64 value, bool reusable)
{
struct mb_cache_entry *entry, *dup;
@@ -148,6 +148,29 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
return 0;
}
+
+int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
+ u64 value, bool reusable)
+{
+ static unsigned long count, sum;
+ static DEFINE_MUTEX(lock);
+ volatile cycles_t start, delta;
+ int ret;
+
+ mutex_lock(&lock);
+ local_irq_disable();
+ start = get_cycles();
+ ret = __mb_cache_entry_create(cache, mask, key, value, reusable);
+ delta = get_cycles() - start;
+ local_irq_enable();
+
+ sum += delta;
+ if (++count == 1000)
+ printk("%s: AVG cycles: %lu\n", __func__, sum / count);
+ mutex_unlock(&lock);
+
+ return ret;
+}
EXPORT_SYMBOL(mb_cache_entry_create);
void __mb_cache_entry_free(struct mb_cache_entry *entry)
On Jul 23, 2019, at 10:01 PM, Sultan Alsawaf <[email protected]> wrote:
>
> On Tue, Jul 23, 2019 at 10:56:05AM -0600, Andreas Dilger wrote:
>> Do you have any kind of performance metrics that show this is an actual
>> improvement in performance? This would be either macro-level benchmarks
>> (e.g. fio, but this seems unlikely to show any benefit), or micro-level
>> measurements (e.g. flame graph) that show a net reduction in CPU cycles,
>> lock contention, etc. in this part of the code.
>
> Hi Andreas,
>
> Here are some basic micro-benchmark results:
>
> Before:
> [ 3.162896] mb_cache_entry_create: AVG cycles: 75
> [ 3.054701] mb_cache_entry_create: AVG cycles: 78
> [ 3.152321] mb_cache_entry_create: AVG cycles: 77
>
> After:
> [ 3.043380] mb_cache_entry_create: AVG cycles: 68
> [ 3.194321] mb_cache_entry_create: AVG cycles: 71
> [ 3.038100] mb_cache_entry_create: AVG cycles: 69
This information should be included in the patch description, since that
allows making a decision on whether the patch is worthwhile to land or not.
> The performance difference is probably more drastic when free memory is low,
> since an unnecessary call to kmem_cache_alloc() can result in a long wait for
> pages to be freed.
Cheers, Andreas