2019-11-17 18:55:26

by Vitaly Wool

Subject: [PATCH] zswap: use B-tree for search

From: Vitaly Wool <[email protected]>

The current zswap implementation uses red-black trees to store
entries and to perform lookups. Although this algorithm has O(log N)
complexity, completing a lookup (or, even worse, a replacement) of an
entry still takes a while when the number of entries is large (100K+).

B-trees are known to handle such cases more efficiently (i.e. also
with O(log N) complexity but with a much lower constant factor), so
trying zswap with B-trees was worth a shot.
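
For a back-of-the-envelope sense of that constant factor: a red-black
tree with N = 100K entries can be up to 2*log2(N) ~= 34 levels deep,
with one pointer dereference per level. Assuming the in-kernel B-tree
implementation's roughly cacheline-sized nodes holding about 8 64-bit
keys each, the same N needs only about log8(100K) ~= 6 node visits.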

The B-tree implementation currently present in the Linux kernel
doesn't do things in the best possible way (e.g. it uses recursion),
but the testing I've run still shows a very significant performance
increase.

The B-tree usage pattern here doesn't exactly follow the guidelines,
but that is because pgoff_t may be either 32 or 64 bits long.
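
In short, the approach (a condensed restatement of the hunks below,
not new code) is to pick the key geometry matching sizeof(pgoff_t)
once at init time and pass it to every btree call:

    static struct btree_geo *btree_pgofft_geo;

    /* done once, at zswap init time */
    if (sizeof(pgoff_t) == 8)
        btree_pgofft_geo = &btree_geo64;
    else
        btree_pgofft_geo = &btree_geo32;

    /* pgoff_t has the width of unsigned long, so &offset is a valid key */
    entry = btree_lookup(&tree->head, btree_pgofft_geo, &offset);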

Tested on qemu-kvm (-smp 2 -m 1024) with zswap in the following
configuration:
* zpool: z3fold
* max_pool_percent: 100
and a swap size of 1G.

Test command:
$ stress-ng --io 4 --vm 4 --vm-bytes 1000M --timeout 300s --metrics

This, averaged over 20 runs on qemu-kvm (-smp 2 -m 1024), gives the
following io bogo ops:
* original: 73778.8
* btree: 393999

Signed-off-by: Vitaly Wool <[email protected]>
---
mm/Kconfig | 1 +
mm/zswap.c | 147 +++++++++++++++++++++++------------------------------
2 files changed, 65 insertions(+), 83 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index a5dae9a7eb51..b278026de513 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -527,6 +527,7 @@ config ZSWAP
depends on FRONTSWAP && CRYPTO=y
select CRYPTO_LZO
select ZPOOL
+ select BTREE
help
A lightweight compressed cache for swap pages. It takes
pages that are in the process of being swapped out and attempts to
diff --git a/mm/zswap.c b/mm/zswap.c
index 46a322316e52..5f46c4f7a6bf 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -21,7 +21,7 @@
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
-#include <linux/rbtree.h>
+#include <linux/btree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/mempool.h>
@@ -134,7 +134,6 @@ struct zswap_pool {
* This structure contains the metadata for tracking a single compressed
* page within zswap.
*
- * rbnode - links the entry into red-black tree for the appropriate swap type
* offset - the swap offset for the entry. Index into the red-black tree.
* refcount - the number of outstanding reference to the entry. This is needed
* to protect against premature freeing of the entry by code
@@ -149,7 +148,6 @@ struct zswap_pool {
* value - value of the same-value filled pages which have same content
*/
struct zswap_entry {
- struct rb_node rbnode;
pgoff_t offset;
int refcount;
unsigned int length;
@@ -166,11 +164,11 @@ struct zswap_header {

/*
* The tree lock in the zswap_tree struct protects a few things:
- * - the rbtree
+ * - the tree
* - the refcount field of each entry in the tree
*/
struct zswap_tree {
- struct rb_root rbroot;
+ struct btree_head head;
spinlock_t lock;
};

@@ -252,7 +250,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
if (!entry)
return NULL;
entry->refcount = 1;
- RB_CLEAR_NODE(&entry->rbnode);
return entry;
}

@@ -262,58 +259,18 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
}

/*********************************
-* rbtree functions
+* btree functions
**********************************/
-static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
-{
- struct rb_node *node = root->rb_node;
- struct zswap_entry *entry;
+static struct btree_geo *btree_pgofft_geo;

- while (node) {
- entry = rb_entry(node, struct zswap_entry, rbnode);
- if (entry->offset > offset)
- node = node->rb_left;
- else if (entry->offset < offset)
- node = node->rb_right;
- else
- return entry;
- }
- return NULL;
-}
-
-/*
- * In the case that a entry with the same offset is found, a pointer to
- * the existing entry is stored in dupentry and the function returns -EEXIST
- */
-static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
- struct zswap_entry **dupentry)
+static struct zswap_entry *zswap_search(struct btree_head *head, pgoff_t offset)
{
- struct rb_node **link = &root->rb_node, *parent = NULL;
- struct zswap_entry *myentry;
-
- while (*link) {
- parent = *link;
- myentry = rb_entry(parent, struct zswap_entry, rbnode);
- if (myentry->offset > entry->offset)
- link = &(*link)->rb_left;
- else if (myentry->offset < entry->offset)
- link = &(*link)->rb_right;
- else {
- *dupentry = myentry;
- return -EEXIST;
- }
- }
- rb_link_node(&entry->rbnode, parent, link);
- rb_insert_color(&entry->rbnode, root);
- return 0;
+ return btree_lookup(head, btree_pgofft_geo, &offset);
}

-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+static void zswap_erase(struct btree_head *head, struct zswap_entry *entry)
{
- if (!RB_EMPTY_NODE(&entry->rbnode)) {
- rb_erase(&entry->rbnode, root);
- RB_CLEAR_NODE(&entry->rbnode);
- }
+ btree_remove(head, btree_pgofft_geo, &entry->offset);
}

/*
@@ -342,25 +299,40 @@ static void zswap_entry_get(struct zswap_entry *entry)
/* caller must hold the tree lock
* remove from the tree and free it, if nobody reference the entry
*/
-static void zswap_entry_put(struct zswap_tree *tree,
+static void zswap_entry_put(struct btree_head *head,
struct zswap_entry *entry)
{
int refcount = --entry->refcount;

BUG_ON(refcount < 0);
if (refcount == 0) {
- zswap_rb_erase(&tree->rbroot, entry);
+ zswap_erase(head, entry);
zswap_free_entry(entry);
}
}

+static int zswap_insert_or_replace(struct btree_head *head,
+ struct zswap_entry *entry)
+{
+ struct zswap_entry *old;
+
+ do {
+ old = btree_remove(head, btree_pgofft_geo, &entry->offset);
+ if (old) {
+ zswap_duplicate_entry++;
+ zswap_entry_put(head, old);
+ }
+ } while (old);
+ return btree_insert(head, btree_pgofft_geo, &entry->offset, entry,
+ GFP_ATOMIC);
+}
/* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
+static struct zswap_entry *zswap_entry_find_get(struct btree_head *head,
pgoff_t offset)
{
struct zswap_entry *entry;

- entry = zswap_rb_search(root, offset);
+ entry = zswap_search(head, offset);
if (entry)
zswap_entry_get(entry);

@@ -861,7 +833,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)

/* find and ref zswap entry */
spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_entry_find_get(&tree->head, offset);
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
@@ -910,7 +882,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)

spin_lock(&tree->lock);
/* drop local reference */
- zswap_entry_put(tree, entry);
+ zswap_entry_put(&tree->head, entry);

/*
* There are two possible situations for entry here:
@@ -919,8 +891,8 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
* because invalidate happened during writeback
* search the tree and free the entry if find entry
*/
- if (entry == zswap_rb_search(&tree->rbroot, offset))
- zswap_entry_put(tree, entry);
+ if (entry == zswap_search(&tree->head, offset))
+ zswap_entry_put(&tree->head, entry);
spin_unlock(&tree->lock);

goto end;
@@ -934,7 +906,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
*/
fail:
spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
+ zswap_entry_put(&tree->head, entry);
spin_unlock(&tree->lock);

end:
@@ -988,7 +960,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
struct page *page)
{
struct zswap_tree *tree = zswap_trees[type];
- struct zswap_entry *entry, *dupentry;
+ struct zswap_entry *entry;
struct crypto_comp *tfm;
int ret;
unsigned int hlen, dlen = PAGE_SIZE;
@@ -1096,16 +1068,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
insert_entry:
/* map */
spin_lock(&tree->lock);
- do {
- ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
- if (ret == -EEXIST) {
- zswap_duplicate_entry++;
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, dupentry);
- zswap_entry_put(tree, dupentry);
- }
- } while (ret == -EEXIST);
+ ret = zswap_insert_or_replace(&tree->head, entry);
spin_unlock(&tree->lock);
+ if (ret < 0) {
+ zswap_reject_alloc_fail++;
+ goto freepage;
+ }

/* update stats */
atomic_inc(&zswap_stored_pages);
@@ -1138,7 +1106,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,

/* find */
spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_entry_find_get(&tree->head, offset);
if (!entry) {
/* entry was written back */
spin_unlock(&tree->lock);
@@ -1168,7 +1136,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,

freeentry:
spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
+ zswap_entry_put(&tree->head, entry);
spin_unlock(&tree->lock);

return 0;
@@ -1182,36 +1150,41 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)

/* find */
spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
+ entry = zswap_search(&tree->head, offset);
if (!entry) {
/* entry was written back */
spin_unlock(&tree->lock);
return;
}

- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, entry);
+ /* remove from tree */
+ zswap_erase(&tree->head, entry);

/* drop the initial reference from entry creation */
- zswap_entry_put(tree, entry);
+ zswap_entry_put(&tree->head, entry);

spin_unlock(&tree->lock);
}

+static void do_free_entry(void *elem, unsigned long opaque, unsigned long *key,
+ size_t index, void *func2)
+{
+ struct zswap_entry *entry = elem;
+ zswap_free_entry(entry);
+}
+
/* frees all zswap entries for the given swap type */
static void zswap_frontswap_invalidate_area(unsigned type)
{
struct zswap_tree *tree = zswap_trees[type];
- struct zswap_entry *entry, *n;

if (!tree)
return;

/* walk the tree and free everything */
spin_lock(&tree->lock);
- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(entry);
- tree->rbroot = RB_ROOT;
+ btree_visitor(&tree->head, btree_pgofft_geo, 0, do_free_entry, NULL);
+ btree_destroy(&tree->head);
spin_unlock(&tree->lock);
kfree(tree);
zswap_trees[type] = NULL;
@@ -1226,8 +1199,11 @@ static void zswap_frontswap_init(unsigned type)
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
return;
}
-
- tree->rbroot = RB_ROOT;
+ if (btree_init(&tree->head) < 0) {
+ pr_err("couldn't init the tree head\n");
+ kfree(tree);
+ return;
+ }
spin_lock_init(&tree->lock);
zswap_trees[type] = tree;
}
@@ -1302,6 +1278,11 @@ static int __init init_zswap(void)

zswap_init_started = true;

+ if (sizeof(pgoff_t) == 8)
+ btree_pgofft_geo = &btree_geo64;
+ else
+ btree_pgofft_geo = &btree_geo32;
+
if (zswap_entry_cache_create()) {
pr_err("entry cache creation failed\n");
goto cache_fail;
--
2.17.1


2019-11-18 19:06:12

by Jörn Engel

Subject: Re: [PATCH] zswap: use B-tree for search

On Sun, Nov 17, 2019 at 08:53:32PM +0200, [email protected] wrote:
> From: Vitaly Wool <[email protected]>
>
> The current zswap implementation uses red-black trees to store
> entries and to perform lookups. Although this algorithm has O(log N)
> complexity, completing a lookup (or, even worse, a replacement) of an
> entry still takes a while when the number of entries is large (100K+).
>
> B-trees are known to handle such cases more efficiently (i.e. also
> with O(log N) complexity but with a much lower constant factor), so
> trying zswap with B-trees was worth a shot.
>
> The B-tree implementation currently present in the Linux kernel
> doesn't do things in the best possible way (e.g. it uses recursion),
> but the testing I've run still shows a very significant performance
> increase.
>
> The B-tree usage pattern here doesn't exactly follow the guidelines,
> but that is because pgoff_t may be either 32 or 64 bits long.
>
> Tested on qemu-kvm (-smp 2 -m 1024) with zswap in the following
> configuration:
> * zpool: z3fold
> * max_pool_percent: 100
> and a swap size of 1G.
>
> Test command:
> $ stress-ng --io 4 --vm 4 --vm-bytes 1000M --timeout 300s --metrics
>
> This, averaged over 20 runs on qemu-kvm (-smp 2 -m 1024), gives the
> following io bogo ops:
> * original: 73778.8
> * btree: 393999

Impressive results. Was your test done with a 32-bit guest? If yes, I
would expect results for a 64-bit guest to drop to about 330k.

> + if (sizeof(pgoff_t) == 8)
> + btree_pgofft_geo = &btree_geo64;
> + else
> + btree_pgofft_geo = &btree_geo32;
> +

You could abuse the fact that pgoff_t is the same size as unsigned long
and use the "l" suffix variant. But apart from the obvious abuse, the
"l" variant hasn't been used before and the implementation appears to be
buggy.

So no complaints about your use of the interface.

Jörn

--
Cryptographic protocols should not be designed by a committee.
-- Niels Ferguson & Bruce Schneier

2019-11-18 22:01:42

by Vitaly Wool

Subject: Re: [PATCH] zswap: use B-tree for search

On Mon, Nov 18, 2019 at 8:04 PM Jörn Engel <[email protected]> wrote:
>
> On Sun, Nov 17, 2019 at 08:53:32PM +0200, [email protected] wrote:
> > From: Vitaly Wool <[email protected]>
> >
> > The current zswap implementation uses red-black trees to store
> > entries and to perform lookups. Although this algorithm has O(log N)
> > complexity, completing a lookup (or, even worse, a replacement) of an
> > entry still takes a while when the number of entries is large (100K+).
> >
> > B-trees are known to handle such cases more efficiently (i.e. also
> > with O(log N) complexity but with a much lower constant factor), so
> > trying zswap with B-trees was worth a shot.
> >
> > The B-tree implementation currently present in the Linux kernel
> > doesn't do things in the best possible way (e.g. it uses recursion),
> > but the testing I've run still shows a very significant performance
> > increase.
> >
> > The B-tree usage pattern here doesn't exactly follow the guidelines,
> > but that is because pgoff_t may be either 32 or 64 bits long.
> >
> > Tested on qemu-kvm (-smp 2 -m 1024) with zswap in the following
> > configuration:
> > * zpool: z3fold
> > * max_pool_percent: 100
> > and a swap size of 1G.
> >
> > Test command:
> > $ stress-ng --io 4 --vm 4 --vm-bytes 1000M --timeout 300s --metrics
> >
> > This, averaged over 20 runs on qemu-kvm (-smp 2 -m 1024), gives the
> > following io bogo ops:
> > * original: 73778.8
> > * btree: 393999
>
> Impressive results. Was your test done with a 32-bit guest? If yes, I
> would expect results for a 64-bit guest to drop to about 330k.

No, it's on a 64-bit virtual machine. I take it this improvement is
partially due to the zswap_insert_or_replace() function, which requires
fewer lookups than the initial implementation, but it's the btree API
that made it possible.
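
To illustrate (both snippets condensed from the hunks above, not new
code): the rbtree path has to repeat the full insert descent after
each duplicate it finds, while the btree path removes duplicates
directly and descends for the actual insert only once:

    /* old: every -EEXIST forces another full insert descent */
    do {
        ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
        if (ret == -EEXIST) {
            zswap_rb_erase(&tree->rbroot, dupentry);
            zswap_entry_put(tree, dupentry);
        }
    } while (ret == -EEXIST);

    /* new: one btree_remove() per duplicate, then a single insert */
    do {
        old = btree_remove(head, btree_pgofft_geo, &entry->offset);
        if (old)
            zswap_entry_put(head, old);
    } while (old);
    ret = btree_insert(head, btree_pgofft_geo, &entry->offset, entry,
                       GFP_ATOMIC);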

> > + if (sizeof(pgoff_t) == 8)
> > + btree_pgofft_geo = &btree_geo64;
> > + else
> > + btree_pgofft_geo = &btree_geo32;
> > +
>
> You could abuse the fact that pgoff_t is the same size as unsigned long
> and use the "l" suffix variant. But apart from the obvious abuse, the
> "l" variant hasn't been used before and the implementation appears to be
> buggy.
>
> So no complaints about your use of the interface.

Thanks! I'll keep it as is then, and make it a task for myself to try
out and possibly debug the "l" suffix variant later on.

~Vitaly