2024-01-18 03:06:08

by Chris Li

[permalink] [raw]
Subject: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

The RB tree shows some contribution to the swap fault
long tail latency due to two factors:
1) RB tree requires re-balance from time to time.
2) The zswap RB tree has a tree level spin lock protecting
the tree access.

The swap cache uses an xarray. A breakdown of the swap
cache access does not show the same long tail latency as
the zswap RB tree.

Moving the zswap entries to an xarray lets the read side
take only the RCU read lock.

The first patch adds the xarray alongside the RB tree.
There are some debug checks asserting that the xarray
agrees with the RB tree results.

The second patch removes the zswap RB tree.

I expect to merge the zswap rb tree spin lock with the xarray
lock in the follow up changes.

I can surely use some help in reviewing and testing.

Signed-off-by: Chris Li <[email protected]>
---
Chris Li (2):
mm: zswap.c: add xarray tree to zswap
mm: zswap.c: remove RB tree

mm/zswap.c | 120 ++++++++++++++++++++++++++++++-------------------------------
1 file changed, 59 insertions(+), 61 deletions(-)
---
base-commit: d7ba3d7c3bf13e2faf419cce9e9bdfc3a1a50905
change-id: 20240104-zswap-xarray-716260e541e3

Best regards,
--
Chris Li <[email protected]>



2024-01-18 03:06:23

by Chris Li

[permalink] [raw]
Subject: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

The xarray is added alongside the zswap RB tree. Debug
checks assert that the xarray returns the same results as
the RB tree operations.

Rename the zswap RB tree functions to more generic function
names without the RB part.
---
mm/zswap.c | 60 ++++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index f8bc9e089268..a40b0076722b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -235,6 +235,7 @@ struct zswap_entry {
*/
struct zswap_tree {
struct rb_root rbroot;
+ struct xarray xarray;
spinlock_t lock;
};

@@ -462,9 +463,9 @@ static void zswap_lru_putback(struct list_lru *list_lru,
/*********************************
* rbtree functions
**********************************/
-static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
+static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
{
- struct rb_node *node = root->rb_node;
+ struct rb_node *node = tree->rbroot.rb_node;
struct zswap_entry *entry;
pgoff_t entry_offset;

@@ -475,8 +476,12 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
node = node->rb_left;
else if (entry_offset < offset)
node = node->rb_right;
- else
+ else {
+ struct zswap_entry *e = xa_load(&tree->xarray, offset);
+
+ BUG_ON(entry != e);
return entry;
+ }
}
return NULL;
}
@@ -485,13 +490,15 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
* In the case that a entry with the same offset is found, a pointer to
* the existing entry is stored in dupentry and the function returns -EEXIST
*/
-static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
+static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
struct zswap_entry **dupentry)
{
+ struct rb_root *root = &tree->rbroot;
struct rb_node **link = &root->rb_node, *parent = NULL;
- struct zswap_entry *myentry;
+ struct zswap_entry *myentry, *old;
pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);

+
while (*link) {
parent = *link;
myentry = rb_entry(parent, struct zswap_entry, rbnode);
@@ -501,19 +508,26 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
else if (myentry_offset < entry_offset)
link = &(*link)->rb_right;
else {
+ old = xa_load(&tree->xarray, entry_offset);
+ BUG_ON(old != myentry);
*dupentry = myentry;
return -EEXIST;
}
}
rb_link_node(&entry->rbnode, parent, link);
rb_insert_color(&entry->rbnode, root);
+ old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
return 0;
}

-static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
{
+ pgoff_t offset = swp_offset(entry->swpentry);
if (!RB_EMPTY_NODE(&entry->rbnode)) {
- rb_erase(&entry->rbnode, root);
+ struct zswap_entry *old;
+ old = xa_erase(&tree->xarray, offset);
+ BUG_ON(old != entry);
+ rb_erase(&entry->rbnode, &tree->rbroot);
RB_CLEAR_NODE(&entry->rbnode);
return true;
}
@@ -575,12 +589,12 @@ static void zswap_entry_put(struct zswap_tree *tree,
}

/* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
+static struct zswap_entry *zswap_entry_find_get(struct zswap_tree *tree,
pgoff_t offset)
{
struct zswap_entry *entry;

- entry = zswap_rb_search(root, offset);
+ entry = zswap_search(tree, offset);
if (entry)
zswap_entry_get(entry);

@@ -845,7 +859,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
static void zswap_invalidate_entry(struct zswap_tree *tree,
struct zswap_entry *entry)
{
- if (zswap_rb_erase(&tree->rbroot, entry))
+ if (zswap_erase(tree, entry))
zswap_entry_put(tree, entry);
}

@@ -875,7 +889,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o

/* Check for invalidate() race */
spin_lock(&tree->lock);
- if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
+ if (entry != zswap_search(tree, swpoffset))
goto unlock;

/* Hold a reference to prevent a free during writeback */
@@ -1407,6 +1421,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct zswap_tree *tree)
{
swp_entry_t swpentry = entry->swpentry;
+ pgoff_t offset = swp_offset(swpentry);
+ struct zswap_entry *e;
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
@@ -1439,7 +1455,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* avoid overwriting a new swap folio with old compressed data.
*/
spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
+ e = zswap_search(tree, offset);
+ if (e != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
return -ENOMEM;
@@ -1528,7 +1545,7 @@ bool zswap_store(struct folio *folio)
* the tree, and it might be written back overriding the new data.
*/
spin_lock(&tree->lock);
- dupentry = zswap_rb_search(&tree->rbroot, offset);
+ dupentry = zswap_search(tree, offset);
if (dupentry) {
zswap_duplicate_entry++;
zswap_invalidate_entry(tree, dupentry);
@@ -1671,7 +1688,7 @@ bool zswap_store(struct folio *folio)
* found again here it means that something went wrong in the swap
* cache.
*/
- while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
+ while (zswap_insert(tree, entry, &dupentry) == -EEXIST) {
WARN_ON(1);
zswap_duplicate_entry++;
zswap_invalidate_entry(tree, dupentry);
@@ -1722,7 +1739,7 @@ bool zswap_load(struct folio *folio)

/* find */
spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_entry_find_get(tree, offset);
if (!entry) {
spin_unlock(&tree->lock);
return false;
@@ -1762,7 +1779,7 @@ void zswap_invalidate(int type, pgoff_t offset)

/* find */
spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
+ entry = zswap_search(tree, offset);
if (!entry) {
/* entry was written back */
spin_unlock(&tree->lock);
@@ -1783,6 +1800,7 @@ void zswap_swapon(int type)
}

tree->rbroot = RB_ROOT;
+ xa_init(&tree->xarray);
spin_lock_init(&tree->lock);
zswap_trees[type] = tree;
}
@@ -1790,15 +1808,21 @@ void zswap_swapon(int type)
void zswap_swapoff(int type)
{
struct zswap_tree *tree = zswap_trees[type];
- struct zswap_entry *entry, *n;
+ struct zswap_entry *entry, *e, *n;
+ XA_STATE(xas, tree ? &tree->xarray : NULL, 0);

if (!tree)
return;

/* walk the tree and free everything */
spin_lock(&tree->lock);
+
+ xas_for_each(&xas, e, ULONG_MAX)
+ zswap_invalidate_entry(tree, e);
+
rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(entry);
+ BUG_ON(entry);
+
tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);
kfree(tree);

--
2.43.0.429.g432eaa2c6b-goog


2024-01-18 03:06:27

by Chris Li

[permalink] [raw]
Subject: [PATCH 2/2] mm: zswap.c: remove RB tree

Remove the RB tree code and the RB tree data structure
from zswap.

The xarray insert and erase code has been updated to
use the XAS version of the API, so that the lookup is
cached before the final xarray store.

The zswap tree spinlock hasn't been removed yet because it
still protects state other than the tree itself. The zswap
xarray functions themselves work fine with the xarray's
internal lock and RCU, without the zswap tree lock.

This removes the RB node inside the zswap entry, saving
about three pointers' worth of size per entry. Considering
the extra overhead of the xarray's internal nodes, this
should still be a net memory saving for zswap if the index
space is dense.

The zswap tree spin lock is still there to protect the
zswap entries. A follow-up change is expected to merge
that spin lock with the xarray lock.
---
mm/zswap.c | 98 +++++++++++++++++++++++---------------------------------------
1 file changed, 36 insertions(+), 62 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index a40b0076722b..555d5608d401 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -197,7 +197,6 @@ struct zswap_pool {
* This structure contains the metadata for tracking a single compressed
* page within zswap.
*
- * rbnode - links the entry into red-black tree for the appropriate swap type
* swpentry - associated swap entry, the offset indexes into the red-black tree
* refcount - the number of outstanding reference to the entry. This is needed
* to protect against premature freeing of the entry by code
@@ -215,7 +214,6 @@ struct zswap_pool {
* lru - handle to the pool's lru used to evict pages.
*/
struct zswap_entry {
- struct rb_node rbnode;
swp_entry_t swpentry;
int refcount;
unsigned int length;
@@ -234,7 +232,6 @@ struct zswap_entry {
* - the refcount field of each entry in the tree
*/
struct zswap_tree {
- struct rb_root rbroot;
struct xarray xarray;
spinlock_t lock;
};
@@ -357,7 +354,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
if (!entry)
return NULL;
entry->refcount = 1;
- RB_CLEAR_NODE(&entry->rbnode);
return entry;
}

@@ -465,25 +461,7 @@ static void zswap_lru_putback(struct list_lru *list_lru,
**********************************/
static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
{
- struct rb_node *node = tree->rbroot.rb_node;
- struct zswap_entry *entry;
- pgoff_t entry_offset;
-
- while (node) {
- entry = rb_entry(node, struct zswap_entry, rbnode);
- entry_offset = swp_offset(entry->swpentry);
- if (entry_offset > offset)
- node = node->rb_left;
- else if (entry_offset < offset)
- node = node->rb_right;
- else {
- struct zswap_entry *e = xa_load(&tree->xarray, offset);
-
- BUG_ON(entry != e);
- return entry;
- }
- }
- return NULL;
+ return xa_load(&tree->xarray, offset);
}

/*
@@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
struct zswap_entry **dupentry)
{
- struct rb_root *root = &tree->rbroot;
- struct rb_node **link = &root->rb_node, *parent = NULL;
- struct zswap_entry *myentry, *old;
- pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
-
-
- while (*link) {
- parent = *link;
- myentry = rb_entry(parent, struct zswap_entry, rbnode);
- myentry_offset = swp_offset(myentry->swpentry);
- if (myentry_offset > entry_offset)
- link = &(*link)->rb_left;
- else if (myentry_offset < entry_offset)
- link = &(*link)->rb_right;
- else {
- old = xa_load(&tree->xarray, entry_offset);
- BUG_ON(old != myentry);
- *dupentry = myentry;
+ struct zswap_entry *e;
+ pgoff_t offset = swp_offset(entry->swpentry);
+ XA_STATE(xas, &tree->xarray, offset);
+
+ do {
+ xas_lock_irq(&xas);
+ do {
+ e = xas_load(&xas);
+ if (xa_is_zero(e))
+ e = NULL;
+ } while (xas_retry(&xas, e));
+ if (xas_valid(&xas) && e) {
+ xas_unlock_irq(&xas);
+ *dupentry = e;
return -EEXIST;
}
- }
- rb_link_node(&entry->rbnode, parent, link);
- rb_insert_color(&entry->rbnode, root);
- old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
- return 0;
+ xas_store(&xas, entry);
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+ return xas_error(&xas);
}

static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
{
+ struct zswap_entry *e;
pgoff_t offset = swp_offset(entry->swpentry);
- if (!RB_EMPTY_NODE(&entry->rbnode)) {
- struct zswap_entry *old;
- old = xa_erase(&tree->xarray, offset);
- BUG_ON(old != entry);
- rb_erase(&entry->rbnode, &tree->rbroot);
- RB_CLEAR_NODE(&entry->rbnode);
- return true;
- }
- return false;
+ XA_STATE(xas, &tree->xarray, offset);
+
+ do {
+ xas_lock_irq(&xas);
+ do {
+ e = xas_load(&xas);
+ } while (xas_retry(&xas, e));
+ if (xas_valid(&xas) && e != entry) {
+ xas_unlock_irq(&xas);
+ return false;
+ }
+ xas_store(&xas, NULL);
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+ return !xas_error(&xas);
}

static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
@@ -583,7 +563,6 @@ static void zswap_entry_put(struct zswap_tree *tree,

WARN_ON_ONCE(refcount < 0);
if (refcount == 0) {
- WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
zswap_free_entry(entry);
}
}
@@ -1799,7 +1778,6 @@ void zswap_swapon(int type)
return;
}

- tree->rbroot = RB_ROOT;
xa_init(&tree->xarray);
spin_lock_init(&tree->lock);
zswap_trees[type] = tree;
@@ -1808,7 +1786,7 @@ void zswap_swapon(int type)
void zswap_swapoff(int type)
{
struct zswap_tree *tree = zswap_trees[type];
- struct zswap_entry *entry, *e, *n;
+ struct zswap_entry *e;
XA_STATE(xas, tree ? &tree->xarray : NULL, 0);

if (!tree)
@@ -1820,10 +1798,6 @@ void zswap_swapoff(int type)
xas_for_each(&xas, e, ULONG_MAX)
zswap_invalidate_entry(tree, e);

- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- BUG_ON(entry);
-
- tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);
kfree(tree);
zswap_trees[type] = NULL;

--
2.43.0.429.g432eaa2c6b-goog


2024-01-18 06:02:37

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

That's a long CC list for sure :)

On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
>
> The RB tree shows some contribution to the swap fault
> long tail latency due to two factors:
> 1) RB tree requires re-balance from time to time.
> 2) The zswap RB tree has a tree level spin lock protecting
> the tree access.
>
> The swap cache uses an xarray. A breakdown of the swap
> cache access does not show the same long tail latency as
> the zswap RB tree.

I think the comparison to the swap cache may not be valid as the swap
cache has many trees per swapfile, while zswap has a single tree.

>
> Moving the zswap entries to an xarray lets the read side
> take only the RCU read lock.

Nice.

>
> The first patch adds the xarray alongside the RB tree.
> There are some debug checks asserting that the xarray
> agrees with the RB tree results.
>
> The second patch removes the zswap RB tree.

The breakdown looks like something that would be a development step,
but for patch submission I think it makes more sense to have a single
patch replacing the rbtree with an xarray.

>
> I expect to merge the zswap rb tree spin lock with the xarray
> lock in the follow up changes.

Shouldn't this simply be changing uses of tree->lock to use
xa_{lock/unlock}? We also need to make sure we don't try to lock the
tree when operating on the xarray if the caller is already holding the
lock, but this seems to be straightforward enough to be done as part
of this patch or this series at least.
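
For illustration, what I have in mind is roughly this (a sketch only, not
tested, reusing the field names from this series): callers that currently
do

    spin_lock(&tree->lock);
    entry = zswap_search(tree, offset);
    ...
    spin_unlock(&tree->lock);

would become

    xa_lock(&tree->xarray);
    entry = zswap_search(tree, offset);
    ...
    xa_unlock(&tree->xarray);

with any xarray modification done while holding that lock switched to the
__xa_* variants (e.g. __xa_erase()), which expect the lock to already be
held.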

Am I missing something?

>
> I can surely use some help in reviewing and testing.
>
> Signed-off-by: Chris Li <[email protected]>
> ---
> Chris Li (2):
> mm: zswap.c: add xarray tree to zswap
> mm: zswap.c: remove RB tree
>
> mm/zswap.c | 120 ++++++++++++++++++++++++++++++-------------------------------
> 1 file changed, 59 insertions(+), 61 deletions(-)
> ---
> base-commit: d7ba3d7c3bf13e2faf419cce9e9bdfc3a1a50905
> change-id: 20240104-zswap-xarray-716260e541e3
>
> Best regards,
> --
> Chris Li <[email protected]>
>

2024-01-18 06:21:17

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
>
> The xarray is added alongside the zswap RB tree. Debug
> checks assert that the xarray returns the same results as
> the RB tree operations.
>
> Rename the zswap RB tree functions to more generic function
> names without the RB part.

As I mentioned in the cover letter, I believe this should be squashed
into the second patch. I have some comments below as well on the parts
that should remain after the squash.

[..]
>
> @@ -462,9 +463,9 @@ static void zswap_lru_putback(struct list_lru *list_lru,
> /*********************************
> * rbtree functions
> **********************************/
> -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
> +static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)

Let's change the zswap_rb_* prefixes to zswap_tree_* instead of just
zswap_*. Otherwise, it will be confusing to have both zswap_store and
zswap_insert (as well as zswap_load and zswap_search).

[..]
> @@ -1790,15 +1808,21 @@ void zswap_swapon(int type)
> void zswap_swapoff(int type)
> {
> struct zswap_tree *tree = zswap_trees[type];
> - struct zswap_entry *entry, *n;
> + struct zswap_entry *entry, *e, *n;
> + XA_STATE(xas, tree ? &tree->xarray : NULL, 0);
>
> if (!tree)
> return;
>
> /* walk the tree and free everything */
> spin_lock(&tree->lock);
> +
> + xas_for_each(&xas, e, ULONG_MAX)

Why not use xa_for_each?

> + zswap_invalidate_entry(tree, e);
> +
> rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
> - zswap_free_entry(entry);

Replacing zswap_free_entry() with zswap_invalidate_entry() is a
behavioral change that should be done separately from this series, but I
am wondering why it's needed. IIUC, the swapoff code should be making
sure there are no ongoing swapin/swapout operations, and there are no
pages left in zswap to writeback.

Is it the case that swapoff may race with writeback, such that
writeback is holding the last remaining ref after zswap_invalidate()
is called, and then zswap_swapoff() is called freeing the zswap entry
while writeback is still accessing it?

2024-01-18 06:36:03

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: zswap.c: remove RB tree

> @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> struct zswap_entry **dupentry)
> {
> - struct rb_root *root = &tree->rbroot;
> - struct rb_node **link = &root->rb_node, *parent = NULL;
> - struct zswap_entry *myentry, *old;
> - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> -
> -
> - while (*link) {
> - parent = *link;
> - myentry = rb_entry(parent, struct zswap_entry, rbnode);
> - myentry_offset = swp_offset(myentry->swpentry);
> - if (myentry_offset > entry_offset)
> - link = &(*link)->rb_left;
> - else if (myentry_offset < entry_offset)
> - link = &(*link)->rb_right;
> - else {
> - old = xa_load(&tree->xarray, entry_offset);
> - BUG_ON(old != myentry);
> - *dupentry = myentry;
> + struct zswap_entry *e;
> + pgoff_t offset = swp_offset(entry->swpentry);
> + XA_STATE(xas, &tree->xarray, offset);
> +
> + do {
> + xas_lock_irq(&xas);
> + do {
> + e = xas_load(&xas);
> + if (xa_is_zero(e))
> + e = NULL;
> + } while (xas_retry(&xas, e));
> + if (xas_valid(&xas) && e) {
> + xas_unlock_irq(&xas);
> + *dupentry = e;
> return -EEXIST;
> }
> - }
> - rb_link_node(&entry->rbnode, parent, link);
> - rb_insert_color(&entry->rbnode, root);
> - old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> - return 0;
> + xas_store(&xas, entry);
> + xas_unlock_irq(&xas);
> + } while (xas_nomem(&xas, GFP_KERNEL));
> + return xas_error(&xas);

I think using the xas_* APIs can be avoided here. The only reason we
need it is that we want to check if there's an existing entry first,
and return -EEXIST. However, in that case, the caller will replace it
anyway (and do some operations on the dupentry):

while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
    WARN_ON(1);
    zswap_duplicate_entry++;
    zswap_invalidate_entry(tree, dupentry);
}

So I think we can do something like this in zswap_insert() instead:

dupentry = xa_store(..);
if (WARN_ON(dupentry)) {
    zswap_duplicate_entry++;
    zswap_invalidate_entry(tree, dupentry);
}

WDYT?
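
For illustration, a rough sketch of the whole thing (the error handling is
my guess here; xa_store() can also return an xa_err() encoded pointer on
allocation failure):

    dupentry = xa_store(&tree->xarray, offset, entry, GFP_KERNEL);
    if (xa_is_err(dupentry))
        return xa_err(dupentry);
    if (WARN_ON(dupentry)) {
        zswap_duplicate_entry++;
        zswap_invalidate_entry(tree, dupentry);
    }
    return 0;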

> }
>
> static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> {
> + struct zswap_entry *e;
> pgoff_t offset = swp_offset(entry->swpentry);
> - if (!RB_EMPTY_NODE(&entry->rbnode)) {
> - struct zswap_entry *old;
> - old = xa_erase(&tree->xarray, offset);
> - BUG_ON(old != entry);
> - rb_erase(&entry->rbnode, &tree->rbroot);
> - RB_CLEAR_NODE(&entry->rbnode);
> - return true;
> - }
> - return false;
> + XA_STATE(xas, &tree->xarray, offset);
> +
> + do {
> + xas_lock_irq(&xas);
> + do {
> + e = xas_load(&xas);
> + } while (xas_retry(&xas, e));
> + if (xas_valid(&xas) && e != entry) {
> + xas_unlock_irq(&xas);
> + return false;
> + }
> + xas_store(&xas, NULL);
> + xas_unlock_irq(&xas);
> + } while (xas_nomem(&xas, GFP_KERNEL));
> + return !xas_error(&xas);
> }

Same here, I think we just want:

return !!xa_erase(..);
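
That is, roughly (a sketch; note this drops the e != entry check that the
code above does before erasing):

    return !!xa_erase(&tree->xarray, swp_offset(entry->swpentry));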

>
> static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> @@ -583,7 +563,6 @@ static void zswap_entry_put(struct zswap_tree *tree,
>
> WARN_ON_ONCE(refcount < 0);
> if (refcount == 0) {
> - WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
> zswap_free_entry(entry);
> }

nit: the braces are no longer needed here

2024-01-18 06:40:33

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Wed, Jan 17, 2024 at 10:01 PM Yosry Ahmed <[email protected]> wrote:
>
> That's a long CC list for sure :)
>
> On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
> >
> > The RB tree shows some contribution to the swap fault
> > long tail latency due to two factors:
> > 1) RB tree requires re-balance from time to time.
> > 2) The zswap RB tree has a tree level spin lock protecting
> > the tree access.
> >
> > The swap cache uses an xarray. A breakdown of the swap
> > cache access does not show the same long tail latency as
> > the zswap RB tree.
>
> I think the comparison to the swap cache may not be valid as the swap
> cache has many trees per swapfile, while zswap has a single tree.
>
> >
> > Moving the zswap entries to an xarray lets the read side
> > take only the RCU read lock.
>
> Nice.
>
> >
> > The first patch adds the xarray alongside the RB tree.
> > There are some debug checks asserting that the xarray
> > agrees with the RB tree results.
> >
> > The second patch removes the zswap RB tree.
>
> The breakdown looks like something that would be a development step,
> but for patch submission I think it makes more sense to have a single
> patch replacing the rbtree with an xarray.
>
> >
> > I expect to merge the zswap rb tree spin lock with the xarray
> > lock in the follow up changes.
>
> Shouldn't this simply be changing uses of tree->lock to use
> xa_{lock/unlock}? We also need to make sure we don't try to lock the
> tree when operating on the xarray if the caller is already holding the
> lock, but this seems to be straightforward enough to be done as part
> of this patch or this series at least.
>
> Am I missing something?

Also, I assume we will only see performance improvements after the
tree lock in its current form is removed so that we get loads
protected only by RCU. Can we get some performance numbers to see how
the latency improves with the xarray under contention (unless
Chengming is already planning on testing this for his multi-tree
patches).

2024-01-18 06:48:37

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Wed, Jan 17, 2024 at 10:02 PM Yosry Ahmed <[email protected]> wrote:
>
> That's a long CC list for sure :)
>
> On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
> >
> > The RB tree shows some contribution to the swap fault
> > long tail latency due to two factors:
> > 1) RB tree requires re-balance from time to time.
> > 2) The zswap RB tree has a tree level spin lock protecting
> > the tree access.
> >
> > The swap cache uses an xarray. A breakdown of the swap
> > cache access does not show the same long tail latency as
> > the zswap RB tree.
>
> I think the comparison to the swap cache may not be valid as the swap
> cache has many trees per swapfile, while zswap has a single tree.

Yes, good point. I think we can benchmark the xarray zswap vs the RB
tree zswap; that would be more of a direct comparison.

> > Moving the zswap entries to an xarray lets the read side
> > take only the RCU read lock.
>
> Nice.
>
> >
> > The first patch adds the xarray alongside the RB tree.
> > There are some debug checks asserting that the xarray
> > agrees with the RB tree results.
> >
> > The second patch removes the zswap RB tree.
>
> The breakdown looks like something that would be a development step,
> but for patch submission I think it makes more sense to have a single
> patch replacing the rbtree with an xarray.

I think it makes the review easier. The code being added and removed does
not have much overlap. Combining it into a single patch does not save
patch size. Having the assert check would be useful for some bisecting
to narrow down which step caused the problem. I am fine with squashing
it into one patch as well.
>
> >
> > I expect to merge the zswap rb tree spin lock with the xarray
> > lock in the follow up changes.
>
> Shouldn't this simply be changing uses of tree->lock to use
> xa_{lock/unlock}? We also need to make sure we don't try to lock the
> tree when operating on the xarray if the caller is already holding the
> lock, but this seems to be straightforward enough to be done as part
> of this patch or this series at least.
>
> Am I missing something?

Currently the zswap entry refcount is protected by the zswap tree spin
lock as well. Can't remove the tree spin lock without changing the
refcount code. I think the zswap search entry should just return the
entry with refcount atomic increase, inside the RCU read() or xarray
lock. The previous zswap code does the find_and_get entry() which is
closer to what I want.

Chris

2024-01-18 06:57:58

by Chengming Zhou

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

Hi Yosry and Chris,

On 2024/1/18 14:39, Yosry Ahmed wrote:
> On Wed, Jan 17, 2024 at 10:01 PM Yosry Ahmed <[email protected]> wrote:
>>
>> That's a long CC list for sure :)
>>
>> On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
>>>
>>> The RB tree shows some contribution to the swap fault
>>> long tail latency due to two factors:
>>> 1) RB tree requires re-balance from time to time.
>>> 2) The zswap RB tree has a tree level spin lock protecting
>>> the tree access.
>>>
>>> The swap cache uses an xarray. A breakdown of the swap
>>> cache access does not show the same long tail latency as
>>> the zswap RB tree.
>>
>> I think the comparison to the swap cache may not be valid as the swap
>> cache has many trees per swapfile, while zswap has a single tree.
>>
>>>
>>> Moving the zswap entries to an xarray lets the read side
>>> take only the RCU read lock.
>>
>> Nice.
>>
>>>
>>> The first patch adds the xarray alongside the RB tree.
>>> There are some debug checks asserting that the xarray
>>> agrees with the RB tree results.
>>>
>>> The second patch removes the zswap RB tree.
>>
>> The breakdown looks like something that would be a development step,
>> but for patch submission I think it makes more sense to have a single
>> patch replacing the rbtree with an xarray.
>>
>>>
>>> I expect to merge the zswap rb tree spin lock with the xarray
>>> lock in the follow up changes.
>>
>> Shouldn't this simply be changing uses of tree->lock to use
>> xa_{lock/unlock}? We also need to make sure we don't try to lock the
>> tree when operating on the xarray if the caller is already holding the
>> lock, but this seems to be straightforward enough to be done as part
>> of this patch or this series at least.
>>
>> Am I missing something?
>
> Also, I assume we will only see performance improvements after the
> tree lock in its current form is removed so that we get loads
> protected only by RCU. Can we get some performance numbers to see how
> the latency improves with the xarray under contention (unless
> Chengming is already planning on testing this for his multi-tree
> patches).

I just gave it a try, the same test of kernel build in tmpfs with zswap
shrinker enabled, all based on the latest mm/mm-stable branch.

         mm-stable     zswap-split-tree   zswap-xarray
real     1m10.442s     1m4.157s           1m9.962s
user     17m48.232s    17m41.477s         17m45.887s
sys      8m13.517s     5m2.226s           7m59.305s

Looks like the contention of concurrency is still there, I haven't
looked into the code yet, will review it later.


2024-01-18 07:06:47

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

The name changes from Chris to Christopher are confusing :D

>
> I think it makes the review easier. The code being added and removed does
> not have much overlap. Combining it into a single patch does not save
> patch size. Having the assert check would be useful for some bisecting
> to narrow down which step caused the problem. I am fine with squashing
> it into one patch as well.

I think having two patches is unnecessarily noisy, and we add some
debug code in this patch that we remove in the next patch anyway.
Let's see what others think, but personally I prefer a single patch.

> >
> > >
> > > I expect to merge the zswap rb tree spin lock with the xarray
> > > lock in the follow up changes.
> >
> > Shouldn't this simply be changing uses of tree->lock to use
> > xa_{lock/unlock}? We also need to make sure we don't try to lock the
> > tree when operating on the xarray if the caller is already holding the
> > lock, but this seems to be straightforward enough to be done as part
> > of this patch or this series at least.
> >
> > Am I missing something?
>
> Currently the zswap entry refcount is protected by the zswap tree spin
> lock as well. Can't remove the tree spin lock without changing the
> refcount code. I think the zswap search entry should just return the
> entry with refcount atomic increase, inside the RCU read() or xarray
> lock. The previous zswap code does the find_and_get entry() which is
> closer to what I want.

I think this can be done in an RCU read section surrounding xa_load()
and the refcount increment. Didn't look closely to check how much
complexity this adds to manage refcounts with RCU, but I think there
should be a lot of examples all around the kernel.
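
Something like the usual pattern, I mean (a rough sketch; this assumes the
entry refcount is converted to a refcount_t so refcount_inc_not_zero() can
be used, and that the actual free is deferred via RCU):

    rcu_read_lock();
    entry = xa_load(&tree->xarray, offset);
    /* only take a reference if the refcount has not already hit zero */
    if (entry && !refcount_inc_not_zero(&entry->refcount))
        entry = NULL;
    rcu_read_unlock();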

IIUC, there are no performance benefits from this conversion until we
remove the tree spinlock, right?

2024-01-18 07:20:01

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Wed, Jan 17, 2024 at 11:02 PM Yosry Ahmed <[email protected]> wrote:
>
> On Wed, Jan 17, 2024 at 10:57 PM Chengming Zhou
> <[email protected]> wrote:
> >
> > Hi Yosry and Chris,
> >
> > On 2024/1/18 14:39, Yosry Ahmed wrote:
> > > On Wed, Jan 17, 2024 at 10:01 PM Yosry Ahmed <[email protected]> wrote:
> > >>
> > >> That's a long CC list for sure :)
> > >>
> > >> On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
> > >>>
> > >>> The RB tree shows some contribution to the swap fault
> > >>> long tail latency due to two factors:
> > >>> 1) RB tree requires re-balance from time to time.
> > >>> 2) The zswap RB tree has a tree level spin lock protecting
> > >>> the tree access.
> > >>>
> > >>> The swap cache uses an xarray. A breakdown of the swap
> > >>> cache access does not show the same long tail latency as
> > >>> the zswap RB tree.
> > >>
> > >> I think the comparison to the swap cache may not be valid as the swap
> > >> cache has many trees per swapfile, while zswap has a single tree.
> > >>
> > >>>
> > >>> Moving the zswap entries to an xarray lets the read side
> > >>> take only the RCU read lock.
> > >>
> > >> Nice.
> > >>
> > >>>
> > >>> The first patch adds the xarray alongside the RB tree.
> > >>> There are some debug checks asserting that the xarray
> > >>> agrees with the RB tree results.
> > >>>
> > >>> The second patch removes the zswap RB tree.
> > >>
> > >> The breakdown looks like something that would be a development step,
> > >> but for patch submission I think it makes more sense to have a single
> > >> patch replacing the rbtree with an xarray.
> > >>
> > >>>
> > >>> I expect to merge the zswap rb tree spin lock with the xarray
> > >>> lock in the follow up changes.
> > >>
> > >> Shouldn't this simply be changing uses of tree->lock to use
> > >> xa_{lock/unlock}? We also need to make sure we don't try to lock the
> > >> tree when operating on the xarray if the caller is already holding the
> > >> lock, but this seems to be straightforward enough to be done as part
> > >> of this patch or this series at least.
> > >>
> > >> Am I missing something?
> > >
> > > Also, I assume we will only see performance improvements after the
> > > tree lock in its current form is removed so that we get loads
> > > protected only by RCU. Can we get some performance numbers to see how
> > > the latency improves with the xarray under contention (unless
> > > Chengming is already planning on testing this for his multi-tree
> > > patches).
> >
> > I just gave it a try, the same test of kernel build in tmpfs with zswap
> > shrinker enabled, all based on the latest mm/mm-stable branch.
> >
> > mm-stable zswap-split-tree zswap-xarray
> > real 1m10.442s 1m4.157s 1m9.962s
> > user 17m48.232s 17m41.477s 17m45.887s
> > sys 8m13.517s 5m2.226s 7m59.305s
> >
> > Looks like the contention of concurrency is still there, I haven't
> > looked into the code yet, will review it later.

Thanks for the quick test. Interesting to see the sys usage drop for
the xarray case even with the spin lock.
Not sure if the 13 second saving is statistically significant or not.

We might need to have both xarray and split trees for the zswap. It is
likely removing the spin lock wouldn't be able to make up the 35%
difference. That is just my guess. There is only one way to find out.

BTW, do you have a script I can run to replicate your results?

>
> I think that's expected with the current version because the tree
> spin_lock is still there and we are still doing lookups with a
> spinlock.

Right.

Chris

2024-01-18 07:28:41

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Wed, Jan 17, 2024 at 11:05 PM Yosry Ahmed <[email protected]> wrote:
>
> The name changes from Chris to Christopher are confusing :D
>
> >
> > I think it makes the review easier. The code being added and removed does
> > not have much overlap. Combining it into a single patch does not save
> > patch size. Having the assert check would be useful for some bisecting
> > to narrow down which step caused the problem. I am fine with squashing
> > it into one patch as well.
>
> I think having two patches is unnecessarily noisy, and we add some
> debug code in this patch that we remove in the next patch anyway.
> Let's see what others think, but personally I prefer a single patch.
>
> > >
> > > >
> > > > I expect to merge the zswap rb tree spin lock with the xarray
> > > > lock in the follow up changes.
> > >
> > > Shouldn't this simply be changing uses of tree->lock to use
> > > xa_{lock/unlock}? We also need to make sure we don't try to lock the
> > > tree when operating on the xarray if the caller is already holding the
> > > lock, but this seems to be straightforward enough to be done as part
> > > of this patch or this series at least.
> > >
> > > Am I missing something?
> >
> > Currently the zswap entry refcount is protected by the zswap tree spin
> > lock as well. Can't remove the tree spin lock without changing the
> > refcount code. I think the zswap search entry should just return the
> > entry with refcount atomic increase, inside the RCU read() or xarray
> > lock. The previous zswap code does the find_and_get entry() which is
> > closer to what I want.
>
> I think this can be done in an RCU read section surrounding xa_load()

xa_load() already has RCU read lock inside. If you do that you might
just as well use some XAS API to work with the lock directly.

> and the refcount increment. Didn't look closely to check how much
> complexity this adds to manage refcounts with RCU, but I think there
> should be a lot of examples all around the kernel.

The complexity is not adding the refcount inside xa_load(). It is on
the zswap code that calls zswap_search() and zswap_{insert,erase}().
As far as I can tell, those codes need some tricky changes to go along
with the refcount change.

>
> IIUC, there are no performance benefits from this conversion until we
> remove the tree spinlock, right?

The original intent is helping the long tail case. RB tree has worse
long tails than xarray. I expect it will help the page fault long tail
even without removing the tree spinlock.

Chris

2024-01-18 07:36:23

by Chengming Zhou

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On 2024/1/18 15:19, Chris Li wrote:
> On Wed, Jan 17, 2024 at 11:02 PM Yosry Ahmed <[email protected]> wrote:
>>
>> On Wed, Jan 17, 2024 at 10:57 PM Chengming Zhou
>> <[email protected]> wrote:
>>>
>>> Hi Yosry and Chris,
>>>
>>> On 2024/1/18 14:39, Yosry Ahmed wrote:
>>>> On Wed, Jan 17, 2024 at 10:01 PM Yosry Ahmed <[email protected]> wrote:
>>>>>
>>>>> That's a long CC list for sure :)
>>>>>
>>>>> On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
>>>>>>
>>>>>> The RB tree shows some contribution to the swap fault
>>>>>> long tail latency due to two factors:
>>>>>> 1) RB tree requires re-balance from time to time.
>>>>>> 2) The zswap RB tree has a tree level spin lock protecting
>>>>>> the tree access.
>>>>>>
>>>>>> The swap cache uses an xarray. A breakdown of the swap
>>>>>> cache access does not show the same long tail latency as
>>>>>> the zswap RB tree.
>>>>>
>>>>> I think the comparison to the swap cache may not be valid as the swap
>>>>> cache has many trees per swapfile, while zswap has a single tree.
>>>>>
>>>>>>
>>>>>> Moving the zswap entries to an xarray lets the read side
>>>>>> take only the RCU read lock.
>>>>>
>>>>> Nice.
>>>>>
>>>>>>
>>>>>> The first patch adds the xarray alongside the RB tree.
>>>>>> There are some debug checks asserting that the xarray
>>>>>> agrees with the RB tree results.
>>>>>>
>>>>>> The second patch removes the zswap RB tree.
>>>>>
>>>>> The breakdown looks like something that would be a development step,
>>>>> but for patch submission I think it makes more sense to have a single
>>>>> patch replacing the rbtree with an xarray.
>>>>>
>>>>>>
>>>>>> I expect to merge the zswap rb tree spin lock with the xarray
>>>>>> lock in the follow up changes.
>>>>>
>>>>> Shouldn't this simply be changing uses of tree->lock to use
>>>>> xa_{lock/unlock}? We also need to make sure we don't try to lock the
>>>>> tree when operating on the xarray if the caller is already holding the
>>>>> lock, but this seems to be straightforward enough to be done as part
>>>>> of this patch or this series at least.
>>>>>
>>>>> Am I missing something?
>>>>
>>>> Also, I assume we will only see performance improvements after the
>>>> tree lock in its current form is removed so that we get loads
>>>> protected only by RCU. Can we get some performance numbers to see how
>>>> the latency improves with the xarray under contention (unless
>>>> Chengming is already planning on testing this for his multi-tree
>>>> patches).
>>>
>>> I just gave it a try, the same test of kernel build in tmpfs with zswap
>>> shrinker enabled, all based on the latest mm/mm-stable branch.
>>>
>>> mm-stable zswap-split-tree zswap-xarray
>>> real 1m10.442s 1m4.157s 1m9.962s
>>> user 17m48.232s 17m41.477s 17m45.887s
>>> sys 8m13.517s 5m2.226s 7m59.305s
>>>
>>> Looks like the contention of concurrency is still there, I haven't
>>> looked into the code yet, will review it later.
>
> Thanks for the quick test. Interesting to see the sys usage drop for
> the xarray case even with the spin lock.
> Not sure if the 13 second saving is statistically significant or not.
>
> We might need to have both xarray and split trees for the zswap. It is
> likely removing the spin lock wouldn't be able to make up the 35%
> difference. That is just my guess. There is only one way to find out.

Yes, I totally agree with this! IMHO, concurrent zswap_store paths still
have to contend for the xarray spinlock even though we would have converted
the rb-tree to the xarray structure at last. So I think we should have both.

>
> BTW, do you have a script I can run to replicate your results?

```
#!/bin/bash

testname="build-kernel-tmpfs"
cgroup="/sys/fs/cgroup/$testname"

tmpdir="/tmp/vm-scalability-tmp"
workdir="$tmpdir/$testname"

memory_max="$((2 * 1024 * 1024 * 1024))"

linux_src="/root/zcm/linux-6.6.tar.xz"
NR_TASK=32

swapon ~/zcm/swapfile
echo 60 > /proc/sys/vm/swappiness

echo zsmalloc > /sys/module/zswap/parameters/zpool
echo lz4 > /sys/module/zswap/parameters/compressor
echo 1 > /sys/module/zswap/parameters/shrinker_enabled
echo 1 > /sys/module/zswap/parameters/enabled

if ! [ -d $tmpdir ]; then
    mkdir -p $tmpdir
    mount -t tmpfs -o size=100% nodev $tmpdir
fi

mkdir -p $cgroup
echo $memory_max > $cgroup/memory.max
echo $$ > $cgroup/cgroup.procs

rm -rf $workdir
mkdir -p $workdir
cd $workdir

tar xvf $linux_src
cd linux-6.6
make -j$NR_TASK clean
make defconfig
time make -j$NR_TASK
```


2024-01-18 14:48:36

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Wed, Jan 17, 2024 at 11:05:15PM -0800, Yosry Ahmed wrote:
> > I think it makes the review easier. The code being added and removed does
> > not have much overlap. Combining it into a single patch does not save
> > patch size. Having the assert check would be useful for some bisecting
> > to narrow down which step caused the problem. I am fine with squashing
> > it into one patch as well.
>
> I think having two patches is unnecessarily noisy, and we add some
> debug code in this patch that we remove in the next patch anyway.
> Let's see what others think, but personally I prefer a single patch.

+1

2024-01-18 17:02:46

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Thu, Jan 18, 2024 at 5:52 AM Matthew Wilcox <[email protected]> wrote:
>
> On Wed, Jan 17, 2024 at 10:20:29PM -0800, Yosry Ahmed wrote:
> > > /* walk the tree and free everything */
> > > spin_lock(&tree->lock);
> > > +
> > > + xas_for_each(&xas, e, ULONG_MAX)
> >
> > Why not use xa_for_each?
>
> xas_for_each() is O(n) while xa_for_each() is O(n log n), as mentioned
> in the fine documentation. If you don't need to drop the lock while
> in the body of the loop, always prefer xas_for_each().

Thanks for pointing this out. Out of ignorance, I skipped reading the
doc for this one and operated under the general assumption to use xa_*
functions where possible.

The doc also says we should hold either the RCU read lock or the
xa_lock while iterating, we are not doing either here, no?

2024-01-18 17:15:37

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Wed, Jan 17, 2024 at 11:28 PM Chris Li <[email protected]> wrote:
>
> On Wed, Jan 17, 2024 at 11:05 PM Yosry Ahmed <[email protected]> wrote:
> >
> > The name changes from Chris to Christopher are confusing :D
> >
> > >
> > > I think it makes the review easier. The code being added and removed does
> > > not have much overlap. Combining it into a single patch does not save
> > > patch size. Having the assert check would be useful for some bisecting
> > > to narrow down which step caused the problem. I am fine with squashing
> > > it into one patch as well.
> >
> > I think having two patches is unnecessarily noisy, and we add some
> > debug code in this patch that we remove in the next patch anyway.
> > Let's see what others think, but personally I prefer a single patch.
> >
> > > >
> > > > >
> > > > > I expect to merge the zswap rb tree spin lock with the xarray
> > > > > lock in the follow up changes.
> > > >
> > > > Shouldn't this simply be changing uses of tree->lock to use
> > > > xa_{lock/unlock}? We also need to make sure we don't try to lock the
> > > > tree when operating on the xarray if the caller is already holding the
> > > > lock, but this seems to be straightforward enough to be done as part
> > > > of this patch or this series at least.
> > > >
> > > > Am I missing something?
> > >
> > > Currently the zswap entry refcount is protected by the zswap tree spin
> > > lock as well. Can't remove the tree spin lock without changing the
> > > refcount code. I think the zswap search entry should just return the
> > > entry with refcount atomic increase, inside the RCU read() or xarray
> > > lock. The previous zswap code does the find_and_get entry() which is
> > > closer to what I want.
> >
> > I think this can be done in an RCU read section surrounding xa_load()
>
> xa_load() already has RCU read lock inside. If you do that you might
> just as well use some XAS API to work with the lock directly.

RCU read locks are nestable; some users of xa_load() do so in an RCU
read section, also for refcounting purposes. Also, I thought the point
was avoiding the lock in this path.

>
> > and the refcount increment. Didn't look closely to check how much
> > complexity this adds to manage refcounts with RCU, but I think there
> > should be a lot of examples all around the kernel.
>
> The complexity is not adding the refcount inside xa_load(). It is on
> the zswap code that calls zswap_search() and zswap_{insert,erase}().
> As far as I can tell, those codes need some tricky changes to go along
> with the refcount change.

I don't think it should be very tricky.
https://docs.kernel.org/RCU/rcuref.html may have relevant examples,
and there should be examples all over the code.

>
> >
> > IIUC, there are no performance benefits from this conversion until we
> > remove the tree spinlock, right?
>
> The original intent is helping the long tail case. RB tree has worse
> long tails than xarray. I expect it will help the page fault long tail
> even without removing the tree spinlock.

I think it would be better if we can remove the tree spinlock as part
of this change.

2024-01-18 18:01:57

by Nhat Pham

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
>
> The RB tree shows some contribution to the swap fault
> long tail latency due to two factors:
> 1) RB tree requires re-balance from time to time.
> 2) The zswap RB tree has a tree level spin lock protecting
> the tree access.
>
> The swap cache uses an xarray. A breakdown of the swap
> cache access does not show the same long tail latency as
> the zswap RB tree.
>
> Moving the zswap entries to an xarray lets the read side
> take only the RCU read lock.
>
> The first patch adds the xarray alongside the RB tree.
> There are some debug checks asserting that the xarray
> agrees with the RB tree results.
>
> The second patch removes the zswap RB tree.
>
> I expect to merge the zswap rb tree spin lock with the xarray
> lock in the follow up changes.
>
> I can surely use some help in reviewing and testing.
>
> Signed-off-by: Chris Li <[email protected]>
> ---
> Chris Li (2):
> mm: zswap.c: add xarray tree to zswap

While I think it is pretty neat to keep the rbtree around to check if
the results agree during development stages, in the final version
please squash the patches. One patch is enough :)

> mm: zswap.c: remove RB tree
>
> mm/zswap.c | 120 ++++++++++++++++++++++++++++++-------------------------------
> 1 file changed, 59 insertions(+), 61 deletions(-)
> ---
> base-commit: d7ba3d7c3bf13e2faf419cce9e9bdfc3a1a50905
> change-id: 20240104-zswap-xarray-716260e541e3
>
> Best regards,
> --
> Chris Li <[email protected]>
>

2024-01-18 19:01:31

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

* Christopher Li <[email protected]> [240118 01:48]:
> On Wed, Jan 17, 2024 at 10:02 PM Yosry Ahmed <[email protected]> wrote:
> >
> > That's a long CC list for sure :)
> >
> > On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
> > >
> > > The RB tree shows some contribution to the swap fault
> > > long tail latency due to two factors:
> > > 1) RB tree requires re-balance from time to time.
> > > 2) The zswap RB tree has a tree level spin lock protecting
> > > the tree access.
> > >
> > > The swap cache uses an xarray. A breakdown of the swap
> > > cache access does not show the same long tail latency as
> > > the zswap RB tree.
> >
> > I think the comparison to the swap cache may not be valid as the swap
> > cache has many trees per swapfile, while zswap has a single tree.
>
> Yes, good point. I think we can benchmark the xarray zswap vs the RB
> tree zswap; that would be more of a direct comparison.
>
> > > Moving the zswap entries to an xarray lets the read side
> > > take only the RCU read lock.
> >
> > Nice.
> >
> > >
> > > The first patch adds the xarray alongside the RB tree.
> > > There are some debug checks asserting that the xarray
> > > agrees with the RB tree results.
> > >
> > > The second patch removes the zswap RB tree.
> >
> > The breakdown looks like something that would be a development step,
> > but for patch submission I think it makes more sense to have a single
> > patch replacing the rbtree with an xarray.
>
> I think it makes the review easier. The code being added and removed does
> not have much overlap. Combining it into a single patch does not save
> patch size. Having the assert check would be useful for some bisecting
> to narrow down which step caused the problem. I am fine with squashing
> it into one patch as well.

I had thought similar when I replaced the rbtree with the maple tree in
the VMA space. That conversion was more involved and I wanted to detect
if there was ever any difference, and where I had made the error in the
multiple patch conversion.

This became rather painful once an issue was found, as then anyone
bisecting other issues could hit this difference and either blamed the
commit pointing at the BUG_ON() or gave up (I don't blame them for
giving up, I would). With only two commits, it may be easier for people
to see a fixed tag pointing to the same commit that bisect found (if
they check), but it proved an issue with my multiple patch conversion.

You may not experience this issue with the users of the zswap, but I
plan to avoid doing this again in the future. At least a WARN_ON_ONCE()
and a comment might help?
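
For example, something like this instead of the BUG_ON() in patch 1
(a sketch of what I mean):

    e = xa_load(&tree->xarray, offset);
    /* the xarray must agree with the rbtree during the transition */
    WARN_ON_ONCE(entry != e);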

Thanks,
Liam

2024-01-18 19:36:27

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: zswap.c: remove RB tree

On Wed, Jan 17, 2024 at 10:35 PM Yosry Ahmed <[email protected]> wrote:
>
> > @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> > static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> > struct zswap_entry **dupentry)
> > {
> > - struct rb_root *root = &tree->rbroot;
> > - struct rb_node **link = &root->rb_node, *parent = NULL;
> > - struct zswap_entry *myentry, *old;
> > - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> > -
> > -
> > - while (*link) {
> > - parent = *link;
> > - myentry = rb_entry(parent, struct zswap_entry, rbnode);
> > - myentry_offset = swp_offset(myentry->swpentry);
> > - if (myentry_offset > entry_offset)
> > - link = &(*link)->rb_left;
> > - else if (myentry_offset < entry_offset)
> > - link = &(*link)->rb_right;
> > - else {
> > - old = xa_load(&tree->xarray, entry_offset);
> > - BUG_ON(old != myentry);
> > - *dupentry = myentry;
> > + struct zswap_entry *e;
> > + pgoff_t offset = swp_offset(entry->swpentry);
> > + XA_STATE(xas, &tree->xarray, offset);
> > +
> > + do {
> > + xas_lock_irq(&xas);
> > + do {
> > + e = xas_load(&xas);
> > + if (xa_is_zero(e))
> > + e = NULL;
> > + } while (xas_retry(&xas, e));
> > + if (xas_valid(&xas) && e) {
> > + xas_unlock_irq(&xas);
> > + *dupentry = e;
> > return -EEXIST;
> > }
> > - }
> > - rb_link_node(&entry->rbnode, parent, link);
> > - rb_insert_color(&entry->rbnode, root);
> > - old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> > - return 0;
> > + xas_store(&xas, entry);
> > + xas_unlock_irq(&xas);
> > + } while (xas_nomem(&xas, GFP_KERNEL));
> > + return xas_error(&xas);
>
> I think using the xas_* APIs can be avoided here. The only reason we
> need it is that we want to check if there's an existing entry first,
> and return -EEXIST. However, in that case, the caller will replace it
> anyway (and do some operations on the dupentry):
>
> while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
> WARN_ON(1);
> zswap_duplicate_entry++;
> zswap_invalidate_entry(tree, dupentry);
> }
>
> So I think we can do something like this in zswap_insert() instead:
>
> dupentry = xa_store(..);
> if (WARN_ON(dupentry)) {
> zswap_duplicate_entry++;
> zswap_invalidate_entry(tree, dupentry);
> }

If this is doable, I think we can return xa_store(..) and keep the
logic in the caller. I think there's a chance
zswap_{search/insert/erase} may end up being very thin wrappers around
xa_{load/store/erase}, and we may be able to remove them completely.
Let's see how it goes.

2024-01-18 20:22:10

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Wed, Jan 17, 2024 at 10:20:29PM -0800, Yosry Ahmed wrote:
> > /* walk the tree and free everything */
> > spin_lock(&tree->lock);
> > +
> > + xas_for_each(&xas, e, ULONG_MAX)
>
> Why not use xa_for_each?

xas_for_each() is O(n) while xa_for_each() is O(n log n), as mentioned
in the fine documentation. If you don't need to drop the lock while
in the body of the loop, always prefer xas_for_each().
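
To make the difference concrete (a sketch; the names are just taken from
the patch above):

    XA_STATE(xas, &tree->xarray, 0);

    /* xas_for_each(): one walk; the xa_state remembers where it is */
    xas_for_each(&xas, entry, ULONG_MAX)
        ...

    /* xa_for_each(): every iteration restarts the walk from the top */
    unsigned long index;
    xa_for_each(&tree->xarray, index, entry)
        ...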


2024-01-18 20:28:50

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Thu, Jan 18, 2024 at 08:59:55AM -0800, Yosry Ahmed wrote:
> On Thu, Jan 18, 2024 at 5:52 AM Matthew Wilcox <[email protected]> wrote:
> >
> > On Wed, Jan 17, 2024 at 10:20:29PM -0800, Yosry Ahmed wrote:
> > > > /* walk the tree and free everything */
> > > > spin_lock(&tree->lock);
> > > > +
> > > > + xas_for_each(&xas, e, ULONG_MAX)
> > >
> > > Why not use xa_for_each?
> >
> > xas_for_each() is O(n) while xa_for_each() is O(n log n), as mentioned
> > in the fine documentation. If you don't need to drop the lock while
> > in the body of the loop, always prefer xas_for_each().
>
> Thanks for pointing this out. Out of ignorance, I skipped reading the
> doc for this one and operated under the general assumption to use xa_*
> functions were possible.
>
> The doc also says we should hold either the RCU read lock or the
> xa_lock while iterating, we are not doing either here, no?

I have no idea; I haven't studied the patches in detail yet. I have
debugging assertions for that, so I was assuming that Chris had been
developing with debug options turned on. If not, I guess the bots will
do it for him.

2024-01-19 04:59:45

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Wed, Jan 17, 2024 at 11:35 PM Chengming Zhou
<[email protected]> wrote:

> >>> mm-stable zswap-split-tree zswap-xarray
> >>> real 1m10.442s 1m4.157s 1m9.962s
> >>> user 17m48.232s 17m41.477s 17m45.887s
> >>> sys 8m13.517s 5m2.226s 7m59.305s
> >>>
> >>> Looks like the contention of concurrency is still there, I haven't
> >>> looked into the code yet, will review it later.
> >
> > Thanks for the quick test. Interesting to see the sys usage drop for
> > the xarray case even with the spin lock.
> > Not sure if the 13 second saving is statistically significant or not.
> >
> > We might need to have both xarray and split trees for the zswap. It is
> > likely removing the spin lock wouldn't be able to make up the 35%
> > difference. That is just my guess. There is only one way to find out.
>
> Yes, I totally agree with this! IMHO, concurrent zswap_store paths still
> have to contend for the xarray spinlock even though we would have converted
> the rb-tree to the xarray structure at last. So I think we should have both.
>
> >
> > BTW, do you have a script I can run to replicate your results?

Hi Chengming,

Thanks for your script.

>
> ```
> #!/bin/bash
>
> testname="build-kernel-tmpfs"
> cgroup="/sys/fs/cgroup/$testname"
>
> tmpdir="/tmp/vm-scalability-tmp"
> workdir="$tmpdir/$testname"
>
> memory_max="$((2 * 1024 * 1024 * 1024))"
>
> linux_src="/root/zcm/linux-6.6.tar.xz"
> NR_TASK=32
>
> swapon ~/zcm/swapfile

How big is your swapfile here?

It seems you have only one swapfile there. That can explain the contention.
Have you tried multiple swapfiles for the same test?
That should reduce the contention without using your patch.

Chris

> echo 60 > /proc/sys/vm/swappiness
>
> echo zsmalloc > /sys/module/zswap/parameters/zpool
> echo lz4 > /sys/module/zswap/parameters/compressor
> echo 1 > /sys/module/zswap/parameters/shrinker_enabled
> echo 1 > /sys/module/zswap/parameters/enabled
>
> if ! [ -d $tmpdir ]; then
> mkdir -p $tmpdir
> mount -t tmpfs -o size=100% nodev $tmpdir
> fi
>
> mkdir -p $cgroup
> echo $memory_max > $cgroup/memory.max
> echo $$ > $cgroup/cgroup.procs
>
> rm -rf $workdir
> mkdir -p $workdir
> cd $workdir
>
> tar xvf $linux_src
> cd linux-6.6
> make -j$NR_TASK clean
> make defconfig
> time make -j$NR_TASK
> ```
>
>

2024-01-19 05:13:32

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Thu, Jan 18, 2024 at 11:00 AM Liam R. Howlett
<[email protected]> wrote:
> > >
> > > >
> > > > The first patch adds the xarray alongside the RB tree.
> > > > There is some debug check asserting the xarray agrees with
> > > > the RB tree results.
> > > >
> > > > The second patch removes the zwap RB tree.
> > >
> > > The breakdown looks like something that would be a development step,
> > > but for patch submission I think it makes more sense to have a single
> > > patch replacing the rbtree with an xarray.
> >
> > I think it makes the review easier. The code adding and removing does
> > not have much overlap. Combining it to a single patch does not save
> > patch size. Having the assert check would be useful for some bisecting
> > to narrow down which step causing the problem. I am fine with squash
> > it to one patch as well.
>
> I had thought similar when I replaced the rbtree with the maple tree in
> the VMA space. That conversion was more involved and I wanted to detect
> if there was ever any difference, and where I had made the error in the
> multiple patch conversion.
>
> This became rather painful once an issue was found, as then anyone
> bisecting other issues could hit this difference and either blamed the
> commit pointing at the BUG_ON() or gave up (I don't blame them for
> giving up, I would). With only two commits, it may be easier for people
> to see a fixed tag pointing to the same commit that bisect found (if
> they check), but it proved an issue with my multiple patch conversion.

Thanks for sharing your experience. That debug assert did help me
catch issues in my own internal version after rebasing to the latest
mm tree. If users can't make use of it for bisecting, then I agree we
don't need the assert in the official version. I can always bisect on
my own internal version.

>
> You may not experience this issue with the users of the zswap, but I
> plan to avoid doing this again in the future. At least a WARN_ON_ONCE()
> and a comment might help?

Sure, I might just merge the two patches and drop the BUG_ON().

Chris

2024-01-19 05:14:36

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Thu, Jan 18, 2024 at 10:01 AM Nhat Pham <[email protected]> wrote:
>
> On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
> >
> > The RB tree shows some contribution to the swap fault
> > long tail latency due to two factors:
> > 1) RB tree requires re-balance from time to time.
> > 2) The zswap RB tree has a tree level spin lock protecting
> > the tree access.
> >
> > The swap cache is using xarray. The break down the swap
> > cache access does not have the similar long time as zswap
> > RB tree.
> >
> > Moving the zswap entry to xarray enable read side
> > take read RCU lock only.
> >
> > The first patch adds the xarray alongside the RB tree.
> > There is some debug check asserting the xarray agrees with
> > the RB tree results.
> >
> > The second patch removes the zwap RB tree.
> >
> > I expect to merge the zswap rb tree spin lock with the xarray
> > lock in the follow up changes.
> >
> > I can surely use some help in reviewing and testing.
> >
> > Signed-off-by: Chris Li <[email protected]>
> > ---
> > Chris Li (2):
> > mm: zswap.c: add xarray tree to zswap
>
> While I think it is pretty neat to keep the rbtree around to check if
> the results agree during development stages, in the final version
> please squash the patches. One patch is enough :)

Ack.

Chris

2024-01-19 05:24:37

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

Hi Yosry,

On Wed, Jan 17, 2024 at 10:21 PM Yosry Ahmed <[email protected]> wrote:
>
> On Wed, Jan 17, 2024 at 7:06 PM Chris Li <[email protected]> wrote:
> >
> > The xarray tree is added alongside the zswap RB tree.
> > Checks for the xarray get the same result as the RB tree operations.
> >
> > Rename the zswap RB tree function to a more generic function
> > name without the RB part.
>
> As I mentioned in the cover letter, I believe this should be squashed
> into the second patch. I have some comments below as well on the parts
> that should remain after the squash.
>
> [..]
> >
> > @@ -462,9 +463,9 @@ static void zswap_lru_putback(struct list_lru *list_lru,
> > /*********************************
> > * rbtree functions
> > **********************************/
> > -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
> > +static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
>
> Let's change the zswap_rb_* prefixes to zswap_tree_* instead of just
> zswap_*. Otherwise, it will be confusing to have both zswap_store and
> zswap_insert (as well as zswap_load and zswap_search).

How about zswap_xa_* ?

>
> [..]
> > @@ -1790,15 +1808,21 @@ void zswap_swapon(int type)
> > void zswap_swapoff(int type)
> > {
> > struct zswap_tree *tree = zswap_trees[type];
> > - struct zswap_entry *entry, *n;
> > + struct zswap_entry *entry, *e, *n;
> > + XA_STATE(xas, tree ? &tree->xarray : NULL, 0);
> >
> > if (!tree)
> > return;
> >
> > /* walk the tree and free everything */
> > spin_lock(&tree->lock);
> > +
> > + xas_for_each(&xas, e, ULONG_MAX)
>
> Why not use xa_for_each?
>
> > + zswap_invalidate_entry(tree, e);
> > +
> > rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
> > - zswap_free_entry(entry);
>
> Replacing zswap_free_entry() with zswap_invalidate_entry() is a
> behavioral change that should be done separate from this series, but I
> am wondering why it's needed. IIUC, the swapoff code should be making
> sure there are no ongoing swapin/swapout operations, and there are no
> pages left in zswap to writeback.
>
> Is it the case that swapoff may race with writeback, such that
> writeback is holding the last remaining ref after zswap_invalidate()
> is called, and then zswap_swapoff() is called freeing the zswap entry
> while writeback is still accessing it?

For the RB tree, the mapping is stored in the zswap entry itself as the
RB node. That is different from the xarray: the xarray stores the
mapping outside of the zswap entry. Just freeing the entry does not
remove the mapping from the xarray, so it needs to call
zswap_invalidate_entry() to remove the entry from the xarray. I could
call zswap_erase() and then free the entry; I just think
zswap_invalidate_entry() is more consistent with the rest of the code.
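
To make the difference concrete, the two swapoff cleanups side by side
(a sketch based on the loops already in the patch, not new code):

/* RB tree: the rb_node is embedded in the entry, so freeing it is enough */
rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
	zswap_free_entry(entry);

/* xarray: the slot lives in the xarray nodes, so the mapping must be
 * erased too; zswap_invalidate_entry() erases the slot and drops the ref */
xas_for_each(&xas, e, ULONG_MAX)
	zswap_invalidate_entry(tree, e);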

Chris

2024-01-19 05:28:35

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Thu, Jan 18, 2024 at 10:25 AM Matthew Wilcox <[email protected]> wrote:
>
> On Thu, Jan 18, 2024 at 08:59:55AM -0800, Yosry Ahmed wrote:
> > > On Thu, Jan 18, 2024 at 5:52 AM Matthew Wilcox <[email protected]> wrote:
> > >
> > > On Wed, Jan 17, 2024 at 10:20:29PM -0800, Yosry Ahmed wrote:
> > > > > /* walk the tree and free everything */
> > > > > spin_lock(&tree->lock);
> > > > > +
> > > > > + xas_for_each(&xas, e, ULONG_MAX)
> > > >
> > > > Why not use xa_for_each?
> > >
> > > xas_for_each() is O(n) while xa_for_each() is O(n log n), as mentioned
> > > in the fine documentation. If you don't need to drop the lock while
> > > in the body of the loop, always prefer xas_for_each().
> >
> > Thanks for pointing this out. Out of ignorance, I skipped reading the
> > doc for this one and operated under the general assumption to use xa_*
> > functions were possible.
> >
> > The doc also says we should hold either the RCU read lock or the
> > xa_lock while iterating, we are not doing either here, no?
>
> I have no idea; I haven't studied the patches in detail yet. I have
> debugging assertions for that, so I was assuming that Chris had been
> developing with debug options turned on. If not, I guess the bots will
> do it for him.

It is fine now because we still have the extra zswap tree spin lock.
Once we remove the zswap tree spin lock, it will require the RCU read
lock. You are right, I would get an assertion failure.

Chris

2024-01-19 05:43:59

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: zswap.c: remove RB tree

Hi Yosry,

On Wed, Jan 17, 2024 at 10:35 PM Yosry Ahmed <[email protected]> wrote:
>
> > @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> > static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> > struct zswap_entry **dupentry)
> > {
> > - struct rb_root *root = &tree->rbroot;
> > - struct rb_node **link = &root->rb_node, *parent = NULL;
> > - struct zswap_entry *myentry, *old;
> > - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> > -
> > -
> > - while (*link) {
> > - parent = *link;
> > - myentry = rb_entry(parent, struct zswap_entry, rbnode);
> > - myentry_offset = swp_offset(myentry->swpentry);
> > - if (myentry_offset > entry_offset)
> > - link = &(*link)->rb_left;
> > - else if (myentry_offset < entry_offset)
> > - link = &(*link)->rb_right;
> > - else {
> > - old = xa_load(&tree->xarray, entry_offset);
> > - BUG_ON(old != myentry);
> > - *dupentry = myentry;
> > + struct zswap_entry *e;
> > + pgoff_t offset = swp_offset(entry->swpentry);
> > + XA_STATE(xas, &tree->xarray, offset);
> > +
> > + do {
> > + xas_lock_irq(&xas);
> > + do {
> > + e = xas_load(&xas);
> > + if (xa_is_zero(e))
> > + e = NULL;
> > + } while (xas_retry(&xas, e));
> > + if (xas_valid(&xas) && e) {
> > + xas_unlock_irq(&xas);
> > + *dupentry = e;
> > return -EEXIST;
> > }
> > - }
> > - rb_link_node(&entry->rbnode, parent, link);
> > - rb_insert_color(&entry->rbnode, root);
> > - old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> > - return 0;
> > + xas_store(&xas, entry);
> > + xas_unlock_irq(&xas);
> > + } while (xas_nomem(&xas, GFP_KERNEL));
> > + return xas_error(&xas);
>
> I think using the xas_* APIs can be avoided here. The only reason we
> need it is that we want to check if there's an existing entry first,
> and return -EEXIST. However, in that case, the caller will replace it
> anyway (and do some operations on the dupentry):

We might be able to do that for the insert case, if we don't mind
changing the code behavior a bit. My original intent is to stay close
to the original zswap code and not stir the pot too much for the xarray
replacement. We can always make more adjustments once the RB tree is
gone.

> while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
> WARN_ON(1);
> zswap_duplicate_entry++;
> zswap_invalidate_entry(tree, dupentry);
> }
>
> So I think we can do something like this in zswap_insert() instead:
>
> dupentry = xa_store(..);
> if (WARN_ON(dupentry)) {
> zswap_duplicate_entry++;
> zswap_invalidate_entry(tree, dupentry);
> }
>
> WDYT?
>
> > }
> >
> > static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> > {
> > + struct zswap_entry *e;
> > pgoff_t offset = swp_offset(entry->swpentry);
> > - if (!RB_EMPTY_NODE(&entry->rbnode)) {
> > - struct zswap_entry *old;
> > - old = xa_erase(&tree->xarray, offset);
> > - BUG_ON(old != entry);
> > - rb_erase(&entry->rbnode, &tree->rbroot);
> > - RB_CLEAR_NODE(&entry->rbnode);
> > - return true;
> > - }
> > - return false;
> > + XA_STATE(xas, &tree->xarray, offset);
> > +
> > + do {
> > + xas_lock_irq(&xas);
> > + do {
> > + e = xas_load(&xas);
> > + } while (xas_retry(&xas, e));
> > + if (xas_valid(&xas) && e != entry) {
> > + xas_unlock_irq(&xas);
> > + return false;
> > + }
> > + xas_store(&xas, NULL);
> > + xas_unlock_irq(&xas);
> > + } while (xas_nomem(&xas, GFP_KERNEL));
> > + return !xas_error(&xas);
> > }
>
> Same here, I think we just want:
>
> return !!xa_erase(..);

The erase case is tricky.
The current zswap code does not erase an entry if the tree entry at
the same offset has been changed. It should be fine if the new entry
is NULL; that just means some other path already raced to remove the
entry. However, if the entry is not NULL, then force-resetting it to
NULL changes the behavior compared to the current code.

From reading the zswap code, I am not sure this is impossible in
theory: someone races to remove the zswap entry, then reclaim converts
that page back to zswap with a new entry. By the time zswap_erase()
tries to erase the current entry, the slot holds the new entry. It
seems the right thing to do in this unlikely case is to skip the forced
erase and only drop the current entry, assuming someone else has
already invalidated it. Don't touch the entry in the tree; we assume it
is a newer generation.

If we can reason that the above cannot happen, then force-erasing and
resetting the entry to NULL should be fine (famous last words). Note
that this is a behavior change, so I would like to separate it out from
the drop-in replacement patch (which keeps the original behavior).

Chris




>
> >
> > static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> > @@ -583,7 +563,6 @@ static void zswap_entry_put(struct zswap_tree *tree,
> >
> > WARN_ON_ONCE(refcount < 0);
> > if (refcount == 0) {
> > - WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
> > zswap_free_entry(entry);
> > }
>
> nit: the braces are no longer needed here
>

2024-01-19 05:49:47

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: zswap.c: remove RB tree

On Thu, Jan 18, 2024 at 11:36 AM Yosry Ahmed <[email protected]> wrote:
>
> On Wed, Jan 17, 2024 at 10:35 PM Yosry Ahmed <[email protected]> wrote:
> >
> > > @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> > > static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> > > struct zswap_entry **dupentry)
> > > {
> > > - struct rb_root *root = &tree->rbroot;
> > > - struct rb_node **link = &root->rb_node, *parent = NULL;
> > > - struct zswap_entry *myentry, *old;
> > > - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> > > -
> > > -
> > > - while (*link) {
> > > - parent = *link;
> > > - myentry = rb_entry(parent, struct zswap_entry, rbnode);
> > > - myentry_offset = swp_offset(myentry->swpentry);
> > > - if (myentry_offset > entry_offset)
> > > - link = &(*link)->rb_left;
> > > - else if (myentry_offset < entry_offset)
> > > - link = &(*link)->rb_right;
> > > - else {
> > > - old = xa_load(&tree->xarray, entry_offset);
> > > - BUG_ON(old != myentry);
> > > - *dupentry = myentry;
> > > + struct zswap_entry *e;
> > > + pgoff_t offset = swp_offset(entry->swpentry);
> > > + XA_STATE(xas, &tree->xarray, offset);
> > > +
> > > + do {
> > > + xas_lock_irq(&xas);
> > > + do {
> > > + e = xas_load(&xas);
> > > + if (xa_is_zero(e))
> > > + e = NULL;
> > > + } while (xas_retry(&xas, e));
> > > + if (xas_valid(&xas) && e) {
> > > + xas_unlock_irq(&xas);
> > > + *dupentry = e;
> > > return -EEXIST;
> > > }
> > > - }
> > > - rb_link_node(&entry->rbnode, parent, link);
> > > - rb_insert_color(&entry->rbnode, root);
> > > - old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> > > - return 0;
> > > + xas_store(&xas, entry);
> > > + xas_unlock_irq(&xas);
> > > + } while (xas_nomem(&xas, GFP_KERNEL));
> > > + return xas_error(&xas);
> >
> > I think using the xas_* APIs can be avoided here. The only reason we
> > need it is that we want to check if there's an existing entry first,
> > and return -EEXIST. However, in that case, the caller will replace it
> > anyway (and do some operations on the dupentry):
> >
> > while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
> > WARN_ON(1);
> > zswap_duplicate_entry++;
> > zswap_invalidate_entry(tree, dupentry);
> > }
> >
> > So I think we can do something like this in zswap_insert() instead:
> >
> > dupentry = xa_store(..);
> > if (WARN_ON(dupentry)) {
> > zswap_duplicate_entry++;
> > zswap_invalidate_entry(tree, dupentry);
> > }
>
> If this is doable, I think we can return xa_store(..) and keep the
> logic in the caller. I think there's a chance
> zswap_{search/insert/erase} may end up being very thin wrappers around
> xa_{load/store/erase}, and we may be able to remove them completely.
> Let's see how it goes.
>

See my other email about erasing an entry that has raced into a new
entry. That is the part I am not certain about.
Anyway, when zswap adopts folios, it might need to handle multi-index
entries, and we will be back to using the xas_* API.

Chris

2024-01-19 06:19:17

by Chengming Zhou

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On 2024/1/19 12:59, Chris Li wrote:
> On Wed, Jan 17, 2024 at 11:35 PM Chengming Zhou
> <[email protected]> wrote:
>
>>>>> mm-stable zswap-split-tree zswap-xarray
>>>>> real 1m10.442s 1m4.157s 1m9.962s
>>>>> user 17m48.232s 17m41.477s 17m45.887s
>>>>> sys 8m13.517s 5m2.226s 7m59.305s
>>>>>
>>>>> Looks like the contention of concurrency is still there, I haven't
>>>>> look into the code yet, will review it later.
>>>
>>> Thanks for the quick test. Interesting to see the sys usage drop for
>>> the xarray case even with the spin lock.
>>> Not sure if the 13 second saving is statistically significant or not.
>>>
>>> We might need to have both xarray and split trees for the zswap. It is
>>> likely removing the spin lock wouldn't be able to make up the 35%
>>> difference. That is just my guess. There is only one way to find out.
>>
>> Yes, I totally agree with this! IMHO, concurrent zswap_store paths still
>> have to contend for the xarray spinlock even though we would have converted
>> the rb-tree to the xarray structure at last. So I think we should have both.
>>
>>>
>>> BTW, do you have a script I can run to replicate your results?
>
> Hi Chengming,
>
> Thanks for your script.
>
>>
>> ```
>> #!/bin/bash
>>
>> testname="build-kernel-tmpfs"
>> cgroup="/sys/fs/cgroup/$testname"
>>
>> tmpdir="/tmp/vm-scalability-tmp"
>> workdir="$tmpdir/$testname"
>>
>> memory_max="$((2 * 1024 * 1024 * 1024))"
>>
>> linux_src="/root/zcm/linux-6.6.tar.xz"
>> NR_TASK=32
>>
>> swapon ~/zcm/swapfile
>
> How big is your swapfile here?

The swapfile is big enough here, I use a 50GB swapfile.

>
> It seems you have only one swapfile there. That can explain the contention.
> Have you tried multiple swapfiles for the same test?
> That should reduce the contention without using your patch.
Do you mean having many 64MB swapfiles swapped on at the same time?
Maybe it's feasible to test; I'm not sure how swapout will choose
among them. But in our use case, we normally have only one swapfile.

Thanks.

>
> Chris
>
>> echo 60 > /proc/sys/vm/swappiness
>>
>> echo zsmalloc > /sys/module/zswap/parameters/zpool
>> echo lz4 > /sys/module/zswap/parameters/compressor
>> echo 1 > /sys/module/zswap/parameters/shrinker_enabled
>> echo 1 > /sys/module/zswap/parameters/enabled
>>
>> if ! [ -d $tmpdir ]; then
>> mkdir -p $tmpdir
>> mount -t tmpfs -o size=100% nodev $tmpdir
>> fi
>>
>> mkdir -p $cgroup
>> echo $memory_max > $cgroup/memory.max
>> echo $$ > $cgroup/cgroup.procs
>>
>> rm -rf $workdir
>> mkdir -p $workdir
>> cd $workdir
>>
>> tar xvf $linux_src
>> cd linux-6.6
>> make -j$NR_TASK clean
>> make defconfig
>> time make -j$NR_TASK
>> ```
>>
>>

2024-01-19 10:27:15

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Thu, Jan 18, 2024 at 10:19 PM Chengming Zhou
<[email protected]> wrote:
>
> On 2024/1/19 12:59, Chris Li wrote:
> > On Wed, Jan 17, 2024 at 11:35 PM Chengming Zhou
> > <[email protected]> wrote:
> >
> >>>>> mm-stable zswap-split-tree zswap-xarray
> >>>>> real 1m10.442s 1m4.157s 1m9.962s
> >>>>> user 17m48.232s 17m41.477s 17m45.887s
> >>>>> sys 8m13.517s 5m2.226s 7m59.305s
> >>>>>
> >>>>> Looks like the contention of concurrency is still there, I haven't
> >>>>> look into the code yet, will review it later.
> >>>
> >>> Thanks for the quick test. Interesting to see the sys usage drop for
> >>> the xarray case even with the spin lock.
> >>> Not sure if the 13 second saving is statistically significant or not.
> >>>
> >>> We might need to have both xarray and split trees for the zswap. It is
> >>> likely removing the spin lock wouldn't be able to make up the 35%
> >>> difference. That is just my guess. There is only one way to find out.
> >>
> >> Yes, I totally agree with this! IMHO, concurrent zswap_store paths still
> >> have to contend for the xarray spinlock even though we would have converted
> >> the rb-tree to the xarray structure at last. So I think we should have both.
> >>
> >>>
> >>> BTW, do you have a script I can run to replicate your results?
> >
> > Hi Chengming,
> >
> > Thanks for your script.
> >
> >>
> >> ```
> >> #!/bin/bash
> >>
> >> testname="build-kernel-tmpfs"
> >> cgroup="/sys/fs/cgroup/$testname"
> >>
> >> tmpdir="/tmp/vm-scalability-tmp"
> >> workdir="$tmpdir/$testname"
> >>
> >> memory_max="$((2 * 1024 * 1024 * 1024))"
> >>
> >> linux_src="/root/zcm/linux-6.6.tar.xz"
> >> NR_TASK=32
> >>
> >> swapon ~/zcm/swapfile
> >
> > How big is your swapfile here?
>
> The swapfile is big enough here, I use a 50GB swapfile.

Thanks,

>
> >
> > It seems you have only one swapfile there. That can explain the contention.
> > Have you tried multiple swapfiles for the same test?
> > That should reduce the contention without using your patch.
> Do you mean to have many 64MB swapfiles to swapon at the same time?

64MB is too small. There is a limit of MAX_SWAPFILES, which is a bit
less than (32 - n) swap files.
If you want to use 50GB of swap space, you can have MAX_SWAPFILES
swapfiles, each sized 50GB / MAX_SWAPFILES.

> Maybe it's feasible to test,

Of course it is testable, I am curious to see the test results.

> I'm not sure how swapout will choose.

It will rotate through swap files of the same priority first; see
get_swap_pages() in swapfile.c.

> But in our usecase, we normally have only one swapfile.

Is there a good reason why you can't use more than one swapfile?
One swapfile will not take full advantage of the existing code, even if
you split the zswap trees within a swapfile. With only one swapfile,
you will still have lock contention on "(struct swap_info_struct).lock",
which is one lock per swapfile.
Using more than one swapfile should get you better results.

Chris

2024-01-19 11:14:55

by Chengming Zhou

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On 2024/1/19 18:26, Chris Li wrote:
> On Thu, Jan 18, 2024 at 10:19 PM Chengming Zhou
> <[email protected]> wrote:
>>
>> On 2024/1/19 12:59, Chris Li wrote:
>>> On Wed, Jan 17, 2024 at 11:35 PM Chengming Zhou
>>> <[email protected]> wrote:
>>>
>>>>>>> mm-stable zswap-split-tree zswap-xarray
>>>>>>> real 1m10.442s 1m4.157s 1m9.962s
>>>>>>> user 17m48.232s 17m41.477s 17m45.887s
>>>>>>> sys 8m13.517s 5m2.226s 7m59.305s
>>>>>>>
>>>>>>> Looks like the contention of concurrency is still there, I haven't
>>>>>>> look into the code yet, will review it later.
>>>>>
>>>>> Thanks for the quick test. Interesting to see the sys usage drop for
>>>>> the xarray case even with the spin lock.
>>>>> Not sure if the 13 second saving is statistically significant or not.
>>>>>
>>>>> We might need to have both xarray and split trees for the zswap. It is
>>>>> likely removing the spin lock wouldn't be able to make up the 35%
>>>>> difference. That is just my guess. There is only one way to find out.
>>>>
>>>> Yes, I totally agree with this! IMHO, concurrent zswap_store paths still
>>>> have to contend for the xarray spinlock even though we would have converted
>>>> the rb-tree to the xarray structure at last. So I think we should have both.
>>>>
>>>>>
>>>>> BTW, do you have a script I can run to replicate your results?
>>>
>>> Hi Chengming,
>>>
>>> Thanks for your script.
>>>
>>>>
>>>> ```
>>>> #!/bin/bash
>>>>
>>>> testname="build-kernel-tmpfs"
>>>> cgroup="/sys/fs/cgroup/$testname"
>>>>
>>>> tmpdir="/tmp/vm-scalability-tmp"
>>>> workdir="$tmpdir/$testname"
>>>>
>>>> memory_max="$((2 * 1024 * 1024 * 1024))"
>>>>
>>>> linux_src="/root/zcm/linux-6.6.tar.xz"
>>>> NR_TASK=32
>>>>
>>>> swapon ~/zcm/swapfile
>>>
>>> How big is your swapfile here?
>>
>> The swapfile is big enough here, I use a 50GB swapfile.
>
> Thanks,
>
>>
>>>
>>> It seems you have only one swapfile there. That can explain the contention.
>>> Have you tried multiple swapfiles for the same test?
>>> That should reduce the contention without using your patch.
>> Do you mean to have many 64MB swapfiles to swapon at the same time?
>
> 64MB is too small. There are limits to MAX_SWAPFILES. It is less than
> (32 - n) swap files.
> If you want to use 50G swap space, you can have MAX_SWAPFILES, each
> swapfile 50GB / MAX_SWAPFILES.

Right.

>
>> Maybe it's feasible to test,
>
> Of course it is testable, I am curious to see the test results.
>
>> I'm not sure how swapout will choose.
>
> It will rotate through the same priority swap files first.
> swapfile.c: get_swap_pages().
>
>> But in our usecase, we normally have only one swapfile.
>
> Is there a good reason why you can't use more than one swapfile?

I think not, but it seems an unneeded change/burden for our admins.
So I just tested and optimized for the normal case.

> One swapfile will not take the full advantage of the existing code.
> Even if you split the zswap trees within a swapfile. With only one
> swapfile, you will still be having lock contention on "(struct
> swap_info_struct).lock".
> It is one lock per swapfile.
> Using more than one swap file should get you better results.

IIUC, we already have the per-CPU swap entry cache to avoid contending
on this lock? And I don't see this lock being very hot in the testing.

Thanks.

2024-01-19 12:02:09

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 0/2] RFC: zswap tree use xarray instead of RB tree

On Fri, Jan 19, 2024 at 3:12 AM Chengming Zhou
<[email protected]> wrote:
>
> On 2024/1/19 18:26, Chris Li wrote:
> > On Thu, Jan 18, 2024 at 10:19 PM Chengming Zhou
> > <[email protected]> wrote:
> >>
> >> On 2024/1/19 12:59, Chris Li wrote:
> >>> On Wed, Jan 17, 2024 at 11:35 PM Chengming Zhou
> >>> <[email protected]> wrote:
> >>>
> >>>>>>> mm-stable zswap-split-tree zswap-xarray
> >>>>>>> real 1m10.442s 1m4.157s 1m9.962s
> >>>>>>> user 17m48.232s 17m41.477s 17m45.887s
> >>>>>>> sys 8m13.517s 5m2.226s 7m59.305s
> >>>>>>>
> >>>>>>> Looks like the contention of concurrency is still there, I haven't
> >>>>>>> look into the code yet, will review it later.
> >>>>>
> >>>>> Thanks for the quick test. Interesting to see the sys usage drop for
> >>>>> the xarray case even with the spin lock.
> >>>>> Not sure if the 13 second saving is statistically significant or not.
> >>>>>
> >>>>> We might need to have both xarray and split trees for the zswap. It is
> >>>>> likely removing the spin lock wouldn't be able to make up the 35%
> >>>>> difference. That is just my guess. There is only one way to find out.
> >>>>
> >>>> Yes, I totally agree with this! IMHO, concurrent zswap_store paths still
> >>>> have to contend for the xarray spinlock even though we would have converted
> >>>> the rb-tree to the xarray structure at last. So I think we should have both.
> >>>>
> >>>>>
> >>>>> BTW, do you have a script I can run to replicate your results?
> >>>
> >>> Hi Chengming,
> >>>
> >>> Thanks for your script.
> >>>
> >>>>
> >>>> ```
> >>>> #!/bin/bash
> >>>>
> >>>> testname="build-kernel-tmpfs"
> >>>> cgroup="/sys/fs/cgroup/$testname"
> >>>>
> >>>> tmpdir="/tmp/vm-scalability-tmp"
> >>>> workdir="$tmpdir/$testname"
> >>>>
> >>>> memory_max="$((2 * 1024 * 1024 * 1024))"
> >>>>
> >>>> linux_src="/root/zcm/linux-6.6.tar.xz"
> >>>> NR_TASK=32
> >>>>
> >>>> swapon ~/zcm/swapfile
> >>>
> >>> How big is your swapfile here?
> >>
> >> The swapfile is big enough here, I use a 50GB swapfile.
> >
> > Thanks,
> >
> >>
> >>>
> >>> It seems you have only one swapfile there. That can explain the contention.
> >>> Have you tried multiple swapfiles for the same test?
> >>> That should reduce the contention without using your patch.
> >> Do you mean to have many 64MB swapfiles to swapon at the same time?
> >
> > 64MB is too small. There are limits to MAX_SWAPFILES. It is less than
> > (32 - n) swap files.
> > If you want to use 50G swap space, you can have MAX_SWAPFILES, each
> > swapfile 50GB / MAX_SWAPFILES.
>
> Right.
>
> >
> >> Maybe it's feasible to test,
> >
> > Of course it is testable, I am curious to see the test results.
> >
> >> I'm not sure how swapout will choose.
> >
> > It will rotate through the same priority swap files first.
> > swapfile.c: get_swap_pages().
> >
> >> But in our usecase, we normally have only one swapfile.
> >
> > Is there a good reason why you can't use more than one swapfile?
>
> I think no, but it seems an unneeded change/burden to our admin.
> So I just tested and optimized for the normal case.

I understand. Just saying it is not really a kernel limitation per se.
I blame the user space :-)

>
> > One swapfile will not take the full advantage of the existing code.
> > Even if you split the zswap trees within a swapfile. With only one
> > swapfile, you will still be having lock contention on "(struct
> > swap_info_struct).lock".
> > It is one lock per swapfile.
> > Using more than one swap file should get you better results.
>
> IIUC, we already have the per-cpu swap entry cache to not contend for
> this lock? And I don't see much hot of this lock in the testing.

Yes, the swap entry cache helps. The cache batching also causes other
problems, e.g. the long tail in swap fault handling.
Shameless plug: I have a patch posted earlier to address the swap
fault long tail latencies.

https://lore.kernel.org/linux-mm/[email protected]/T/

Chris

2024-01-19 19:30:31

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

My previous email got messed up, sorry.

> > > @@ -462,9 +463,9 @@ static void zswap_lru_putback(struct list_lru *list_lru,
> > > /*********************************
> > > * rbtree functions
> > > **********************************/
> > > -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
> > > +static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> >
> > Let's change the zswap_rb_* prefixes to zswap_tree_* instead of just
> > zswap_*. Otherwise, it will be confusing to have both zswap_store and
> > zswap_insert (as well as zswap_load and zswap_search).
>
> How about zswap_xa_* ?

SGTM.

> >
> > [..]
> > > @@ -1790,15 +1808,21 @@ void zswap_swapon(int type)
> > > void zswap_swapoff(int type)
> > > {
> > > struct zswap_tree *tree = zswap_trees[type];
> > > - struct zswap_entry *entry, *n;
> > > + struct zswap_entry *entry, *e, *n;
> > > + XA_STATE(xas, tree ? &tree->xarray : NULL, 0);
> > >
> > > if (!tree)
> > > return;
> > >
> > > /* walk the tree and free everything */
> > > spin_lock(&tree->lock);
> > > +
> > > + xas_for_each(&xas, e, ULONG_MAX)
> >
> > Why not use xa_for_each?
> >
> > > + zswap_invalidate_entry(tree, e);
> > > +
> > > rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
> > > - zswap_free_entry(entry);
> >
> > Replacing zswap_free_entry() with zswap_invalidate_entry() is a
> > behavioral change that should be done separate from this series, but I
> > am wondering why it's needed. IIUC, the swapoff code should be making
> > sure there are no ongoing swapin/swapout operations, and there are no
> > pages left in zswap to writeback.
> >
> > Is it the case that swapoff may race with writeback, such that
> > writeback is holding the last remaining ref after zswap_invalidate()
> > is called, and then zswap_swapoff() is called freeing the zswap entry
> > while writeback is still accessing it?
>
> For the RB tree the mapping is stored in the zswap entry as RB node.
> That is different from xarray. Xarry stores the mapping outside of
> zswap entry. Just freeing the entry does not remove the mapping from
> xarray. Therefore it needs to call zswap_invalidate_entry() to remove
> the entry from the xarray. I could call zswap_erase() then free entry.
> I just think zswap_invalidate_entry() is more consistent with the rest
> of the code.

I see, but it's not clear to me if the xarray is being properly
cleaned up in this case.

Do we have to call xa_destroy() anyway to make sure everything is
cleaned up in the xarray? In that case, we can just do that after the
loop.

2024-01-19 19:32:01

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Thu, Jan 18, 2024 at 9:28 PM Chris Li <[email protected]> wrote:
>
> On Thu, Jan 18, 2024 at 10:25 AM Matthew Wilcox <[email protected]> wrote:
> >
> > On Thu, Jan 18, 2024 at 08:59:55AM -0800, Yosry Ahmed wrote:
> > > On Thu, Jan 18, 2024 at 5:52 AM Matthew Wilcox <[email protected]> wrote:
> > > >
> > > > On Wed, Jan 17, 2024 at 10:20:29PM -0800, Yosry Ahmed wrote:
> > > > > > /* walk the tree and free everything */
> > > > > > spin_lock(&tree->lock);
> > > > > > +
> > > > > > + xas_for_each(&xas, e, ULONG_MAX)
> > > > >
> > > > > Why not use xa_for_each?
> > > >
> > > > xas_for_each() is O(n) while xa_for_each() is O(n log n), as mentioned
> > > > in the fine documentation. If you don't need to drop the lock while
> > > > in the body of the loop, always prefer xas_for_each().
> > >
> > > Thanks for pointing this out. Out of ignorance, I skipped reading the
> > > doc for this one and operated under the general assumption to use xa_*
> > > functions were possible.
> > >
> > > The doc also says we should hold either the RCU read lock or the
> > > xa_lock while iterating, we are not doing either here, no?
> >
> > I have no idea; I haven't studied the patches in detail yet. I have
> > debugging assertions for that, so I was assuming that Chris had been
> > developing with debug options turned on. If not, I guess the bots will
> > do it for him.
>
> It is fine now because we have the extra zswap tree spin lock. When we
> remove the zswap tree spin lock it does require RCU read lock. You are
> right I would get assertion failure.

I would imagine the assertions are that we are holding either the RCU
read lock or the xa_lock, how would holding the zswap tree lock help?

2024-01-19 19:37:16

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: zswap.c: remove RB tree

On Thu, Jan 18, 2024 at 9:43 PM Chris Li <[email protected]> wrote:
>
> Hi Yosry,
>
> On Wed, Jan 17, 2024 at 10:35 PM Yosry Ahmed <[email protected]> wrote:
> >
> > > @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> > > static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> > > struct zswap_entry **dupentry)
> > > {
> > > - struct rb_root *root = &tree->rbroot;
> > > - struct rb_node **link = &root->rb_node, *parent = NULL;
> > > - struct zswap_entry *myentry, *old;
> > > - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> > > -
> > > -
> > > - while (*link) {
> > > - parent = *link;
> > > - myentry = rb_entry(parent, struct zswap_entry, rbnode);
> > > - myentry_offset = swp_offset(myentry->swpentry);
> > > - if (myentry_offset > entry_offset)
> > > - link = &(*link)->rb_left;
> > > - else if (myentry_offset < entry_offset)
> > > - link = &(*link)->rb_right;
> > > - else {
> > > - old = xa_load(&tree->xarray, entry_offset);
> > > - BUG_ON(old != myentry);
> > > - *dupentry = myentry;
> > > + struct zswap_entry *e;
> > > + pgoff_t offset = swp_offset(entry->swpentry);
> > > + XA_STATE(xas, &tree->xarray, offset);
> > > +
> > > + do {
> > > + xas_lock_irq(&xas);
> > > + do {
> > > + e = xas_load(&xas);
> > > + if (xa_is_zero(e))
> > > + e = NULL;
> > > + } while (xas_retry(&xas, e));
> > > + if (xas_valid(&xas) && e) {
> > > + xas_unlock_irq(&xas);
> > > + *dupentry = e;
> > > return -EEXIST;
> > > }
> > > - }
> > > - rb_link_node(&entry->rbnode, parent, link);
> > > - rb_insert_color(&entry->rbnode, root);
> > > - old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> > > - return 0;
> > > + xas_store(&xas, entry);
> > > + xas_unlock_irq(&xas);
> > > + } while (xas_nomem(&xas, GFP_KERNEL));
> > > + return xas_error(&xas);
> >
> > I think using the xas_* APIs can be avoided here. The only reason we
> > need it is that we want to check if there's an existing entry first,
> > and return -EEXIST. However, in that case, the caller will replace it
> > anyway (and do some operations on the dupentry):
>
> We might be able to for the insert case if we don't mind changing the
> code behavior a bit. My original intent is to keep close to the
> original zswap code and not stir the pot too much for the xarray
> replacement. We can always make more adjustment once the RB tree is
> gone.

I don't see how this changes code behavior though. The current code in
zswap_store() will do the following:

- Hold the tree lock to make sure no one else modifies it.
- Try to insert, check if there is already a dupentry at the index and
return -EEXIST.
- Warn, increment zswap_duplicate_entry, and invalidate the dupentry.
- Try to insert again (this should always succeed since we are holding
the lock).

What I am proposing is:
- zswap_xa_insert() is a thin wrapper around xa_store() (or we can
remove it completely).
- zswap_store() does the following:
- Use zswap_xa_insert() and check if there is a returned dupentry.
- Warn, increment zswap_duplicate_entry, and invalidate the dupentry.

Either way, we always place the entry we have in the tree, and if
there is a dupentry we warn and invalidate it. If anything, the latter
is more straightforward.

Am I missing something?

> >
> > > }
> > >
> > > static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> > > {
> > > + struct zswap_entry *e;
> > > pgoff_t offset = swp_offset(entry->swpentry);
> > > - if (!RB_EMPTY_NODE(&entry->rbnode)) {
> > > - struct zswap_entry *old;
> > > - old = xa_erase(&tree->xarray, offset);
> > > - BUG_ON(old != entry);
> > > - rb_erase(&entry->rbnode, &tree->rbroot);
> > > - RB_CLEAR_NODE(&entry->rbnode);
> > > - return true;
> > > - }
> > > - return false;
> > > + XA_STATE(xas, &tree->xarray, offset);
> > > +
> > > + do {
> > > + xas_lock_irq(&xas);
> > > + do {
> > > + e = xas_load(&xas);
> > > + } while (xas_retry(&xas, e));
> > > + if (xas_valid(&xas) && e != entry) {
> > > + xas_unlock_irq(&xas);
> > > + return false;
> > > + }
> > > + xas_store(&xas, NULL);
> > > + xas_unlock_irq(&xas);
> > > + } while (xas_nomem(&xas, GFP_KERNEL));
> > > + return !xas_error(&xas);
> > > }
> >
> > Same here, I think we just want:
> >
> > return !!xa_erase(..);
>
> For the erase case it is tricky.
> The current zswap code does not erase an entry if the tree entry at
> the same offset has been changed. It should be fine if the new entry
> is NULL. Basically some race to remove the entry already. However, if
> the entry is not NULL, then force resetting it to NULL will change
> behavior compared to the current.

I see, very good point. I think we can use xa_cmpxchg() and pass in NULL?
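
Something like this, perhaps (a sketch; zswap_xa_erase() is just a
placeholder name, and it keeps the "don't touch a newer entry" semantics):

static bool zswap_xa_erase(struct zswap_tree *tree, struct zswap_entry *entry)
{
	pgoff_t offset = swp_offset(entry->swpentry);

	/* clear the slot only if it still points at this entry; if a newer
	 * entry raced in, leave it alone and report failure */
	return xa_cmpxchg(&tree->xarray, offset, entry, NULL, GFP_KERNEL) == entry;
}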

2024-01-19 19:41:47

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: zswap.c: remove RB tree

> > If this is doable, I think we can return xa_store(..) and keep the
> > logic in the caller. I think there's a chance
> > zswap_{search/insert/erase} may end up being very thin wrappers around
> > xa_{load/store/erase}, and we may be able to remove them completely.
> > Let's see how it goes.
> >
>
> See my other email about erasing an entry raced into a new entry. That
> is the part I am not certain.
> Anyway, when zswap adopte folio, then it might need to handle multi
> entry, we will be back to using the XAS API.

Handling large folios in zswap is a much larger topic that involves a
lot more than this xa_* vs. xas_* apis dispute. Let's not worry about
this for now.

2024-01-19 20:04:59

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Fri, Jan 19, 2024 at 11:29:42AM -0800, Yosry Ahmed wrote:
> I see, but it's not clear to me if the xarray is being properly
> cleaned up in this case.
>
> Do we have to call xa_destroy() anyway to make sure everything is
> cleaned up in the xarray? In that case, we can just do that after the
> loop.

You do not need to call xa_destroy(). xa_destroy() exists for two
patterns. First, you're storing values, not pointers, in the tree, and
you can just delete the tree without leaking memory. Second, you do
xas_for_each() { kfree(p); }; xa_destroy(); that's more efficient than
xas_for_each() { kfree(p); xas_store(NULL); } as it batches the freeing
of the nodes to the end.

If your code is naturally structured so that you delete the entries
after freeing them, you have no reason to call xa_destroy().
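
Spelled out, that second pattern is roughly (a generic sketch, not zswap
code; "xa" is some struct xarray):

XA_STATE(xas, &xa, 0);
void *p;

xas_lock(&xas);
xas_for_each(&xas, p, ULONG_MAX)
	kfree(p);	/* free the payloads, leave the slots in place */
xas_unlock(&xas);
xa_destroy(&xa);	/* free all the nodes in one batched pass */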

2024-01-19 21:32:43

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: zswap.c: remove RB tree

On Fri, Jan 19, 2024 at 11:37 AM Yosry Ahmed <[email protected]> wrote:
> > > I think using the xas_* APIs can be avoided here. The only reason we
> > > need it is that we want to check if there's an existing entry first,
> > > and return -EEXIST. However, in that case, the caller will replace it
> > > anyway (and do some operations on the dupentry):
> >
> > We might be able to for the insert case if we don't mind changing the
> > code behavior a bit. My original intent is to keep close to the
> > original zswap code and not stir the pot too much for the xarray
> > replacement. We can always make more adjustment once the RB tree is
> > gone.
>
> I don't see how this changes code behavior though. The current code in
> zswap_store() will do the following:

I am referring to the warning and the counter update happening after
the zswap mapping was updated. Maybe nobody actually cares about that
behavioral difference, but in my mind there is a difference.

>
> - Hold the tree lock to make sure no one else modifies it.
> - Try to insert, check if there is already a dupentry at the index and
> return -EEXIST.
> - Warn, increment zswap_duplicate_entry, and invalidate the dupentry.
> - Try to insert again (this should always succeed since we are holding
> the lock).
>
> What I am proposing is:
> - zswap_xa_insert() is a thin wrapper around xa_store() (or we can
> remove it completely).
> - zswap_store() does the following:
> - Use zswap_xa_insert() and check if there is a returned dupentry.
> - Warn, increment zswap_duplicate_entry, and invalidate the dupentry.
>
> Either way, we always place the entry we have in the tree, and if
> there is a dupentry we warn and invalidate it. If anything, the latter
> is more straightforward.
>
> Am I missing something?

No, that is what I would do if I have to use the xa_* API.


>
> > >
> > > > }
> > > >
> > > > static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> > > > {
> > > > + struct zswap_entry *e;
> > > > pgoff_t offset = swp_offset(entry->swpentry);
> > > > - if (!RB_EMPTY_NODE(&entry->rbnode)) {
> > > > - struct zswap_entry *old;
> > > > - old = xa_erase(&tree->xarray, offset);
> > > > - BUG_ON(old != entry);
> > > > - rb_erase(&entry->rbnode, &tree->rbroot);
> > > > - RB_CLEAR_NODE(&entry->rbnode);
> > > > - return true;
> > > > - }
> > > > - return false;
> > > > + XA_STATE(xas, &tree->xarray, offset);
> > > > +
> > > > + do {
> > > > + xas_lock_irq(&xas);
> > > > + do {
> > > > + e = xas_load(&xas);
> > > > + } while (xas_retry(&xas, e));
> > > > + if (xas_valid(&xas) && e != entry) {
> > > > + xas_unlock_irq(&xas);
> > > > + return false;
> > > > + }
> > > > + xas_store(&xas, NULL);
> > > > + xas_unlock_irq(&xas);
> > > > + } while (xas_nomem(&xas, GFP_KERNEL));
> > > > + return !xas_error(&xas);
> > > > }
> > >
> > > Same here, I think we just want:
> > >
> > > return !!xa_erase(..);
> >
> > For the erase case it is tricky.
> > The current zswap code does not erase an entry if the tree entry at
> > the same offset has been changed. It should be fine if the new entry
> > is NULL. Basically some race to remove the entry already. However, if
> > the entry is not NULL, then force resetting it to NULL will change
> > behavior compared to the current.
>
> I see, very good point. I think we can use xa_cmpxchg() and pass in NULL?
>
That is certainly possible. Thanks for bringing it up.
Let me try to combine the tree->lock with xarray lock first. If
xa_cmpxchg() can simplify the result there, I will use it.


> Handling large folios in zswap is a much larger topic that involves a
> lot more than this xa_* vs. xas_* apis dispute. Let's not worry about
> this for now.

Ack. One more reason to use the xas_* interface is that zswap
currently does multiple lookups on a typical zswap_load(). It first
looks up the entry by offset (lookup one). Then, after the folio is
installed in the swap cache, it deletes the entry, which performs
another lookup (lookup two). Using the xas_* API might let us cache the
node location from the first lookup and avoid the full node walk on the
second. That is not in my current patch and can be a later improvement
patch as well.
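
As an illustration of the idea (a sketch assuming the lookup and the
delete can share one locked section; "exclusive" is a placeholder
condition, and none of this is in the current patch):

XA_STATE(xas, &tree->xarray, offset);
struct zswap_entry *entry;

xas_lock_irq(&xas);
entry = xas_load(&xas);		/* lookup one */
if (entry && exclusive)
	xas_store(&xas, NULL);	/* reuses the cached node, no second walk */
xas_unlock_irq(&xas);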

Chris

2024-01-19 21:45:24

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: zswap.c: remove RB tree

On Fri, Jan 19, 2024 at 1:32 PM Chris Li <[email protected]> wrote:
>
> On Fri, Jan 19, 2024 at 11:37 AM Yosry Ahmed <[email protected]> wrote:
> > > > I think using the xas_* APIs can be avoided here. The only reason we
> > > > need it is that we want to check if there's an existing entry first,
> > > > and return -EEXIST. However, in that case, the caller will replace it
> > > > anyway (and do some operations on the dupentry):
> > >
> > > We might be able to for the insert case if we don't mind changing the
> > > code behavior a bit. My original intent is to keep close to the
> > > original zswap code and not stir the pot too much for the xarray
> > > replacement. We can always make more adjustment once the RB tree is
> > > gone.
> >
> > I don't see how this changes code behavior though. The current code in
> > zswap_store() will do the following:
>
> I am referring to the log and update counter happening after the zswap
> mapping was updated. Maybe nobody actually cares about that behavior
> difference. In my mind, there is a difference.

I don't think it matters tbh, certainly not worth the more complicated
implementation.

> > > > > static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> > > > > {
> > > > > + struct zswap_entry *e;
> > > > > pgoff_t offset = swp_offset(entry->swpentry);
> > > > > - if (!RB_EMPTY_NODE(&entry->rbnode)) {
> > > > > - struct zswap_entry *old;
> > > > > - old = xa_erase(&tree->xarray, offset);
> > > > > - BUG_ON(old != entry);
> > > > > - rb_erase(&entry->rbnode, &tree->rbroot);
> > > > > - RB_CLEAR_NODE(&entry->rbnode);
> > > > > - return true;
> > > > > - }
> > > > > - return false;
> > > > > + XA_STATE(xas, &tree->xarray, offset);
> > > > > +
> > > > > + do {
> > > > > + xas_lock_irq(&xas);
> > > > > + do {
> > > > > + e = xas_load(&xas);
> > > > > + } while (xas_retry(&xas, e));
> > > > > + if (xas_valid(&xas) && e != entry) {
> > > > > + xas_unlock_irq(&xas);
> > > > > + return false;
> > > > > + }
> > > > > + xas_store(&xas, NULL);
> > > > > + xas_unlock_irq(&xas);
> > > > > + } while (xas_nomem(&xas, GFP_KERNEL));
> > > > > + return !xas_error(&xas);
> > > > > }
> > > >
> > > > Same here, I think we just want:
> > > >
> > > > return !!xa_erase(..);
> > >
> > > For the erase case it is tricky.
> > > The current zswap code does not erase an entry if the tree entry at
> > > the same offset has been changed. It should be fine if the new entry
> > > is NULL. Basically some race to remove the entry already. However, if
> > > the entry is not NULL, then force resetting it to NULL will change
> > > behavior compared to the current.
> >
> > I see, very good point. I think we can use xa_cmpxchg() and pass in NULL?
> >
> That is certainly possible. Thanks for bringing it up.
> Let me try to combine the tree->lock with xarray lock first. If
> xa_cmpxchg() can simplify the result there, I will use it.

SGTM.

> > Handling large folios in zswap is a much larger topic that involves a
> > lot more than this xa_* vs. xas_* apis dispute. Let's not worry about
> > this for now.
>
> Ack. One more reason to use the XAS interface is that zswap currently
> does multiple lookups on typical zswap_load(). It finds entries by
> offset, for the entry (lookup one). Then after folio install to swap
> cache, it deletes the entry, it will performan another lookup to
> delete the entry (look up two). Using XAS might be able to cache the
> node location for the second lookup to avoid the full node walk. That
> is not in my current patch and can be a later improvement patch as
> well.

One more straightforward optimization we can do with the xas_* API is
to cache the lookup done in zswap_load() and reuse it when doing
invalidations for exclusive loads.

For the initial implementation, let's keep it simple and try to use
the xa_* APIs where possible.

2024-01-19 21:52:24

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Fri, Jan 19, 2024 at 12:04 PM Matthew Wilcox <[email protected]> wrote:
>
> On Fri, Jan 19, 2024 at 11:29:42AM -0800, Yosry Ahmed wrote:
> > I see, but it's not clear to me if the xarray is being properly
> > cleaned up in this case.
> >
> > Do we have to call xa_destroy() anyway to make sure everything is
> > cleaned up in the xarray? In that case, we can just do that after the
> > loop.
>
> You do not need to call xa_destroy(). xa_destroy() exists for two
> patterns: first, that you're storing values, not pointers in the tree,
> and you can just delete the tree without leaking memory. second, that
> you do xas_for_each() { kfree(p); }; xa_destroy(); that's more
> efficient than xas_for_each() { kfree(p); xas_store(NULL); } as it
> batches the freeing of the nodes to the end.
>
> if your code is naturally structured so that you delete the entries
> after freeing them, you have no reason to call xa_destroy().

Thanks for elaborating. Based on this, I believe doing xas_for_each()
{ zswap_free_entry(); }; xa_destroy(); is both closer to the current
code structure and more efficient.

2024-01-19 22:05:53

by Chris Li

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

Hi Yosry,

On Fri, Jan 19, 2024 at 1:41 PM Yosry Ahmed <[email protected]> wrote:
> > if your code is naturally structured so that you delete the entries
> > after freeing them, you have no reason to call xa_destroy().
>
> Thanks for elaborating. Based on this, I believe doing xas_for_each()
> { zswap_free_entry(); }; xa_destroy(); is both closer to the current
> code structure and more efficient.
>
Can't do that in this case though, because lookups only take the RCU
read lock on the tree.
Other threads can still look up the xarray (also under the RCU read
lock) and get a pointer to the already-freed memory.

Chris

2024-01-19 22:08:53

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: zswap.c: add xarray tree to zswap

On Fri, Jan 19, 2024 at 2:05 PM Chris Li <[email protected]> wrote:
>
> Hi Yosry,
>
> On Fri, Jan 19, 2024 at 1:41 PM Yosry Ahmed <[email protected]> wrote:
> > > if your code is naturally structured so that you delete the entries
> > > after freeing them, you have no reason to call xa_destroy().
> >
> > Thanks for elaborating. Based on this, I believe doing xas_for_each()
> > { zswap_free_entry(); }; xa_destroy(); is both closer to the current
> > code structure and more efficient.
> >
> Can't do that in this case though. Because you get the RCU read lock
> on the tree.
> Other threads can still lookup the xarray (also RCU read lock) and get
> a pointer to the already freed memory.

During swapoff no one should be trying to swapin or swapout, where
would the lookups come from?