Date: 2023-06-05 09:18:26
From: Domenico Cerasuolo
Subject: [PATCH 0/7] mm: zswap: move writeback LRU from zpool to zswap

This series aims to improve the zswap reclaim mechanism by reorganizing
the LRU management. In the current implementation, the LRU is maintained
within each zpool driver, resulting in duplicated code across the three
drivers. The proposed change moves the LRU management from the
individual zpool implementations up to the zswap layer.

The primary objective of this refactoring effort is to simplify the
codebase. By unifying the reclaim loop and consolidating LRU handling
within zswap, we can eliminate redundant code and improve
maintainability. Additionally, this change enables the reclamation of
stored pages in their actual LRU order. Presently, the zpool drivers
link backing pages in an LRU, so compressed pages that happen to share a
backing page are written back together even though they may sit at very
different positions in the LRU.
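
In rough terms, the LRU linkage moves from the backends' backing pages to
the zswap entries themselves. A simplified sketch of the new layout, with
the field names taken from patch 1 below and everything else elided:

struct zswap_pool {
	/* ... existing fields ... */
	struct list_head lru;	/* pool-wide LRU of compressed entries */
	spinlock_t lock;	/* protects lru */
};

struct zswap_entry {
	/* ... existing fields ... */
	struct list_head lru;	/* links the entry into its pool's LRU */
};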

The series is organized as follows: the first patch implements the LRU
and the reclaim loop in zswap, but they are not used yet because all
three driver implementations are still marked as zpool_evictable.
The following three commits modify each zpool driver so that it is no
longer zpool_evictable, enabling the reclaim loop in zswap.
With the drivers' shrink functions removed, the zpool interface is then
trimmed by dropping zpool_evictable, zpool_ops, and zpool_shrink.
Finally, the code in zswap is further cleaned up by simplifying the
writeback function and removing the now unnecessary zswap_header.

Based on mm-stable + commit 399ab221f3ff
("mm: zswap: shrink until can accept") currently in mm-unstable.

Domenico Cerasuolo (7):
mm: zswap: add pool shrinking mechanism
mm: zswap: remove page reclaim logic from zbud
mm: zswap: remove page reclaim logic from z3fold
mm: zswap: remove page reclaim logic from zsmalloc
mm: zswap: remove shrink from zpool interface
mm: zswap: simplify writeback function
mm: zswap: remove zswap_header

 include/linux/zpool.h |  19 +--
 mm/z3fold.c           | 249 +----------------------------------
 mm/zbud.c             | 167 +-----------------------
 mm/zpool.c            |  48 +------
 mm/zsmalloc.c         | 294 +-----------------------------------------
 mm/zswap.c            | 159 +++++++++++++----------
 6 files changed, 104 insertions(+), 832 deletions(-)

--
2.34.1



Date: 2023-06-05 09:19:13
From: Domenico Cerasuolo
Subject: [RFC PATCH 1/7] mm: zswap: add pool shrinking mechanism

Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
function, which is called from zpool_shrink. However, with this commit,
a unified shrink function is added to zswap. The ultimate goal is to
eliminate the need for zpool_shrink once all zpool implementations have
dropped their shrink code.

To ensure the functionality of each commit, this change focuses solely
on adding the mechanism itself. No modifications are made to
the backends, meaning that functionally, there are no immediate changes.
The zswap mechanism will only come into effect once the backends have
removed their shrink code. The subsequent commits will address the
modifications needed in the backends.

Signed-off-by: Domenico Cerasuolo <[email protected]>
---
mm/zswap.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 78 insertions(+), 5 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index bcb82e09eb64..80d7780bf066 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -159,6 +159,8 @@ struct zswap_pool {
struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
+ struct list_head lru;
+ spinlock_t lock;
};

/*
@@ -176,10 +178,12 @@ struct zswap_pool {
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
- * decompression. For a same value filled page length is 0.
+ * decompression. For a same value filled page length is 0, and both
+ * pool and lru are invalid and must be ignored.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
+ * lru - handle to the pool's lru used to evict pages.
*/
struct zswap_entry {
struct rb_node rbnode;
@@ -192,6 +196,7 @@ struct zswap_entry {
unsigned long value;
};
struct obj_cgroup *objcg;
+ struct list_head lru;
};

struct zswap_header {
@@ -364,6 +369,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
if (!entry->length)
atomic_dec(&zswap_same_filled_pages);
else {
+ spin_lock(&entry->pool->lock);
+ list_del_init(&entry->lru);
+ spin_unlock(&entry->pool->lock);
zpool_free(entry->pool->zpool, entry->handle);
zswap_pool_put(entry->pool);
}
@@ -584,14 +592,65 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}

+static int zswap_shrink(struct zswap_pool *pool)
+{
+ struct zswap_entry *lru_entry, *tree_entry = NULL;
+ struct zswap_header *zhdr;
+ struct zswap_tree *tree;
+ swp_entry_t swpentry;
+ int ret;
+
+ /* get a reclaimable entry from LRU */
+ spin_lock(&pool->lock);
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+ lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
+ list_del_init(&lru_entry->lru);
+ zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
+ tree = zswap_trees[swp_type(zhdr->swpentry)];
+ zpool_unmap_handle(pool->zpool, lru_entry->handle);
+ swpentry = zhdr->swpentry;
+ spin_unlock(&pool->lock);
+
+ /* hold a reference from tree so it won't be freed during writeback */
+ spin_lock(&tree->lock);
+ tree_entry = zswap_entry_find_get(&tree->rbroot, swp_offset(swpentry));
+ if (tree_entry != lru_entry) {
+ if (tree_entry)
+ zswap_entry_put(tree, tree_entry);
+ spin_unlock(&tree->lock);
+ return -EAGAIN;
+ }
+ spin_unlock(&tree->lock);
+
+ ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
+
+ spin_lock(&tree->lock);
+ if (ret) {
+ spin_lock(&pool->lock);
+ list_move(&lru_entry->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ }
+ zswap_entry_put(tree, tree_entry);
+ spin_unlock(&tree->lock);
+
+ return ret ? -EAGAIN : 0;
+}
+
static void shrink_worker(struct work_struct *w)
{
struct zswap_pool *pool = container_of(w, typeof(*pool),
shrink_work);
int ret, failures = 0;

+ /* zpool_evictable will be removed once all 3 backends have migrated*/
do {
- ret = zpool_shrink(pool->zpool, 1, NULL);
+ if (zpool_evictable(pool->zpool))
+ ret = zpool_shrink(pool->zpool, 1, NULL);
+ else
+ ret = zswap_shrink(pool);
if (ret) {
zswap_reject_reclaim_fail++;
if (ret != -EAGAIN)
@@ -655,6 +714,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
+ INIT_LIST_HEAD(&pool->lru);
+ spin_lock_init(&pool->lock);
INIT_WORK(&pool->shrink_work, shrink_worker);

zswap_pool_debug("created", pool);
@@ -1270,7 +1331,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}

/* store */
- hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
+ hlen = sizeof(zhdr);
gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
if (zpool_malloc_support_movable(entry->pool->zpool))
gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
@@ -1313,6 +1374,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_entry_put(tree, dupentry);
}
} while (ret == -EEXIST);
+ INIT_LIST_HEAD(&entry->lru);
+ /* zpool_evictable will be removed once all 3 backends have migrated*/
+ if (entry->length && !zpool_evictable(entry->pool->zpool)) {
+ spin_lock(&entry->pool->lock);
+ list_add(&entry->lru, &entry->pool->lru);
+ spin_unlock(&entry->pool->lock);
+ }
spin_unlock(&tree->lock);

/* update stats */
@@ -1384,8 +1452,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
/* decompress */
dlen = PAGE_SIZE;
src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
- if (zpool_evictable(entry->pool->zpool))
- src += sizeof(struct zswap_header);
+ src += sizeof(struct zswap_header);

if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
memcpy(tmp, src, entry->length);
@@ -1415,6 +1482,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
freeentry:
spin_lock(&tree->lock);
zswap_entry_put(tree, entry);
+ /* zpool_evictable will be removed once all 3 backends have migrated*/
+ if (entry->length && !zpool_evictable(entry->pool->zpool)) {
+ spin_lock(&entry->pool->lock);
+ list_move(&entry->lru, &entry->pool->lru);
+ spin_unlock(&entry->pool->lock);
+ }
spin_unlock(&tree->lock);

return ret;
--
2.34.1


Date: 2023-06-05 15:39:26
From: Johannes Weiner
Subject: Re: [RFC PATCH 1/7] mm: zswap: add pool shrinking mechanism

Hi Domenico,

On Mon, Jun 05, 2023 at 10:54:13AM +0200, Domenico Cerasuolo wrote:
> @@ -364,6 +369,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
> if (!entry->length)
> atomic_dec(&zswap_same_filled_pages);
> else {
> + spin_lock(&entry->pool->lock);
> + list_del_init(&entry->lru);
> + spin_unlock(&entry->pool->lock);

This should be list_del(), as the entry is freed right after this and
the list isn't reused anymore.

The slab memory is recycled, but the allocation site (commented on
below) doesn't need the list initialized.

However, I think it also needs to check !zpool_evictable(). If
alloc/store doesn't do the list_add(), this would be a double delete.
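
Something along these lines maybe, combining both points (untested
sketch, not meant as the literal v2 hunk):

	if (!entry->length)
		atomic_dec(&zswap_same_filled_pages);
	else {
		/* zpool_evictable will be removed once all 3 backends have migrated */
		if (!zpool_evictable(entry->pool->zpool)) {
			spin_lock(&entry->pool->lock);
			list_del(&entry->lru);
			spin_unlock(&entry->pool->lock);
		}
		zpool_free(entry->pool->zpool, entry->handle);
		zswap_pool_put(entry->pool);
	}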

> @@ -584,14 +592,65 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static int zswap_shrink(struct zswap_pool *pool)
> +{
> + struct zswap_entry *lru_entry, *tree_entry = NULL;
> + struct zswap_header *zhdr;
> + struct zswap_tree *tree;
> + swp_entry_t swpentry;
> + int ret;
> +
> + /* get a reclaimable entry from LRU */
> + spin_lock(&pool->lock);
> + if (list_empty(&pool->lru)) {
> + spin_unlock(&pool->lock);
> + return -EINVAL;
> + }
> + lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> + list_del_init(&lru_entry->lru);
> + zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> + tree = zswap_trees[swp_type(zhdr->swpentry)];
> + zpool_unmap_handle(pool->zpool, lru_entry->handle);
> + swpentry = zhdr->swpentry;
> + spin_unlock(&pool->lock);

Once the pool lock is dropped, the lru_entry might get freed. But the
swpentry is copied to the stack, and lru_entry isn't deref'd again
until the entry is verified to still be alive in the tree.

This could use a comment.
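
For example, right after the pool lock is dropped (the wording is only a
suggestion):

	swpentry = zhdr->swpentry;
	spin_unlock(&pool->lock);

	/*
	 * Once the pool lock is dropped, lru_entry may be freed by a
	 * concurrent invalidation. Only the on-stack copy of swpentry
	 * is used from here on, until the tree lookup below confirms
	 * the entry is still alive and takes a reference on it.
	 */
	spin_lock(&tree->lock);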

> + /* hold a reference from tree so it won't be freed during writeback */
> + spin_lock(&tree->lock);
> + tree_entry = zswap_entry_find_get(&tree->rbroot, swp_offset(swpentry));
> + if (tree_entry != lru_entry) {
> + if (tree_entry)
> + zswap_entry_put(tree, tree_entry);
> + spin_unlock(&tree->lock);
> + return -EAGAIN;
> + }
> + spin_unlock(&tree->lock);

It's pretty outrageous how much simpler this is compared to the
<backend>_reclaim_page() functions! The backends have to jump through
a lot of hoops to serialize against freeing, whereas zswap can simply
hold a reference. This is clearly a much better design.

> + ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +
> + spin_lock(&tree->lock);
> + if (ret) {
> + spin_lock(&pool->lock);
> + list_move(&lru_entry->lru, &pool->lru);
> + spin_unlock(&pool->lock);
> + }
> + zswap_entry_put(tree, tree_entry);
> + spin_unlock(&tree->lock);
> +
> + return ret ? -EAGAIN : 0;
> +}
> +
> static void shrink_worker(struct work_struct *w)
> {
> struct zswap_pool *pool = container_of(w, typeof(*pool),
> shrink_work);
> int ret, failures = 0;
>
> + /* zpool_evictable will be removed once all 3 backends have migrated*/

Missing space between text and */

> do {
> - ret = zpool_shrink(pool->zpool, 1, NULL);
> + if (zpool_evictable(pool->zpool))
> + ret = zpool_shrink(pool->zpool, 1, NULL);
> + else
> + ret = zswap_shrink(pool);
> if (ret) {
> zswap_reject_reclaim_fail++;
> if (ret != -EAGAIN)
> @@ -655,6 +714,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> */
> kref_init(&pool->kref);
> INIT_LIST_HEAD(&pool->list);
> + INIT_LIST_HEAD(&pool->lru);
> + spin_lock_init(&pool->lock);
> INIT_WORK(&pool->shrink_work, shrink_worker);
>
> zswap_pool_debug("created", pool);
> @@ -1270,7 +1331,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> }
>
> /* store */
> - hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> + hlen = sizeof(zhdr);
> gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> if (zpool_malloc_support_movable(entry->pool->zpool))
> gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> @@ -1313,6 +1374,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> zswap_entry_put(tree, dupentry);
> }
> } while (ret == -EEXIST);
> + INIT_LIST_HEAD(&entry->lru);

The list_add() below initializes the entry, so this shouldn't be
needed.

> + /* zpool_evictable will be removed once all 3 backends have migrated*/
> + if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> + spin_lock(&entry->pool->lock);
> + list_add(&entry->lru, &entry->pool->lru);
> + spin_unlock(&entry->pool->lock);
> + }
> spin_unlock(&tree->lock);
>
> /* update stats */

Date: 2023-06-06 02:48:41
From: Yosry Ahmed
Subject: Re: [RFC PATCH 1/7] mm: zswap: add pool shrinking mechanism

Hi Domenico,

On Mon, Jun 5, 2023 at 1:54 AM Domenico Cerasuolo
<[email protected]> wrote:
>
> Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
> function, which is called from zpool_shrink. However, with this commit,
> a unified shrink function is added to zswap. The ultimate goal is to
> eliminate the need for zpool_shrink once all zpool implementations have
> dropped their shrink code.
>
> To ensure the functionality of each commit, this change focuses solely
> on adding the mechanism itself. No modifications are made to
> the backends, meaning that functionally, there are no immediate changes.
> The zswap mechanism will only come into effect once the backends have
> removed their shrink code. The subsequent commits will address the
> modifications needed in the backends.
>
> Signed-off-by: Domenico Cerasuolo <[email protected]>
> ---
> mm/zswap.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 78 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index bcb82e09eb64..80d7780bf066 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -159,6 +159,8 @@ struct zswap_pool {
> struct work_struct shrink_work;
> struct hlist_node node;
> char tfm_name[CRYPTO_MAX_ALG_NAME];
> + struct list_head lru;
> + spinlock_t lock;

If this lock is only protecting the lru then I believe it's better to
call it lru_lock to make it explicit.
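
i.e. something like:

	struct list_head lru;
	spinlock_t lru_lock;	/* protects lru */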

> };
>
> /*
> @@ -176,10 +178,12 @@ struct zswap_pool {
> * be held while changing the refcount. Since the lock must
> * be held, there is no reason to also make refcount atomic.
> * length - the length in bytes of the compressed page data. Needed during
> - * decompression. For a same value filled page length is 0.
> + * decompression. For a same value filled page length is 0, and both
> + * pool and lru are invalid and must be ignored.
> * pool - the zswap_pool the entry's data is in
> * handle - zpool allocation handle that stores the compressed page data
> * value - value of the same-value filled pages which have same content
> + * lru - handle to the pool's lru used to evict pages.
> */
> struct zswap_entry {
> struct rb_node rbnode;
> @@ -192,6 +196,7 @@ struct zswap_entry {
> unsigned long value;
> };
> struct obj_cgroup *objcg;
> + struct list_head lru;
> };
>
> struct zswap_header {
> @@ -364,6 +369,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
> if (!entry->length)
> atomic_dec(&zswap_same_filled_pages);
> else {
> + spin_lock(&entry->pool->lock);
> + list_del_init(&entry->lru);
> + spin_unlock(&entry->pool->lock);

I think we should document the lock ordering somewhere (tree lock ->
lru lock), otherwise we may run into an ABBA deadlock down the road.
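
For example, a comment next to the lock definitions could spell that out
(the wording here is only a sketch):

	/*
	 * Lock ordering: when both locks are needed, tree->lock must be
	 * taken before pool->lock (the LRU lock). The LRU lock is only
	 * held for list manipulation and is never held while acquiring
	 * the tree lock.
	 */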

> zpool_free(entry->pool->zpool, entry->handle);
> zswap_pool_put(entry->pool);
> }
> @@ -584,14 +592,65 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static int zswap_shrink(struct zswap_pool *pool)
> +{
> + struct zswap_entry *lru_entry, *tree_entry = NULL;
> + struct zswap_header *zhdr;
> + struct zswap_tree *tree;
> + swp_entry_t swpentry;
> + int ret;
> +
> + /* get a reclaimable entry from LRU */
> + spin_lock(&pool->lock);
> + if (list_empty(&pool->lru)) {
> + spin_unlock(&pool->lock);
> + return -EINVAL;
> + }
> + lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> + list_del_init(&lru_entry->lru);
> + zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> + tree = zswap_trees[swp_type(zhdr->swpentry)];
> + zpool_unmap_handle(pool->zpool, lru_entry->handle);
> + swpentry = zhdr->swpentry;
> + spin_unlock(&pool->lock);
> +
> + /* hold a reference from tree so it won't be freed during writeback */
> + spin_lock(&tree->lock);
> + tree_entry = zswap_entry_find_get(&tree->rbroot, swp_offset(swpentry));
> + if (tree_entry != lru_entry) {
> + if (tree_entry)
> + zswap_entry_put(tree, tree_entry);
> + spin_unlock(&tree->lock);
> + return -EAGAIN;
> + }
> + spin_unlock(&tree->lock);
> +
> + ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +
> + spin_lock(&tree->lock);
> + if (ret) {
> + spin_lock(&pool->lock);
> + list_move(&lru_entry->lru, &pool->lru);
> + spin_unlock(&pool->lock);
> + }
> + zswap_entry_put(tree, tree_entry);
> + spin_unlock(&tree->lock);
> +
> + return ret ? -EAGAIN : 0;
> +}
> +
> static void shrink_worker(struct work_struct *w)
> {
> struct zswap_pool *pool = container_of(w, typeof(*pool),
> shrink_work);
> int ret, failures = 0;
>
> + /* zpool_evictable will be removed once all 3 backends have migrated*/
> do {
> - ret = zpool_shrink(pool->zpool, 1, NULL);
> + if (zpool_evictable(pool->zpool))
> + ret = zpool_shrink(pool->zpool, 1, NULL);
> + else
> + ret = zswap_shrink(pool);
> if (ret) {
> zswap_reject_reclaim_fail++;
> if (ret != -EAGAIN)
> @@ -655,6 +714,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> */
> kref_init(&pool->kref);
> INIT_LIST_HEAD(&pool->list);
> + INIT_LIST_HEAD(&pool->lru);
> + spin_lock_init(&pool->lock);
> INIT_WORK(&pool->shrink_work, shrink_worker);
>
> zswap_pool_debug("created", pool);
> @@ -1270,7 +1331,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> }
>
> /* store */
> - hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> + hlen = sizeof(zhdr);
> gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> if (zpool_malloc_support_movable(entry->pool->zpool))
> gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> @@ -1313,6 +1374,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> zswap_entry_put(tree, dupentry);
> }
> } while (ret == -EEXIST);
> + INIT_LIST_HEAD(&entry->lru);
> + /* zpool_evictable will be removed once all 3 backends have migrated*/
> + if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> + spin_lock(&entry->pool->lock);
> + list_add(&entry->lru, &entry->pool->lru);
> + spin_unlock(&entry->pool->lock);
> + }
> spin_unlock(&tree->lock);
>
> /* update stats */
> @@ -1384,8 +1452,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> /* decompress */
> dlen = PAGE_SIZE;
> src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> - if (zpool_evictable(entry->pool->zpool))
> - src += sizeof(struct zswap_header);
> + src += sizeof(struct zswap_header);
>
> if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> memcpy(tmp, src, entry->length);
> @@ -1415,6 +1482,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> freeentry:
> spin_lock(&tree->lock);
> zswap_entry_put(tree, entry);
> + /* zpool_evictable will be removed once all 3 backends have migrated*/
> + if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> + spin_lock(&entry->pool->lock);
> + list_move(&entry->lru, &entry->pool->lru);
> + spin_unlock(&entry->pool->lock);
> + }
> spin_unlock(&tree->lock);
>
> return ret;
> --
> 2.34.1
>

Date: 2023-06-06 09:49:58
From: Domenico Cerasuolo
Subject: Re: [RFC PATCH 1/7] mm: zswap: add pool shrinking mechanism

On Mon, Jun 5, 2023 at 5:29 PM Johannes Weiner <[email protected]> wrote:
>
> Hi Domenico,
>
> On Mon, Jun 05, 2023 at 10:54:13AM +0200, Domenico Cerasuolo wrote:
> > @@ -364,6 +369,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
> > if (!entry->length)
> > atomic_dec(&zswap_same_filled_pages);
> > else {
> > + spin_lock(&entry->pool->lock);
> > + list_del_init(&entry->lru);
> > + spin_unlock(&entry->pool->lock);
>
> This should be list_del(), as the entry is freed right after this and
> the list isn't reused anymore.
>
> The slab memory is recycled, but the allocation site (commented on
> below) doesn't need the list initialized.
>
> However, I think it also needs to check !zpool_evictable(). If
> alloc/store doesn't do the list_add(), this would be a double delete.
>

Thanks Johannes, I'm addressing all of your comments on the series in a v2.


Date: 2023-06-06 10:27:47
From: Domenico Cerasuolo
Subject: Re: [RFC PATCH 1/7] mm: zswap: add pool shrinking mechanism

On Tue, Jun 6, 2023 at 4:19 AM Yosry Ahmed <[email protected]> wrote:
>
> Hi Domenico,
>
> On Mon, Jun 5, 2023 at 1:54 AM Domenico Cerasuolo
> <[email protected]> wrote:
> >
> > Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
> > function, which is called from zpool_shrink. However, with this commit,
> > a unified shrink function is added to zswap. The ultimate goal is to
> > eliminate the need for zpool_shrink once all zpool implementations have
> > dropped their shrink code.
> >
> > To ensure the functionality of each commit, this change focuses solely
> > on adding the mechanism itself. No modifications are made to
> > the backends, meaning that functionally, there are no immediate changes.
> > The zswap mechanism will only come into effect once the backends have
> > removed their shrink code. The subsequent commits will address the
> > modifications needed in the backends.
> >
> > Signed-off-by: Domenico Cerasuolo <[email protected]>
> > ---
> > mm/zswap.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 78 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index bcb82e09eb64..80d7780bf066 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -159,6 +159,8 @@ struct zswap_pool {
> > struct work_struct shrink_work;
> > struct hlist_node node;
> > char tfm_name[CRYPTO_MAX_ALG_NAME];
> > + struct list_head lru;
> > + spinlock_t lock;
>
> If this lock is only protecting the lru then I believe it's better to
> call it lru_lock to make it explicit.

Hi Yosry,

thanks for the input, it makes sense to call it lru_lock, will update.

>
> > };
> >
> > /*
> > @@ -176,10 +178,12 @@ struct zswap_pool {
> > * be held while changing the refcount. Since the lock must
> > * be held, there is no reason to also make refcount atomic.
> > * length - the length in bytes of the compressed page data. Needed during
> > - * decompression. For a same value filled page length is 0.
> > + * decompression. For a same value filled page length is 0, and both
> > + * pool and lru are invalid and must be ignored.
> > * pool - the zswap_pool the entry's data is in
> > * handle - zpool allocation handle that stores the compressed page data
> > * value - value of the same-value filled pages which have same content
> > + * lru - handle to the pool's lru used to evict pages.
> > */
> > struct zswap_entry {
> > struct rb_node rbnode;
> > @@ -192,6 +196,7 @@ struct zswap_entry {
> > unsigned long value;
> > };
> > struct obj_cgroup *objcg;
> > + struct list_head lru;
> > };
> >
> > struct zswap_header {
> > @@ -364,6 +369,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
> > if (!entry->length)
> > atomic_dec(&zswap_same_filled_pages);
> > else {
> > + spin_lock(&entry->pool->lock);
> > + list_del_init(&entry->lru);
> > + spin_unlock(&entry->pool->lock);
>
> I think we should document the lock ordering somewhere (tree lock ->
> lru lock), otherwise we may run into an ABBA deadlock down the road.

Will update in the next iteration.
