2023-07-27 16:42:57

by Johannes Weiner

Subject: [PATCH 0/3] mm: zswap: three cleanups

Three small cleanups to zswap, the first one suggested by Yosry during
the frontswap removal.

mm/zswap.c | 150 ++++++++++++++++++++++-------------------------------------
1 file changed, 56 insertions(+), 94 deletions(-)




2023-07-27 17:19:57

by Johannes Weiner

Subject: [PATCH 3/3] mm: zswap: kill zswap_get_swap_cache_page()

The __read_swap_cache_async() interface isn't more difficult to
understand than what the helper abstracts. Save the indirection and a
level of indentation for the primary work of the writeback func.
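
For orientation, here is how the raw interface's two outputs map onto the
helper's three return values; this is a condensed sketch of the new call
site in the diff below, with the goto-based error handling simplified to
plain returns:

	bool page_was_allocated;
	struct page *page;

	page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
				       &page_was_allocated);
	if (!page)			/* was ZSWAP_SWAPCACHE_FAIL */
		return -ENOMEM;
	if (!page_was_allocated) {	/* was ZSWAP_SWAPCACHE_EXIST */
		put_page(page);
		return -EEXIST;
	}
	/* was ZSWAP_SWAPCACHE_NEW: page just added to swapcache, locked */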

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/zswap.c | 142 ++++++++++++++++++++---------------------------------
1 file changed, 53 insertions(+), 89 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index e34ac89e6098..bba4ead672be 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1016,43 +1016,6 @@ static int zswap_enabled_param_set(const char *val,
/*********************************
* writeback code
**********************************/
-/* return enum for zswap_get_swap_cache_page */
-enum zswap_get_swap_ret {
- ZSWAP_SWAPCACHE_NEW,
- ZSWAP_SWAPCACHE_EXIST,
- ZSWAP_SWAPCACHE_FAIL,
-};
-
-/*
- * zswap_get_swap_cache_page
- *
- * This is an adaption of read_swap_cache_async()
- *
- * This function tries to find a page with the given swap entry
- * in the swapper_space address space (the swap cache). If the page
- * is found, it is returned in retpage. Otherwise, a page is allocated,
- * added to the swap cache, and returned in retpage.
- *
- * If success, the swap cache page is returned in retpage
- * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
- * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
- * the new page is added to swapcache and locked
- * Returns ZSWAP_SWAPCACHE_FAIL on error
- */
-static int zswap_get_swap_cache_page(swp_entry_t entry,
- struct page **retpage)
-{
- bool page_was_allocated;
-
- *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
- NULL, 0, &page_was_allocated);
- if (page_was_allocated)
- return ZSWAP_SWAPCACHE_NEW;
- if (!*retpage)
- return ZSWAP_SWAPCACHE_FAIL;
- return ZSWAP_SWAPCACHE_EXIST;
-}
-
/*
* Attempts to free an entry by adding a page to the swap cache,
* decompressing the entry data into the page, and issuing a
@@ -1073,7 +1036,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
struct zpool *pool = entry->pool->zpool;
-
+ bool page_was_allocated;
u8 *src, *tmp = NULL;
unsigned int dlen;
int ret;
@@ -1088,65 +1051,66 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
}

/* try to allocate swap cache page */
- switch (zswap_get_swap_cache_page(swpentry, &page)) {
- case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+ page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
+ &page_was_allocated);
+ if (!page) {
ret = -ENOMEM;
goto fail;
+ }

- case ZSWAP_SWAPCACHE_EXIST:
- /* page is already in the swap cache, ignore for now */
+ /* Found an existing page, we raced with load/swapin */
+ if (!page_was_allocated) {
put_page(page);
ret = -EEXIST;
goto fail;
+ }

- case ZSWAP_SWAPCACHE_NEW: /* page is locked */
- /*
- * Having a local reference to the zswap entry doesn't exclude
- * swapping from invalidating and recycling the swap slot. Once
- * the swapcache is secured against concurrent swapping to and
- * from the slot, recheck that the entry is still current before
- * writing.
- */
- spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
- spin_unlock(&tree->lock);
- delete_from_swap_cache(page_folio(page));
- ret = -ENOMEM;
- goto fail;
- }
+ /*
+ * Page is locked, and the swapcache is now secured against
+ * concurrent swapping to and from the slot. Verify that the
+ * swap entry hasn't been invalidated and recycled behind our
+ * backs (our zswap_entry reference doesn't prevent that), to
+ * avoid overwriting a new swap page with old compressed data.
+ */
+ spin_lock(&tree->lock);
+ if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
+ delete_from_swap_cache(page_folio(page));
+ ret = -ENOMEM;
+ goto fail;
+ }
+ spin_unlock(&tree->lock);

- /* decompress */
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- dlen = PAGE_SIZE;
+ /* decompress */
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+ dlen = PAGE_SIZE;

- src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
- if (!zpool_can_sleep_mapped(pool)) {
- memcpy(tmp, src, entry->length);
- src = tmp;
- zpool_unmap_handle(pool, entry->handle);
- }
+ src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
+ if (!zpool_can_sleep_mapped(pool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(pool, entry->handle);
+ }

- mutex_lock(acomp_ctx->mutex);
- sg_init_one(&input, src, entry->length);
- sg_init_table(&output, 1);
- sg_set_page(&output, page, PAGE_SIZE, 0);
- acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
- ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
- mutex_unlock(acomp_ctx->mutex);
-
- if (!zpool_can_sleep_mapped(pool))
- kfree(tmp);
- else
- zpool_unmap_handle(pool, entry->handle);
+ mutex_lock(acomp_ctx->mutex);
+ sg_init_one(&input, src, entry->length);
+ sg_init_table(&output, 1);
+ sg_set_page(&output, page, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+ ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+ mutex_unlock(acomp_ctx->mutex);
+
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
+ else
+ zpool_unmap_handle(pool, entry->handle);

- BUG_ON(ret);
- BUG_ON(dlen != PAGE_SIZE);
+ BUG_ON(ret);
+ BUG_ON(dlen != PAGE_SIZE);

- /* page is up to date */
- SetPageUptodate(page);
- }
+ /* page is up to date */
+ SetPageUptodate(page);

/* move it to the tail of the inactive list after end_writeback */
SetPageReclaim(page);
@@ -1157,16 +1121,16 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
zswap_written_back_pages++;

return ret;
+
fail:
if (!zpool_can_sleep_mapped(pool))
kfree(tmp);

/*
- * if we get here due to ZSWAP_SWAPCACHE_EXIST
- * a load may be happening concurrently.
- * it is safe and okay to not free the entry.
- * it is also okay to return !0
- */
+ * If we get here because the page is already in swapcache, a
+ * load may be happening concurrently. It is safe and okay to
+ * not free the entry. It is also okay to return !0.
+ */
return ret;
}

--
2.41.0


2023-07-27 17:33:57

by Johannes Weiner

Subject: [PATCH 1/3] mm: zswap: use zswap_invalidate_entry() for duplicates

Minor cleanup. Instead of open-coding the tree deletion and the put,
use the zswap_invalidate_entry() convenience helper.
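
For reference, the helper bundles exactly those two steps; a rough sketch
of zswap_invalidate_entry() as it already exists in mm/zswap.c (not
changed by this patch):

static void zswap_invalidate_entry(struct zswap_tree *tree,
				   struct zswap_entry *entry)
{
	/* remove from rbtree */
	zswap_rb_erase(&tree->rbroot, entry);
	/* drop the initial reference from entry creation */
	zswap_entry_put(tree, entry);
}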

Suggested-by: Yosry Ahmed <[email protected]>
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/zswap.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 583ef7b84dc3..e123b1c7981c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1344,9 +1344,7 @@ bool zswap_store(struct page *page)
spin_lock(&tree->lock);
while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
zswap_duplicate_entry++;
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, dupentry);
- zswap_entry_put(tree, dupentry);
+ zswap_invalidate_entry(tree, dupentry);
}
if (entry->length) {
spin_lock(&entry->pool->lru_lock);
--
2.41.0


2023-07-27 17:34:16

by Johannes Weiner

Subject: [PATCH 2/3] mm: zswap: tighten up entry invalidation

Removing a zswap entry from the tree is tied to an explicit operation
that's supposed to drop the base reference: swap invalidation,
exclusive load, duplicate store. Don't silently remove the entry on
final put; instead, warn if an entry is still in the tree without a reference.

While in that diff context, convert a BUG_ON to a WARN_ON_ONCE. No
need to crash on a refcount underflow.
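
The WARN_ON_ONCE(!RB_EMPTY_NODE(...)) check relies on zswap_rb_erase()
clearing the rb node once an entry has been taken off the tree. Roughly,
the existing function in mm/zswap.c (not changed by this patch) looks
like:

static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		/* mark the entry as no longer in the tree */
		RB_CLEAR_NODE(&entry->rbnode);
	}
}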

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/zswap.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index e123b1c7981c..e34ac89e6098 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -402,9 +402,9 @@ static void zswap_entry_put(struct zswap_tree *tree,
{
int refcount = --entry->refcount;

- BUG_ON(refcount < 0);
+ WARN_ON_ONCE(refcount < 0);
if (refcount == 0) {
- zswap_rb_erase(&tree->rbroot, entry);
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
zswap_free_entry(entry);
}
}
--
2.41.0


2023-07-27 18:31:33

by Yosry Ahmed

Subject: Re: [PATCH 2/3] mm: zswap: tighten up entry invalidation

On Thu, Jul 27, 2023 at 9:23 AM Johannes Weiner <[email protected]> wrote:
>
> Removing a zswap entry from the tree is tied to an explicit operation
> that's supposed to drop the base reference: swap invalidation,
> exclusive load, duplicate store. Don't silently remove the entry on
> final put; instead, warn if an entry is still in the tree without a reference.
>
> While in that diff context, convert a BUG_ON to a WARN_ON_ONCE. No
> need to crash on a refcount underflow.
>
> Signed-off-by: Johannes Weiner <[email protected]>

I have always found it confusing that we explicitly remove the zswap
entry from the tree in the contexts you mentioned, yet we also have
zswap_rb_erase() called in zswap_entry_put(). In fact, I think in some
contexts this leads to zswap_rb_erase() being called unnecessarily
twice on the same entry (e.g. once from invalidation, then once again
when an outstanding local ref is dropped). It's probably harmless with
the current implementation, but such a design can easily go wrong.
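
To illustrate, a hypothetical interleaving (a sketch, not actual kernel
code) of the redundant second erase before this patch:

/*
 * invalidation path                writeback path (holds a local ref)
 * -----------------                ----------------------------------
 * zswap_invalidate_entry()
 *   zswap_rb_erase(tree, entry)    <- entry leaves the tree here
 *   zswap_entry_put(tree, entry)   refcount: 2 -> 1
 *                                  zswap_entry_put(tree, entry)
 *                                    refcount: 1 -> 0
 *                                    zswap_rb_erase(tree, entry)  <- second, redundant erase
 *                                    zswap_free_entry(entry)
 */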

Thanks for the cleanup; it would be interesting to see whether this
warning ever actually fires.

Reviewed-by: Yosry Ahmed <[email protected]>


2023-07-27 19:14:41

by Yosry Ahmed

Subject: Re: [PATCH 1/3] mm: zswap: use zswap_invalidate_entry() for duplicates

On Thu, Jul 27, 2023 at 9:23 AM Johannes Weiner <[email protected]> wrote:
>
> Minor cleanup. Instead of open-coding the tree deletion and the put,
> use the zswap_invalidate_entry() convenience helper.
>
> Suggested-by: Yosry Ahmed <[email protected]>
> Signed-off-by: Johannes Weiner <[email protected]>

Reviewed-by: Yosry Ahmed <[email protected]>

Thanks!


2023-07-27 19:32:32

by Yosry Ahmed

Subject: Re: [PATCH 3/3] mm: zswap: kill zswap_get_swap_cache_page()

On Thu, Jul 27, 2023 at 9:23 AM Johannes Weiner <[email protected]> wrote:
>
> The __read_swap_cache_async() interface isn't more difficult to
> understand than what the helper abstracts. Save the indirection and a
> level of indentation for the primary work of the writeback func.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Arguably, any abstraction over the swap code is better than dealing
with the swap code directly, but I am not against this :P

The diffstat looks nice and the code looks correct to me.

Reviewed-by: Yosry Ahmed <[email protected]>
