2023-06-07 20:05:19

by Yosry Ahmed

Subject: [PATCH v2 1/2] mm: zswap: support exclusive loads

Commit 71024cb4a0bf ("frontswap: remove frontswap_tmem_exclusive_gets")
removed support for exclusive loads from frontswap as it was not used.
Bring back exclusive loads support to frontswap by adding an "exclusive"
output parameter to frontswap_ops->load.

On the zswap side, add a module parameter to enable/disable exclusive
loads, and a config option to control the boot default value.
Refactor zswap entry invalidation in zswap_frontswap_invalidate_page()
into zswap_invalidate_entry() to reuse it in zswap_frontswap_load() if
exclusive loads are enabled.

With exclusive loads, we avoid having two copies of the same page in
memory (compressed & uncompressed) after faulting it in from zswap. On
the other hand, if the page is to be reclaimed again without being
dirtied, it will be re-compressed. Compression is not usually slow, and
a page that was just faulted in is less likely to be reclaimed again
soon.

Suggested-by: Yu Zhao <[email protected]>
Signed-off-by: Yosry Ahmed <[email protected]>
---

v1 -> v2:
- Add a module parameter to control whether exclusive loads are enabled
or not; the config option now controls the default boot value instead.
Replaced frontswap_ops->exclusive_loads with an output parameter to
frontswap_ops->load() (Johannes Weiner).
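
Usage note (not part of the patch, inferred from the module_param_named()
added below): with zswap built in, the new parameter should be settable at
boot via "zswap.exclusive_loads=Y" on the kernel command line, and at
runtime (it is 0644) via /sys/module/zswap/parameters/exclusive_loads,
analogous to the existing 'zswap.enabled=' option mentioned in the Kconfig
help text.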

---
include/linux/frontswap.h | 2 +-
mm/Kconfig | 16 ++++++++++++++++
mm/frontswap.c | 10 ++++++++--
mm/zswap.c | 28 ++++++++++++++++++++--------
4 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index a631bac12220..eaa0ac5f9003 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -10,7 +10,7 @@
struct frontswap_ops {
void (*init)(unsigned); /* this swap type was just swapon'ed */
int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
- int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
+ int (*load)(unsigned, pgoff_t, struct page *, bool *); /* load a page */
void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
};
diff --git a/mm/Kconfig b/mm/Kconfig
index 7672a22647b4..12f32f8d26bf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -46,6 +46,22 @@ config ZSWAP_DEFAULT_ON
The selection made here can be overridden by using the kernel
command line 'zswap.enabled=' option.

+config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON
+ bool "Invalidate zswap entries when pages are loaded"
+ depends on ZSWAP
+ help
+ If selected, exclusive loads for zswap will be enabled at boot,
+ otherwise it will be disabled.
+
+ If exclusive loads are enabled, when a page is loaded from zswap,
+ the zswap entry is invalidated at once, as opposed to leaving it
+ in zswap until the swap entry is freed.
+
+ This avoids having two copies of the same page in memory
+ (compressed and uncompressed) after faulting in a page from zswap.
+ The cost is that if the page was never dirtied and needs to be
+ swapped out again, it will be re-compressed.
+
choice
prompt "Default compressor"
depends on ZSWAP
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 279e55b4ed87..2fb5df3384b8 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -206,6 +206,7 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ bool exclusive = false;

VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@@ -215,9 +216,14 @@ int __frontswap_load(struct page *page)
return -1;

/* Try loading from each implementation, until one succeeds. */
- ret = frontswap_ops->load(type, offset, page);
- if (ret == 0)
+ ret = frontswap_ops->load(type, offset, page, &exclusive);
+ if (ret == 0) {
inc_frontswap_loads();
+ if (exclusive) {
+ SetPageDirty(page);
+ __frontswap_clear(sis, offset);
+ }
+ }
return ret;
}

diff --git a/mm/zswap.c b/mm/zswap.c
index 59da2a415fbb..bfbcedce9c89 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -137,6 +137,10 @@ static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
bool, 0644);

+static bool zswap_exclusive_loads_enabled = IS_ENABLED(
+ CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
+module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
+
/*********************************
* data structures
**********************************/
@@ -1329,12 +1333,22 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}

+static void zswap_invalidate_entry(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ /* remove from rbtree */
+ zswap_rb_erase(&tree->rbroot, entry);
+
+ /* drop the initial reference from entry creation */
+ zswap_entry_put(tree, entry);
+}
+
/*
* returns 0 if the page was successfully decompressed
* return -1 on entry not found or error
*/
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
- struct page *page)
+ struct page *page, bool *exclusive)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
@@ -1404,6 +1418,10 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
freeentry:
spin_lock(&tree->lock);
zswap_entry_put(tree, entry);
+ if (!ret && zswap_exclusive_loads_enabled) {
+ zswap_invalidate_entry(tree, entry);
+ *exclusive = true;
+ }
spin_unlock(&tree->lock);

return ret;
@@ -1423,13 +1441,7 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
spin_unlock(&tree->lock);
return;
}
-
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, entry);
-
- /* drop the initial reference from entry creation */
- zswap_entry_put(tree, entry);
-
+ zswap_invalidate_entry(tree, entry);
spin_unlock(&tree->lock);
}
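
A brief note on the __frontswap_load() hunk above (reasoning not spelled out
in the commit message): when the backend reports an exclusive load, the page
is marked dirty with SetPageDirty() because the backend no longer holds a
copy, so a later reclaim must write the page out again rather than assume a
valid copy still exists; __frontswap_clear() likewise clears the frontswap
map bit for that offset since the backend has dropped the entry.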

--
2.41.0.162.gfafddb0af9-goog



2023-06-07 21:41:28

by Johannes Weiner

Subject: Re: [PATCH v2 1/2] mm: zswap: support exclusive loads

On Wed, Jun 07, 2023 at 07:51:43PM +0000, Yosry Ahmed wrote:
> Commit 71024cb4a0bf ("frontswap: remove frontswap_tmem_exclusive_gets")
> removed support for exclusive loads from frontswap as it was not used.
> Bring back exclusive loads support to frontswap by adding an "exclusive"
> output parameter to frontswap_ops->load.
>
> On the zswap side, add a module parameter to enable/disable exclusive
> loads, and a config option to control the boot default value.
> Refactor zswap entry invalidation in zswap_frontswap_invalidate_page()
> into zswap_invalidate_entry() to reuse it in zswap_frontswap_load() if
> exclusive loads are enabled.
>
> With exclusive loads, we avoid having two copies of the same page in
> memory (compressed & uncompressed) after faulting it in from zswap. On
> the other hand, if the page is to be reclaimed again without being
> dirtied, it will be re-compressed. Compression is not usually slow, and
> a page that was just faulted in is less likely to be reclaimed again
> soon.
>
> Suggested-by: Yu Zhao <[email protected]>
> Signed-off-by: Yosry Ahmed <[email protected]>

Thanks!

Acked-by: Johannes Weiner <[email protected]>

2023-06-21 07:19:07

by Hyeonggon Yoo

Subject: [BUG mm-unstable] "kernel BUG at mm/swap.c:393!" on commit b9c91c43412f2e

On Wed, Jun 07, 2023 at 07:51:43PM +0000, Yosry Ahmed wrote:
> Commit 71024cb4a0bf ("frontswap: remove frontswap_tmem_exclusive_gets")
> removed support for exclusive loads from frontswap as it was not used.
> Bring back exclusive loads support to frontswap by adding an "exclusive"
> output parameter to frontswap_ops->load.
>
> On the zswap side, add a module parameter to enable/disable exclusive
> loads, and a config option to control the boot default value.
> Refactor zswap entry invalidation in zswap_frontswap_invalidate_page()
> into zswap_invalidate_entry() to reuse it in zswap_frontswap_load() if
> exclusive loads are enabled.
>
> With exclusive loads, we avoid having two copies of the same page in
> memory (compressed & uncompressed) after faulting it in from zswap. On
> the other hand, if the page is to be reclaimed again without being
> dirtied, it will be re-compressed. Compression is not usually slow, and
> a page that was just faulted in is less likely to be reclaimed again
> soon.
>
> Suggested-by: Yu Zhao <[email protected]>
> Signed-off-by: Yosry Ahmed <[email protected]>
> ---
>
> v1 -> v2:
> - Add a module parameter to control whether exclusive loads are enabled
> or not, the config option now controls the default boot value instead.
> Replaced frontswap_ops->exclusive_loads by an output parameter to
> frontswap_ops->load() (Johannes Weiner).
> ---

Hi Yosry, I was testing the latest mm-unstable and encountered a bug.
It was bisectable and this is the first bad commit.

Attached config file and bisect log.
The oops message is available at:

https://social.kernel.org/media/eace06d71655b3cc76411366573e4a8ce240ad65b8fd20977d7c73eec9dc2253.jpg

(the head commit is b9c91c43412f2e07 "mm: zswap: support exclusive loads")
(it's an image because I tested it on a real machine)


This is what I have as swap space:

$ cat /proc/swaps
Filename                                Type            Size            Used    Priority
/var/swap                               file            134217724       0       -2
/dev/zram0                              partition       8388604         0       100


Attachments:
.config (262.68 kB)
bisect-log.txt (2.68 kB)

2023-06-21 07:25:56

by Hyeonggon Yoo

Subject: Re: [BUG mm-unstable] "kernel BUG at mm/swap.c:393!" on commit b9c91c43412f2e

On Wed, Jun 21, 2023 at 4:01 PM Hyeonggon Yoo <[email protected]> wrote:
>
> On Wed, Jun 07, 2023 at 07:51:43PM +0000, Yosry Ahmed wrote:
> > Commit 71024cb4a0bf ("frontswap: remove frontswap_tmem_exclusive_gets")
> > removed support for exclusive loads from frontswap as it was not used.
> > Bring back exclusive loads support to frontswap by adding an "exclusive"
> > output parameter to frontswap_ops->load.
> >
> > On the zswap side, add a module parameter to enable/disable exclusive
> > loads, and a config option to control the boot default value.
> > Refactor zswap entry invalidation in zswap_frontswap_invalidate_page()
> > into zswap_invalidate_entry() to reuse it in zswap_frontswap_load() if
> > exclusive loads are enabled.
> >
> > With exclusive loads, we avoid having two copies of the same page in
> > memory (compressed & uncompressed) after faulting it in from zswap. On
> > the other hand, if the page is to be reclaimed again without being
> > dirtied, it will be re-compressed. Compression is not usually slow, and
> > a page that was just faulted in is less likely to be reclaimed again
> > soon.
> >
> > Suggested-by: Yu Zhao <[email protected]>
> > Signed-off-by: Yosry Ahmed <[email protected]>
> > ---
> >
> > v1 -> v2:
> > - Add a module parameter to control whether exclusive loads are enabled
> > or not, the config option now controls the default boot value instead.
> > Replaced frontswap_ops->exclusive_loads by an output parameter to
> > frontswap_ops->load() (Johannes Weiner).
> > ---
>
> Hi Yosry, I was testing the latest mm-unstable and encountered a bug.
> It was bisectable and this is the first bad commit.

Oh, I forgot to mention how I was able to reproduce this:

$ stress-ng --bigheap 12
(my box has 24 cores)

>
> Attached config file and bisect log.
> The oops message is available at:
>
> https://social.kernel.org/media/eace06d71655b3cc76411366573e4a8ce240ad65b8fd20977d7c73eec9dc2253.jpg
>
> (the head commit is b9c91c43412f2e07 "mm: zswap: support exclusive loads")
> (it's an image because I tested it on real machine)
>
>
> This is what I have as swap space:
>
> $ cat /proc/swaps
> Filename Type Size Used Priority
> /var/swap file 134217724 0 -2
> /dev/zram0 partition 8388604 0 100

2023-06-21 08:32:39

by Yosry Ahmed

Subject: Re: [BUG mm-unstable] "kernel BUG at mm/swap.c:393!" on commit b9c91c43412f2e

On Wed, Jun 21, 2023 at 12:01 AM Hyeonggon Yoo <[email protected]> wrote:
>
> On Wed, Jun 07, 2023 at 07:51:43PM +0000, Yosry Ahmed wrote:
> > Commit 71024cb4a0bf ("frontswap: remove frontswap_tmem_exclusive_gets")
> > removed support for exclusive loads from frontswap as it was not used.
> > Bring back exclusive loads support to frontswap by adding an "exclusive"
> > output parameter to frontswap_ops->load.
> >
> > On the zswap side, add a module parameter to enable/disable exclusive
> > loads, and a config option to control the boot default value.
> > Refactor zswap entry invalidation in zswap_frontswap_invalidate_page()
> > into zswap_invalidate_entry() to reuse it in zswap_frontswap_load() if
> > exclusive loads are enabled.
> >
> > With exclusive loads, we avoid having two copies of the same page in
> > memory (compressed & uncompressed) after faulting it in from zswap. On
> > the other hand, if the page is to be reclaimed again without being
> > dirtied, it will be re-compressed. Compression is not usually slow, and
> > a page that was just faulted in is less likely to be reclaimed again
> > soon.
> >
> > Suggested-by: Yu Zhao <[email protected]>
> > Signed-off-by: Yosry Ahmed <[email protected]>
> > ---
> >
> > v1 -> v2:
> > - Add a module parameter to control whether exclusive loads are enabled
> > or not, the config option now controls the default boot value instead.
> > Replaced frontswap_ops->exclusive_loads by an output parameter to
> > frontswap_ops->load() (Johannes Weiner).
> > ---
>
> Hi Yosry, I was testing the latest mm-unstable and encountered a bug.
> It was bisectable and this is the first bad commit.
>
>
> Attached config file and bisect log.
> The oops message is available at:
>
> https://social.kernel.org/media/eace06d71655b3cc76411366573e4a8ce240ad65b8fd20977d7c73eec9dc2253.jpg
>
> (the head commit is b9c91c43412f2e07 "mm: zswap: support exclusive loads")
> (it's an image because I tested it on real machine)
>
>
> This is what I have as swap space:
>
> $ cat /proc/swaps
> Filename Type Size Used Priority
> /var/swap file 134217724 0 -2
> /dev/zram0 partition 8388604 0 100


Hi Hyeonggon,

Thanks for reporting this! I think I know what went wrong. Could you
please verify whether the fix below works, if possible?

Domenico, I believe the below fix would also fix a problem with the
recent writeback series. If the entry is invalidated before we grab the
lock to put the local ref in zswap_frontswap_load(), then the entry
will be freed once we call zswap_entry_put(), and the movement to the
beginning of the LRU will be operating on a freed entry. It also modifies
your recently added commit 418fd29d9de5 ("mm: zswap: invaldiate entry
after writeback"). I would appreciate it if you also take a look.

If this works as intended, I can send a formal patch (applies on top
of fd247f029cd0 ("mm/gup: do not return 0 from pin_user_pages_fast()
for bad args")):

From 4b7f949b3ffb42d969d525d5b576fad474f55276 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <[email protected]>
Date: Wed, 21 Jun 2023 07:43:51 +0000
Subject: [PATCH] mm: zswap: fix double invalidate with exclusive loads

If exclusive loads are enabled for zswap, we invalidate the entry before
returning from zswap_frontswap_load(), after dropping the local
reference. However, the tree lock is dropped during decompression after
the local reference is acquired, so the entry could be invalidated
before we drop the local ref. If this happens, the entry is freed once
we drop the local ref, and zswap_invalidate_entry() tries to invalidate
an already freed entry.

Fix this by:
(a) Making sure zswap_invalidate_entry() is always called with a local
ref held, to avoid being called on a freed entry.
(b) Making sure zswap_invalidate_entry() only drops the ref if the entry
was actually on the rbtree. Otherwise, another invalidation could
have already happened, and the initial ref is already dropped.

With these changes, there is no need to check that the entry still
exists in the tree in zswap_reclaim_entry() before invalidating it, as
zswap_invalidate_entry() now makes this check internally.
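
For illustration, a minimal sketch of the intended ordering on the load
path (not taken from the diff below; it assumes the existing
zswap_entry_find_get()/zswap_entry_put() helpers and ignores the
exclusive_loads and error checks for brevity):

	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset); /* local ref */
	spin_unlock(&tree->lock);

	/*
	 * Decompress with the tree lock dropped; a concurrent invalidation
	 * may erase the entry from the rbtree and drop the initial ref here.
	 */

	spin_lock(&tree->lock);
	zswap_invalidate_entry(tree, entry); /* no-op if already erased */
	zswap_entry_put(tree, entry);        /* drop the local ref last */
	spin_unlock(&tree->lock);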

Fixes: b9c91c43412f ("mm: zswap: support exclusive loads")
Reported-by: Hyeonggon Yoo <[email protected]>
Signed-off-by: Yosry Ahmed <[email protected]>
---
mm/zswap.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 87b204233115..62195f72bf56 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -355,12 +355,14 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
return 0;
}

-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
if (!RB_EMPTY_NODE(&entry->rbnode)) {
rb_erase(&entry->rbnode, root);
RB_CLEAR_NODE(&entry->rbnode);
+ return true;
}
+ return false;
}

/*
@@ -599,14 +601,16 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}

+/*
+ * If the entry is still valid in the tree, drop the initial ref and remove it
+ * from the tree. This function must be called with an additional ref held,
+ * otherwise it may race with another invalidation freeing the entry.
+ */
static void zswap_invalidate_entry(struct zswap_tree *tree,
struct zswap_entry *entry)
{
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, entry);
-
- /* drop the initial reference from entry creation */
- zswap_entry_put(tree, entry);
+ if (zswap_rb_erase(&tree->rbroot, entry))
+ zswap_entry_put(tree, entry);
}

static int zswap_reclaim_entry(struct zswap_pool *pool)
@@ -659,8 +663,7 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
* swapcache. Drop the entry from zswap - unless invalidate already
* took it out while we had the tree->lock released for IO.
*/
- if (entry == zswap_rb_search(&tree->rbroot, swpoffset))
- zswap_invalidate_entry(tree, entry);
+ zswap_invalidate_entry(tree, entry);

put_unlock:
/* Drop local reference */
@@ -1466,7 +1469,6 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
if (!ret && zswap_exclusive_loads_enabled) {
zswap_invalidate_entry(tree, entry);
*exclusive = true;
@@ -1475,6 +1477,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
list_move(&entry->lru, &entry->pool->lru);
spin_unlock(&entry->pool->lru_lock);
}
+ zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);

return ret;
--
2.41.0.162.gfafddb0af9-goog

2023-06-21 09:20:49

by Hyeonggon Yoo

Subject: Re: [BUG mm-unstable] "kernel BUG at mm/swap.c:393!" on commit b9c91c43412f2e

On Wed, Jun 21, 2023 at 01:05:56AM -0700, Yosry Ahmed wrote:
> On Wed, Jun 21, 2023 at 12:01 AM Hyeonggon Yoo <[email protected]> wrote:
> > Hi Yosry, I was testing the latest mm-unstable and encountered a bug.
> > It was bisectable and this is the first bad commit.
> >
> >
> > Attached config file and bisect log.
> > The oops message is available at:
> >
> > https://social.kernel.org/media/eace06d71655b3cc76411366573e4a8ce240ad65b8fd20977d7c73eec9dc2253.jpg
> >
> > (the head commit is b9c91c43412f2e07 "mm: zswap: support exclusive loads")
> > (it's an image because I tested it on real machine)
> >
> >
> > This is what I have as swap space:
> >
> > $ cat /proc/swaps
> > Filename Type Size Used Priority
> > /var/swap file 134217724 0 -2
> > /dev/zram0 partition 8388604 0 100
>
>
> Hi Hyeonggon,
>
> Thanks for reporting this! I think I know what went wrong. Could you
> please verify if the below fix works if possible?
>

Works fine and I was not able to reproduce the bug with the patch
applied.

Not sure whether Andrew would prefer squashing it into the original one or
applying it as a separate patch, though (I'm totally fine with either way).

Anyway:

Tested-by: Hyeonggon Yoo <[email protected]>

> Domenico, I believe the below fix would also fix a problem with the
> recent writeback series. If the entry is invalidated before we grab the
> lock to put the local ref in zswap_frontswap_load(), then the entry
> will be freed once we call zswap_entry_put(), and the movement to the
> beginning LRU will be operating on a freed entry. It also modifies
> your recently added commit 418fd29d9de5 ("mm: zswap: invaldiate entry
> after writeback"). I would appreciate it if you also take a look.
>
> If this works as intended, I can send a formal patch (applies on top
> of fd247f029cd0 ("mm/gup: do not return 0 from pin_user_pages_fast()
> for bad args")):
>
> From 4b7f949b3ffb42d969d525d5b576fad474f55276 Mon Sep 17 00:00:00 2001
> From: Yosry Ahmed <[email protected]>
> Date: Wed, 21 Jun 2023 07:43:51 +0000
> Subject: [PATCH] mm: zswap: fix double invalidate with exclusive loads
>
> If exclusive loads are enabled for zswap, we invalidate the entry before
> returning from zswap_frontswap_load(), after dropping the local
> reference. However, the tree lock is dropped during decompression after
> the local reference is acquired, so the entry could be invalidated
> before we drop the local ref. If this happens, the entry is freed once
> we drop the local ref, and zswap_invalidate_entry() tries to invalidate
> an already freed entry.
>
> Fix this by:
> (a) Making sure zswap_invalidate_entry() is always called with a local
> ref held, to avoid being called on a freed entry.
> (b) Making sure zswap_invalidate_entry() only drops the ref if the entry
> was actually on the rbtree. Otherwise, another invalidation could
> have already happened, and the initial ref is already dropped.
>
> With these changes, there is no need to check that there is no need to
> make sure the entry still exists in the tree in zswap_reclaim_entry()
> before invalidating it, as zswap_reclaim_entry() will make this check
> internally.
>
> Fixes: b9c91c43412f ("mm: zswap: support exclusive loads")
> Reported-by: Hyeonggon Yoo <[email protected]>
> Signed-off-by: Yosry Ahmed <[email protected]>

<...snip...>

--
Hyeonggon Yoo

Undergraduate | Chungnam National University

2023-06-21 09:25:00

by Yosry Ahmed

Subject: Re: [BUG mm-unstable] "kernel BUG at mm/swap.c:393!" on commit b9c91c43412f2e

On Wed, Jun 21, 2023 at 2:06 AM Hyeonggon Yoo <[email protected]> wrote:
>
> On Wed, Jun 21, 2023 at 01:05:56AM -0700, Yosry Ahmed wrote:
> > On Wed, Jun 21, 2023 at 12:01 AM Hyeonggon Yoo <[email protected]> wrote:
> > > Hi Yosry, I was testing the latest mm-unstable and encountered a bug.
> > > It was bisectable and this is the first bad commit.
> > >
> > >
> > > Attached config file and bisect log.
> > > The oops message is available at:
> > >
> > > https://social.kernel.org/media/eace06d71655b3cc76411366573e4a8ce240ad65b8fd20977d7c73eec9dc2253.jpg
> > >
> > > (the head commit is b9c91c43412f2e07 "mm: zswap: support exclusive loads")
> > > (it's an image because I tested it on real machine)
> > >
> > >
> > > This is what I have as swap space:
> > >
> > > $ cat /proc/swaps
> > > Filename Type Size Used Priority
> > > /var/swap file 134217724 0 -2
> > > /dev/zram0 partition 8388604 0 100
> >
> >
> > Hi Hyeonggon,
> >
> > Thanks for reporting this! I think I know what went wrong. Could you
> > please verify if the below fix works if possible?
> >
>
> Works fine and I was not able to reproduce the bug with the patch
> applied.
>
> Not sure Andrew would prefer squashing it into original one or applying it
> as separate patch, though (I'm totally fine with both way).

I think it already landed in mm-stable so it cannot be squashed at this point.

>
> Anyway:
>
> Tested-by: Hyeonggon Yoo <[email protected]>

Thanks a lot for reporting and testing this!

I will wait for Domenico to also respond then send the fix to Andrew.
Hopefully it's not too late for this rc.

>
> > Domenico, I believe the below fix would also fix a problem with the
> > recent writeback series. If the entry is invalidated before we grab the
> > lock to put the local ref in zswap_frontswap_load(), then the entry
> > will be freed once we call zswap_entry_put(), and the movement to the
> > beginning LRU will be operating on a freed entry. It also modifies
> > your recently added commit 418fd29d9de5 ("mm: zswap: invaldiate entry
> > after writeback"). I would appreciate it if you also take a look.
> >
> > If this works as intended, I can send a formal patch (applies on top
> > of fd247f029cd0 ("mm/gup: do not return 0 from pin_user_pages_fast()
> > for bad args")):
> >
> > From 4b7f949b3ffb42d969d525d5b576fad474f55276 Mon Sep 17 00:00:00 2001
> > From: Yosry Ahmed <[email protected]>
> > Date: Wed, 21 Jun 2023 07:43:51 +0000
> > Subject: [PATCH] mm: zswap: fix double invalidate with exclusive loads
> >
> > If exclusive loads are enabled for zswap, we invalidate the entry before
> > returning from zswap_frontswap_load(), after dropping the local
> > reference. However, the tree lock is dropped during decompression after
> > the local reference is acquired, so the entry could be invalidated
> > before we drop the local ref. If this happens, the entry is freed once
> > we drop the local ref, and zswap_invalidate_entry() tries to invalidate
> > an already freed entry.
> >
> > Fix this by:
> > (a) Making sure zswap_invalidate_entry() is always called with a local
> > ref held, to avoid being called on a freed entry.
> > (b) Making sure zswap_invalidate_entry() only drops the ref if the entry
> > was actually on the rbtree. Otherwise, another invalidation could
> > have already happened, and the initial ref is already dropped.
> >
> > With these changes, there is no need to check that there is no need to
> > make sure the entry still exists in the tree in zswap_reclaim_entry()
> > before invalidating it, as zswap_reclaim_entry() will make this check
> > internally.
> >
> > Fixes: b9c91c43412f ("mm: zswap: support exclusive loads")
> > Reported-by: Hyeonggon Yoo <[email protected]>
> > Signed-off-by: Yosry Ahmed <[email protected]>
>
> <...snip...>
>
> --
> Hyeonggon Yoo
>
> Undergraduate | Chungnam National University

2023-06-21 09:31:56

by Yosry Ahmed

Subject: Re: [BUG mm-unstable] "kernel BUG at mm/swap.c:393!" on commit b9c91c43412f2e

On Wed, Jun 21, 2023 at 2:19 AM Domenico Cerasuolo
<[email protected]> wrote:
>
> On Wed, Jun 21, 2023 at 10:06 AM Yosry Ahmed <[email protected]> wrote:
> >
> > On Wed, Jun 21, 2023 at 12:01 AM Hyeonggon Yoo <[email protected]> wrote:
> > >
> > > On Wed, Jun 07, 2023 at 07:51:43PM +0000, Yosry Ahmed wrote:
> > > > Commit 71024cb4a0bf ("frontswap: remove frontswap_tmem_exclusive_gets")
> > > > removed support for exclusive loads from frontswap as it was not used.
> > > > Bring back exclusive loads support to frontswap by adding an "exclusive"
> > > > output parameter to frontswap_ops->load.
> > > >
> > > > On the zswap side, add a module parameter to enable/disable exclusive
> > > > loads, and a config option to control the boot default value.
> > > > Refactor zswap entry invalidation in zswap_frontswap_invalidate_page()
> > > > into zswap_invalidate_entry() to reuse it in zswap_frontswap_load() if
> > > > exclusive loads are enabled.
> > > >
> > > > With exclusive loads, we avoid having two copies of the same page in
> > > > memory (compressed & uncompressed) after faulting it in from zswap. On
> > > > the other hand, if the page is to be reclaimed again without being
> > > > dirtied, it will be re-compressed. Compression is not usually slow, and
> > > > a page that was just faulted in is less likely to be reclaimed again
> > > > soon.
> > > >
> > > > Suggested-by: Yu Zhao <[email protected]>
> > > > Signed-off-by: Yosry Ahmed <[email protected]>
> > > > ---
> > > >
> > > > v1 -> v2:
> > > > - Add a module parameter to control whether exclusive loads are enabled
> > > > or not, the config option now controls the default boot value instead.
> > > > Replaced frontswap_ops->exclusive_loads by an output parameter to
> > > > frontswap_ops->load() (Johannes Weiner).
> > > > ---
> > >
> > > Hi Yosry, I was testing the latest mm-unstable and encountered a bug.
> > > It was bisectable and this is the first bad commit.
> > >
> > >
> > > Attached config file and bisect log.
> > > The oops message is available at:
> > >
> > > https://social.kernel.org/media/eace06d71655b3cc76411366573e4a8ce240ad65b8fd20977d7c73eec9dc2253.jpg
> > >
> > > (the head commit is b9c91c43412f2e07 "mm: zswap: support exclusive loads")
> > > (it's an image because I tested it on real machine)
> > >
> > >
> > > This is what I have as swap space:
> > >
> > > $ cat /proc/swaps
> > > Filename Type Size Used Priority
> > > /var/swap file 134217724 0 -2
> > > /dev/zram0 partition 8388604 0 100
> >
> >
> > Hi Hyeonggon,
> >
> > Thanks for reporting this! I think I know what went wrong. Could you
> > please verify if the below fix works if possible?
> >
> > Domenico, I believe the below fix would also fix a problem with the
> > recent writeback series. If the entry is invalidated before we grab the
> > lock to put the local ref in zswap_frontswap_load(), then the entry
> > will be freed once we call zswap_entry_put(), and the movement to the
> > beginning LRU will be operating on a freed entry. It also modifies
> > your recently added commit 418fd29d9de5 ("mm: zswap: invaldiate entry
> > after writeback"). I would appreciate it if you also take a look.
>
> Hi Yosry,
>
> Thanks, this makes sense indeed. I've been running a stress test too for
> an hour now and it seems fine.

Thanks! I will send the patch to Andrew then!

>
> >
> > If this works as intended, I can send a formal patch (applies on top
> > of fd247f029cd0 ("mm/gup: do not return 0 from pin_user_pages_fast()
> > for bad args")):
> >
> > <...snip...>

2023-06-21 09:33:30

by Domenico Cerasuolo

Subject: Re: [BUG mm-unstable] "kernel BUG at mm/swap.c:393!" on commit b9c91c43412f2e

On Wed, Jun 21, 2023 at 10:06 AM Yosry Ahmed <[email protected]> wrote:
>
> On Wed, Jun 21, 2023 at 12:01 AM Hyeonggon Yoo <[email protected]> wrote:
> >
> > On Wed, Jun 07, 2023 at 07:51:43PM +0000, Yosry Ahmed wrote:
> > > Commit 71024cb4a0bf ("frontswap: remove frontswap_tmem_exclusive_gets")
> > > removed support for exclusive loads from frontswap as it was not used.
> > > Bring back exclusive loads support to frontswap by adding an "exclusive"
> > > output parameter to frontswap_ops->load.
> > >
> > > On the zswap side, add a module parameter to enable/disable exclusive
> > > loads, and a config option to control the boot default value.
> > > Refactor zswap entry invalidation in zswap_frontswap_invalidate_page()
> > > into zswap_invalidate_entry() to reuse it in zswap_frontswap_load() if
> > > exclusive loads are enabled.
> > >
> > > With exclusive loads, we avoid having two copies of the same page in
> > > memory (compressed & uncompressed) after faulting it in from zswap. On
> > > the other hand, if the page is to be reclaimed again without being
> > > dirtied, it will be re-compressed. Compression is not usually slow, and
> > > a page that was just faulted in is less likely to be reclaimed again
> > > soon.
> > >
> > > Suggested-by: Yu Zhao <[email protected]>
> > > Signed-off-by: Yosry Ahmed <[email protected]>
> > > ---
> > >
> > > v1 -> v2:
> > > - Add a module parameter to control whether exclusive loads are enabled
> > > or not, the config option now controls the default boot value instead.
> > > Replaced frontswap_ops->exclusive_loads by an output parameter to
> > > frontswap_ops->load() (Johannes Weiner).
> > > ---
> >
> > Hi Yosry, I was testing the latest mm-unstable and encountered a bug.
> > It was bisectable and this is the first bad commit.
> >
> >
> > Attached config file and bisect log.
> > The oops message is available at:
> >
> > https://social.kernel.org/media/eace06d71655b3cc76411366573e4a8ce240ad65b8fd20977d7c73eec9dc2253.jpg
> >
> > (the head commit is b9c91c43412f2e07 "mm: zswap: support exclusive loads")
> > (it's an image because I tested it on real machine)
> >
> >
> > This is what I have as swap space:
> >
> > $ cat /proc/swaps
> > Filename Type Size Used Priority
> > /var/swap file 134217724 0 -2
> > /dev/zram0 partition 8388604 0 100
>
>
> Hi Hyeonggon,
>
> Thanks for reporting this! I think I know what went wrong. Could you
> please verify if the below fix works if possible?
>
> Domenico, I believe the below fix would also fix a problem with the
> recent writeback series. If the entry is invalidated before we grab the
> lock to put the local ref in zswap_frontswap_load(), then the entry
> will be freed once we call zswap_entry_put(), and the movement to the
> beginning LRU will be operating on a freed entry. It also modifies
> your recently added commit 418fd29d9de5 ("mm: zswap: invaldiate entry
> after writeback"). I would appreciate it if you also take a look.

Hi Yosry,

Thanks, this makes sense indeed. I've been running a stress test too for
an hour now and it seems fine.

>
> If this works as intended, I can send a formal patch (applies on top
> of fd247f029cd0 ("mm/gup: do not return 0 from pin_user_pages_fast()
> for bad args")):
>
> <...snip...>