2021-04-05 18:00:57

by Miaohe Lin

[permalink] [raw]
Subject: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

frontswap_register_ops can race with swapon. Consider the following scene:

CPU1 CPU2
---- ----
frontswap_register_ops
fill bitmap a
ops->init
sys_swapon
enable_swap_info
frontswap_init without new ops
add ops to frontswap_ops list
check if swap_active_head changed
add to swap_active_head

So the frontswap_ops init is missed on the new swap device. Consider the
another scene:
CPU1 CPU2
---- ----
frontswap_register_ops
fill bitmap a
ops->init
add ops to frontswap_ops list
sys_swapon
enable_swap_info
frontswap_init with new ops
add to swap_active_head
check if swap_active_head changed
ops->init for new swap device [twice!]

The frontswap_ops init will be called two times on the new swap device this
time. frontswap_register_ops can also race with swapoff. Consider the
following scene:

CPU1 CPU2
---- ----
sys_swapoff
removed from swap_active_head
frontswap_register_ops
fill bitmap a
ops->init without swap device
add ops to frontswap_ops list
invalidate_area with new ops
check if swap_active_head changed

We could call invalidate_area on a swap device under swapoff with frontswap
is uninitialized yet. Fix all these by using swapon_mutex to guard against
race with swapon and add swap_info_get_if_under_swapoff() to collect swap
devices under swapoff.

Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
Signed-off-by: Miaohe Lin <[email protected]>
---
include/linux/swapfile.h | 2 ++
mm/frontswap.c | 40 +++++++++++++++++-----------------------
mm/swapfile.c | 13 ++++++++++++-
3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e06febf62978..7ae15d917828 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -9,8 +9,10 @@
extern spinlock_t swap_lock;
extern struct plist_head swap_active_head;
extern struct swap_info_struct *swap_info[];
+extern struct mutex swapon_mutex;
extern int try_to_unuse(unsigned int, bool, unsigned long);
extern unsigned long generic_max_swapfile_size(void);
extern unsigned long max_swapfile_size(void);
+extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);

#endif /* _LINUX_SWAPFILE_H */
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 130e301c5ac0..c16bfc7550b5 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)

bitmap_zero(a, MAX_SWAPFILES);
bitmap_zero(b, MAX_SWAPFILES);
-
+ mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
plist_for_each_entry(si, &swap_active_head, list) {
if (!WARN_ON(!si->frontswap_map))
set_bit(si->type, a);
}
+ /*
+ * There might be some swap devices under swapoff, i.e. they are
+ * removed from swap_active_head but frontswap_invalidate_area()
+ * is not called yet due to swapon_mutex is held here. We must
+ * collect these swap devices and call ops->init on them or they
+ * might invalidate frontswap area while frontswap is uninitialized.
+ */
+ for_each_clear_bit(i, a, MAX_SWAPFILES) {
+ si = swap_info_get_if_under_swapoff(i);
+ if (!si || !si->frontswap_map)
+ continue;
+ set_bit(si->type, b);
+ }
+ bitmap_or(a, a, b, MAX_SWAPFILES);
spin_unlock(&swap_lock);

/* the new ops needs to know the currently active swap devices */
@@ -144,29 +158,9 @@ void frontswap_register_ops(struct frontswap_ops *ops)
ops->next = frontswap_ops;
} while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);

- static_branch_inc(&frontswap_enabled_key);
-
- spin_lock(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list) {
- if (si->frontswap_map)
- set_bit(si->type, b);
- }
- spin_unlock(&swap_lock);
+ mutex_unlock(&swapon_mutex);

- /*
- * On the very unlikely chance that a swap device was added or
- * removed between setting the "a" list bits and the ops init
- * calls, we re-check and do init or invalidate for any changed
- * bits.
- */
- if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
- for (i = 0; i < MAX_SWAPFILES; i++) {
- if (!test_bit(i, a) && test_bit(i, b))
- ops->init(i);
- else if (test_bit(i, a) && !test_bit(i, b))
- ops->invalidate_area(i);
- }
- }
+ static_branch_inc(&frontswap_enabled_key);
}
EXPORT_SYMBOL(frontswap_register_ops);

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..ee736533717f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -89,7 +89,7 @@ static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

-static DEFINE_MUTEX(swapon_mutex);
+DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
@@ -2958,6 +2958,17 @@ __weak unsigned long max_swapfile_size(void)
return generic_max_swapfile_size();
}

+struct swap_info_struct *swap_info_get_if_under_swapoff(int type)
+{
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
+
+ if (!si || !si->swap_map)
+ return NULL;
+ if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+ return si;
+ return NULL;
+}
+
static unsigned long read_swap_header(struct swap_info_struct *p,
union swap_header *swap_header,
struct inode *inode)
--
2.19.1


2021-04-10 07:31:44

by Miaohe Lin

[permalink] [raw]
Subject: Re: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

Hi:
On 2021/4/5 18:20, Miaohe Lin wrote:
> frontswap_register_ops can race with swapon. Consider the following scene:

Any comment or suggestion? Or is this race window too theoretical to fix?
Thanks.

>
> CPU1 CPU2
> ---- ----
> frontswap_register_ops
> fill bitmap a
> ops->init
> sys_swapon
> enable_swap_info
> frontswap_init without new ops
> add ops to frontswap_ops list
> check if swap_active_head changed
> add to swap_active_head
>
> So the frontswap_ops init is missed on the new swap device. Consider the
> another scene:
> CPU1 CPU2
> ---- ----
> frontswap_register_ops
> fill bitmap a
> ops->init
> add ops to frontswap_ops list
> sys_swapon
> enable_swap_info
> frontswap_init with new ops
> add to swap_active_head
> check if swap_active_head changed
> ops->init for new swap device [twice!]
>
> The frontswap_ops init will be called two times on the new swap device this
> time. frontswap_register_ops can also race with swapoff. Consider the
> following scene:
>
> CPU1 CPU2
> ---- ----
> sys_swapoff
> removed from swap_active_head
> frontswap_register_ops
> fill bitmap a
> ops->init without swap device
> add ops to frontswap_ops list
> invalidate_area with new ops
> check if swap_active_head changed
>
> We could call invalidate_area on a swap device under swapoff with frontswap
> is uninitialized yet. Fix all these by using swapon_mutex to guard against
> race with swapon and add swap_info_get_if_under_swapoff() to collect swap
> devices under swapoff.
>
> Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
> Signed-off-by: Miaohe Lin <[email protected]>
> ---
> include/linux/swapfile.h | 2 ++
> mm/frontswap.c | 40 +++++++++++++++++-----------------------
> mm/swapfile.c | 13 ++++++++++++-
> 3 files changed, 31 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
> index e06febf62978..7ae15d917828 100644
> --- a/include/linux/swapfile.h
> +++ b/include/linux/swapfile.h
> @@ -9,8 +9,10 @@
> extern spinlock_t swap_lock;
> extern struct plist_head swap_active_head;
> extern struct swap_info_struct *swap_info[];
> +extern struct mutex swapon_mutex;
> extern int try_to_unuse(unsigned int, bool, unsigned long);
> extern unsigned long generic_max_swapfile_size(void);
> extern unsigned long max_swapfile_size(void);
> +extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);
>
> #endif /* _LINUX_SWAPFILE_H */
> diff --git a/mm/frontswap.c b/mm/frontswap.c
> index 130e301c5ac0..c16bfc7550b5 100644
> --- a/mm/frontswap.c
> +++ b/mm/frontswap.c
> @@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)
>
> bitmap_zero(a, MAX_SWAPFILES);
> bitmap_zero(b, MAX_SWAPFILES);
> -
> + mutex_lock(&swapon_mutex);
> spin_lock(&swap_lock);
> plist_for_each_entry(si, &swap_active_head, list) {
> if (!WARN_ON(!si->frontswap_map))
> set_bit(si->type, a);
> }
> + /*
> + * There might be some swap devices under swapoff, i.e. they are
> + * removed from swap_active_head but frontswap_invalidate_area()
> + * is not called yet due to swapon_mutex is held here. We must
> + * collect these swap devices and call ops->init on them or they
> + * might invalidate frontswap area while frontswap is uninitialized.
> + */
> + for_each_clear_bit(i, a, MAX_SWAPFILES) {
> + si = swap_info_get_if_under_swapoff(i);
> + if (!si || !si->frontswap_map)
> + continue;
> + set_bit(si->type, b);
> + }
> + bitmap_or(a, a, b, MAX_SWAPFILES);
> spin_unlock(&swap_lock);
>
> /* the new ops needs to know the currently active swap devices */
> @@ -144,29 +158,9 @@ void frontswap_register_ops(struct frontswap_ops *ops)
> ops->next = frontswap_ops;
> } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
>
> - static_branch_inc(&frontswap_enabled_key);
> -
> - spin_lock(&swap_lock);
> - plist_for_each_entry(si, &swap_active_head, list) {
> - if (si->frontswap_map)
> - set_bit(si->type, b);
> - }
> - spin_unlock(&swap_lock);
> + mutex_unlock(&swapon_mutex);
>
> - /*
> - * On the very unlikely chance that a swap device was added or
> - * removed between setting the "a" list bits and the ops init
> - * calls, we re-check and do init or invalidate for any changed
> - * bits.
> - */
> - if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
> - for (i = 0; i < MAX_SWAPFILES; i++) {
> - if (!test_bit(i, a) && test_bit(i, b))
> - ops->init(i);
> - else if (test_bit(i, a) && !test_bit(i, b))
> - ops->invalidate_area(i);
> - }
> - }
> + static_branch_inc(&frontswap_enabled_key);
> }
> EXPORT_SYMBOL(frontswap_register_ops);
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 149e77454e3c..ee736533717f 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -89,7 +89,7 @@ static DEFINE_SPINLOCK(swap_avail_lock);
>
> struct swap_info_struct *swap_info[MAX_SWAPFILES];
>
> -static DEFINE_MUTEX(swapon_mutex);
> +DEFINE_MUTEX(swapon_mutex);
>
> static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
> /* Activity counter to indicate that a swapon or swapoff has occurred */
> @@ -2958,6 +2958,17 @@ __weak unsigned long max_swapfile_size(void)
> return generic_max_swapfile_size();
> }
>
> +struct swap_info_struct *swap_info_get_if_under_swapoff(int type)
> +{
> + struct swap_info_struct *si = swap_type_to_swap_info(type);
> +
> + if (!si || !si->swap_map)
> + return NULL;
> + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
> + return si;
> + return NULL;
> +}
> +
> static unsigned long read_swap_header(struct swap_info_struct *p,
> union swap_header *swap_header,
> struct inode *inode)
>

2021-04-10 10:44:57

by Yu Zhao

[permalink] [raw]
Subject: Re: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

On Sat, Apr 10, 2021 at 1:30 AM Miaohe Lin <[email protected]> wrote:
>
> Hi:
> On 2021/4/5 18:20, Miaohe Lin wrote:
> > frontswap_register_ops can race with swapon. Consider the following scene:
>
> Any comment or suggestion? Or is this race window too theoretical to fix?
> Thanks.

Let me run a stress test and get back to you (within a day or so).

> >
> > CPU1 CPU2
> > ---- ----
> > frontswap_register_ops
> > fill bitmap a
> > ops->init
> > sys_swapon
> > enable_swap_info
> > frontswap_init without new ops
> > add ops to frontswap_ops list
> > check if swap_active_head changed
> > add to swap_active_head
> >
> > So the frontswap_ops init is missed on the new swap device. Consider the
> > another scene:
> > CPU1 CPU2
> > ---- ----
> > frontswap_register_ops
> > fill bitmap a
> > ops->init
> > add ops to frontswap_ops list
> > sys_swapon
> > enable_swap_info
> > frontswap_init with new ops
> > add to swap_active_head
> > check if swap_active_head changed
> > ops->init for new swap device [twice!]
> >
> > The frontswap_ops init will be called two times on the new swap device this
> > time. frontswap_register_ops can also race with swapoff. Consider the
> > following scene:
> >
> > CPU1 CPU2
> > ---- ----
> > sys_swapoff
> > removed from swap_active_head
> > frontswap_register_ops
> > fill bitmap a
> > ops->init without swap device
> > add ops to frontswap_ops list
> > invalidate_area with new ops
> > check if swap_active_head changed
> >
> > We could call invalidate_area on a swap device under swapoff with frontswap
> > is uninitialized yet. Fix all these by using swapon_mutex to guard against
> > race with swapon and add swap_info_get_if_under_swapoff() to collect swap
> > devices under swapoff.
> >
> > Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
> > Signed-off-by: Miaohe Lin <[email protected]>
> > ---
> > include/linux/swapfile.h | 2 ++
> > mm/frontswap.c | 40 +++++++++++++++++-----------------------
> > mm/swapfile.c | 13 ++++++++++++-
> > 3 files changed, 31 insertions(+), 24 deletions(-)
> >
> > diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
> > index e06febf62978..7ae15d917828 100644
> > --- a/include/linux/swapfile.h
> > +++ b/include/linux/swapfile.h
> > @@ -9,8 +9,10 @@
> > extern spinlock_t swap_lock;
> > extern struct plist_head swap_active_head;
> > extern struct swap_info_struct *swap_info[];
> > +extern struct mutex swapon_mutex;
> > extern int try_to_unuse(unsigned int, bool, unsigned long);
> > extern unsigned long generic_max_swapfile_size(void);
> > extern unsigned long max_swapfile_size(void);
> > +extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);
> >
> > #endif /* _LINUX_SWAPFILE_H */
> > diff --git a/mm/frontswap.c b/mm/frontswap.c
> > index 130e301c5ac0..c16bfc7550b5 100644
> > --- a/mm/frontswap.c
> > +++ b/mm/frontswap.c
> > @@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)
> >
> > bitmap_zero(a, MAX_SWAPFILES);
> > bitmap_zero(b, MAX_SWAPFILES);
> > -
> > + mutex_lock(&swapon_mutex);
> > spin_lock(&swap_lock);
> > plist_for_each_entry(si, &swap_active_head, list) {
> > if (!WARN_ON(!si->frontswap_map))
> > set_bit(si->type, a);
> > }
> > + /*
> > + * There might be some swap devices under swapoff, i.e. they are
> > + * removed from swap_active_head but frontswap_invalidate_area()
> > + * is not called yet due to swapon_mutex is held here. We must
> > + * collect these swap devices and call ops->init on them or they
> > + * might invalidate frontswap area while frontswap is uninitialized.
> > + */
> > + for_each_clear_bit(i, a, MAX_SWAPFILES) {
> > + si = swap_info_get_if_under_swapoff(i);
> > + if (!si || !si->frontswap_map)
> > + continue;
> > + set_bit(si->type, b);
> > + }
> > + bitmap_or(a, a, b, MAX_SWAPFILES);
> > spin_unlock(&swap_lock);
> >
> > /* the new ops needs to know the currently active swap devices */
> > @@ -144,29 +158,9 @@ void frontswap_register_ops(struct frontswap_ops *ops)
> > ops->next = frontswap_ops;
> > } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
> >
> > - static_branch_inc(&frontswap_enabled_key);
> > -
> > - spin_lock(&swap_lock);
> > - plist_for_each_entry(si, &swap_active_head, list) {
> > - if (si->frontswap_map)
> > - set_bit(si->type, b);
> > - }
> > - spin_unlock(&swap_lock);
> > + mutex_unlock(&swapon_mutex);
> >
> > - /*
> > - * On the very unlikely chance that a swap device was added or
> > - * removed between setting the "a" list bits and the ops init
> > - * calls, we re-check and do init or invalidate for any changed
> > - * bits.
> > - */
> > - if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
> > - for (i = 0; i < MAX_SWAPFILES; i++) {
> > - if (!test_bit(i, a) && test_bit(i, b))
> > - ops->init(i);
> > - else if (test_bit(i, a) && !test_bit(i, b))
> > - ops->invalidate_area(i);
> > - }
> > - }
> > + static_branch_inc(&frontswap_enabled_key);
> > }
> > EXPORT_SYMBOL(frontswap_register_ops);
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 149e77454e3c..ee736533717f 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -89,7 +89,7 @@ static DEFINE_SPINLOCK(swap_avail_lock);
> >
> > struct swap_info_struct *swap_info[MAX_SWAPFILES];
> >
> > -static DEFINE_MUTEX(swapon_mutex);
> > +DEFINE_MUTEX(swapon_mutex);
> >
> > static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
> > /* Activity counter to indicate that a swapon or swapoff has occurred */
> > @@ -2958,6 +2958,17 @@ __weak unsigned long max_swapfile_size(void)
> > return generic_max_swapfile_size();
> > }
> >
> > +struct swap_info_struct *swap_info_get_if_under_swapoff(int type)
> > +{
> > + struct swap_info_struct *si = swap_type_to_swap_info(type);
> > +
> > + if (!si || !si->swap_map)
> > + return NULL;
> > + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
> > + return si;
> > + return NULL;
> > +}
> > +
> > static unsigned long read_swap_header(struct swap_info_struct *p,
> > union swap_header *swap_header,
> > struct inode *inode)
> >
>
>

2021-04-10 11:03:51

by Miaohe Lin

[permalink] [raw]
Subject: Re: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

On 2021/4/10 18:42, Yu Zhao wrote:
> On Sat, Apr 10, 2021 at 1:30 AM Miaohe Lin <[email protected]> wrote:
>>
>> Hi:
>> On 2021/4/5 18:20, Miaohe Lin wrote:
>>> frontswap_register_ops can race with swapon. Consider the following scene:
>>
>> Any comment or suggestion? Or is this race window too theoretical to fix?
>> Thanks.
>
> Let me run a stress test and get back to you (within a day or so).

That's very kind of you. Many thanks!

>
>>>
>>> CPU1 CPU2
>>> ---- ----
>>> frontswap_register_ops
>>> fill bitmap a
>>> ops->init
>>> sys_swapon
>>> enable_swap_info
>>> frontswap_init without new ops
>>> add ops to frontswap_ops list
>>> check if swap_active_head changed
>>> add to swap_active_head
>>>
>>> So the frontswap_ops init is missed on the new swap device. Consider the
>>> another scene:
>>> CPU1 CPU2
>>> ---- ----
>>> frontswap_register_ops
>>> fill bitmap a
>>> ops->init
>>> add ops to frontswap_ops list
>>> sys_swapon
>>> enable_swap_info
>>> frontswap_init with new ops
>>> add to swap_active_head
>>> check if swap_active_head changed
>>> ops->init for new swap device [twice!]
>>>
>>> The frontswap_ops init will be called two times on the new swap device this
>>> time. frontswap_register_ops can also race with swapoff. Consider the
>>> following scene:
>>>
>>> CPU1 CPU2
>>> ---- ----
>>> sys_swapoff
>>> removed from swap_active_head
>>> frontswap_register_ops
>>> fill bitmap a
>>> ops->init without swap device
>>> add ops to frontswap_ops list
>>> invalidate_area with new ops
>>> check if swap_active_head changed
>>>
>>> We could call invalidate_area on a swap device under swapoff with frontswap
>>> is uninitialized yet. Fix all these by using swapon_mutex to guard against
>>> race with swapon and add swap_info_get_if_under_swapoff() to collect swap
>>> devices under swapoff.
>>>
>>> Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
>>> Signed-off-by: Miaohe Lin <[email protected]>
>>> ---
>>> include/linux/swapfile.h | 2 ++
>>> mm/frontswap.c | 40 +++++++++++++++++-----------------------
>>> mm/swapfile.c | 13 ++++++++++++-
>>> 3 files changed, 31 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
>>> index e06febf62978..7ae15d917828 100644
>>> --- a/include/linux/swapfile.h
>>> +++ b/include/linux/swapfile.h
>>> @@ -9,8 +9,10 @@
>>> extern spinlock_t swap_lock;
>>> extern struct plist_head swap_active_head;
>>> extern struct swap_info_struct *swap_info[];
>>> +extern struct mutex swapon_mutex;
>>> extern int try_to_unuse(unsigned int, bool, unsigned long);
>>> extern unsigned long generic_max_swapfile_size(void);
>>> extern unsigned long max_swapfile_size(void);
>>> +extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);
>>>
>>> #endif /* _LINUX_SWAPFILE_H */
>>> diff --git a/mm/frontswap.c b/mm/frontswap.c
>>> index 130e301c5ac0..c16bfc7550b5 100644
>>> --- a/mm/frontswap.c
>>> +++ b/mm/frontswap.c
>>> @@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)
>>>
>>> bitmap_zero(a, MAX_SWAPFILES);
>>> bitmap_zero(b, MAX_SWAPFILES);
>>> -
>>> + mutex_lock(&swapon_mutex);
>>> spin_lock(&swap_lock);
>>> plist_for_each_entry(si, &swap_active_head, list) {
>>> if (!WARN_ON(!si->frontswap_map))
>>> set_bit(si->type, a);
>>> }
>>> + /*
>>> + * There might be some swap devices under swapoff, i.e. they are
>>> + * removed from swap_active_head but frontswap_invalidate_area()
>>> + * is not called yet due to swapon_mutex is held here. We must
>>> + * collect these swap devices and call ops->init on them or they
>>> + * might invalidate frontswap area while frontswap is uninitialized.
>>> + */
>>> + for_each_clear_bit(i, a, MAX_SWAPFILES) {
>>> + si = swap_info_get_if_under_swapoff(i);
>>> + if (!si || !si->frontswap_map)
>>> + continue;
>>> + set_bit(si->type, b);
>>> + }
>>> + bitmap_or(a, a, b, MAX_SWAPFILES);
>>> spin_unlock(&swap_lock);
>>>
>>> /* the new ops needs to know the currently active swap devices */
>>> @@ -144,29 +158,9 @@ void frontswap_register_ops(struct frontswap_ops *ops)
>>> ops->next = frontswap_ops;
>>> } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
>>>
>>> - static_branch_inc(&frontswap_enabled_key);
>>> -
>>> - spin_lock(&swap_lock);
>>> - plist_for_each_entry(si, &swap_active_head, list) {
>>> - if (si->frontswap_map)
>>> - set_bit(si->type, b);
>>> - }
>>> - spin_unlock(&swap_lock);
>>> + mutex_unlock(&swapon_mutex);
>>>
>>> - /*
>>> - * On the very unlikely chance that a swap device was added or
>>> - * removed between setting the "a" list bits and the ops init
>>> - * calls, we re-check and do init or invalidate for any changed
>>> - * bits.
>>> - */
>>> - if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
>>> - for (i = 0; i < MAX_SWAPFILES; i++) {
>>> - if (!test_bit(i, a) && test_bit(i, b))
>>> - ops->init(i);
>>> - else if (test_bit(i, a) && !test_bit(i, b))
>>> - ops->invalidate_area(i);
>>> - }
>>> - }
>>> + static_branch_inc(&frontswap_enabled_key);
>>> }
>>> EXPORT_SYMBOL(frontswap_register_ops);
>>>
>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>> index 149e77454e3c..ee736533717f 100644
>>> --- a/mm/swapfile.c
>>> +++ b/mm/swapfile.c
>>> @@ -89,7 +89,7 @@ static DEFINE_SPINLOCK(swap_avail_lock);
>>>
>>> struct swap_info_struct *swap_info[MAX_SWAPFILES];
>>>
>>> -static DEFINE_MUTEX(swapon_mutex);
>>> +DEFINE_MUTEX(swapon_mutex);
>>>
>>> static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
>>> /* Activity counter to indicate that a swapon or swapoff has occurred */
>>> @@ -2958,6 +2958,17 @@ __weak unsigned long max_swapfile_size(void)
>>> return generic_max_swapfile_size();
>>> }
>>>
>>> +struct swap_info_struct *swap_info_get_if_under_swapoff(int type)
>>> +{
>>> + struct swap_info_struct *si = swap_type_to_swap_info(type);
>>> +
>>> + if (!si || !si->swap_map)
>>> + return NULL;
>>> + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
>>> + return si;
>>> + return NULL;
>>> +}
>>> +
>>> static unsigned long read_swap_header(struct swap_info_struct *p,
>>> union swap_header *swap_header,
>>> struct inode *inode)
>>>
>>
>>
> .
>

2021-04-10 20:05:31

by Yu Zhao

[permalink] [raw]
Subject: Re: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

On Sat, Apr 10, 2021 at 5:01 AM Miaohe Lin <[email protected]> wrote:
>
> On 2021/4/10 18:42, Yu Zhao wrote:
> > On Sat, Apr 10, 2021 at 1:30 AM Miaohe Lin <[email protected]> wrote:
> >>
> >> Hi:
> >> On 2021/4/5 18:20, Miaohe Lin wrote:
> >>> frontswap_register_ops can race with swapon. Consider the following scene:
> >>
> >> Any comment or suggestion? Or is this race window too theoretical to fix?
> >> Thanks.
> >
> > Let me run a stress test and get back to you (within a day or so).
>
> That's very kind of you. Many thanks!

I'm still getting the following crash. Probably I should try the other
series you sent earlier too?

BUG: unable to handle page fault for address: ffffaa7937d82000
RIP: 0010:scan_swap_map_slots+0x12b/0x7f0
Call Trace:
get_swap_pages+0x278/0x590
get_swap_page+0x1ab/0x280
add_to_swap+0x7d/0x130
shrink_page_list+0xf84/0x25f0
reclaim_pages+0x313/0x430
madvise_cold_or_pageout_pte_range+0x95c/0xaa0
walk_p4d_range+0x43f/0x790
walk_pgd_range+0xf1/0x150
__walk_page_range+0x6f/0x1b0
walk_page_range+0xbe/0x1e
do_madvise+0x271/0x720

> >>> CPU1 CPU2
> >>> ---- ----
> >>> frontswap_register_ops
> >>> fill bitmap a
> >>> ops->init
> >>> sys_swapon
> >>> enable_swap_info
> >>> frontswap_init without new ops
> >>> add ops to frontswap_ops list
> >>> check if swap_active_head changed
> >>> add to swap_active_head
> >>>
> >>> So the frontswap_ops init is missed on the new swap device. Consider the
> >>> another scene:
> >>> CPU1 CPU2
> >>> ---- ----
> >>> frontswap_register_ops
> >>> fill bitmap a
> >>> ops->init
> >>> add ops to frontswap_ops list
> >>> sys_swapon
> >>> enable_swap_info
> >>> frontswap_init with new ops
> >>> add to swap_active_head
> >>> check if swap_active_head changed
> >>> ops->init for new swap device [twice!]
> >>>
> >>> The frontswap_ops init will be called two times on the new swap device this
> >>> time. frontswap_register_ops can also race with swapoff. Consider the
> >>> following scene:
> >>>
> >>> CPU1 CPU2
> >>> ---- ----
> >>> sys_swapoff
> >>> removed from swap_active_head
> >>> frontswap_register_ops
> >>> fill bitmap a
> >>> ops->init without swap device
> >>> add ops to frontswap_ops list
> >>> invalidate_area with new ops
> >>> check if swap_active_head changed
> >>>
> >>> We could call invalidate_area on a swap device under swapoff with frontswap
> >>> is uninitialized yet. Fix all these by using swapon_mutex to guard against
> >>> race with swapon and add swap_info_get_if_under_swapoff() to collect swap
> >>> devices under swapoff.
> >>>
> >>> Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
> >>> Signed-off-by: Miaohe Lin <[email protected]>
> >>> ---
> >>> include/linux/swapfile.h | 2 ++
> >>> mm/frontswap.c | 40 +++++++++++++++++-----------------------
> >>> mm/swapfile.c | 13 ++++++++++++-
> >>> 3 files changed, 31 insertions(+), 24 deletions(-)
> >>>
> >>> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
> >>> index e06febf62978..7ae15d917828 100644
> >>> --- a/include/linux/swapfile.h
> >>> +++ b/include/linux/swapfile.h
> >>> @@ -9,8 +9,10 @@
> >>> extern spinlock_t swap_lock;
> >>> extern struct plist_head swap_active_head;
> >>> extern struct swap_info_struct *swap_info[];
> >>> +extern struct mutex swapon_mutex;
> >>> extern int try_to_unuse(unsigned int, bool, unsigned long);
> >>> extern unsigned long generic_max_swapfile_size(void);
> >>> extern unsigned long max_swapfile_size(void);
> >>> +extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);
> >>>
> >>> #endif /* _LINUX_SWAPFILE_H */
> >>> diff --git a/mm/frontswap.c b/mm/frontswap.c
> >>> index 130e301c5ac0..c16bfc7550b5 100644
> >>> --- a/mm/frontswap.c
> >>> +++ b/mm/frontswap.c
> >>> @@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)
> >>>
> >>> bitmap_zero(a, MAX_SWAPFILES);
> >>> bitmap_zero(b, MAX_SWAPFILES);
> >>> -
> >>> + mutex_lock(&swapon_mutex);
> >>> spin_lock(&swap_lock);
> >>> plist_for_each_entry(si, &swap_active_head, list) {
> >>> if (!WARN_ON(!si->frontswap_map))
> >>> set_bit(si->type, a);
> >>> }
> >>> + /*
> >>> + * There might be some swap devices under swapoff, i.e. they are
> >>> + * removed from swap_active_head but frontswap_invalidate_area()
> >>> + * is not called yet due to swapon_mutex is held here. We must
> >>> + * collect these swap devices and call ops->init on them or they
> >>> + * might invalidate frontswap area while frontswap is uninitialized.
> >>> + */
> >>> + for_each_clear_bit(i, a, MAX_SWAPFILES) {
> >>> + si = swap_info_get_if_under_swapoff(i);
> >>> + if (!si || !si->frontswap_map)
> >>> + continue;
> >>> + set_bit(si->type, b);
> >>> + }
> >>> + bitmap_or(a, a, b, MAX_SWAPFILES);
> >>> spin_unlock(&swap_lock);
> >>>
> >>> /* the new ops needs to know the currently active swap devices */
> >>> @@ -144,29 +158,9 @@ void frontswap_register_ops(struct frontswap_ops *ops)
> >>> ops->next = frontswap_ops;
> >>> } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
> >>>
> >>> - static_branch_inc(&frontswap_enabled_key);
> >>> -
> >>> - spin_lock(&swap_lock);
> >>> - plist_for_each_entry(si, &swap_active_head, list) {
> >>> - if (si->frontswap_map)
> >>> - set_bit(si->type, b);
> >>> - }
> >>> - spin_unlock(&swap_lock);
> >>> + mutex_unlock(&swapon_mutex);
> >>>
> >>> - /*
> >>> - * On the very unlikely chance that a swap device was added or
> >>> - * removed between setting the "a" list bits and the ops init
> >>> - * calls, we re-check and do init or invalidate for any changed
> >>> - * bits.
> >>> - */
> >>> - if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
> >>> - for (i = 0; i < MAX_SWAPFILES; i++) {
> >>> - if (!test_bit(i, a) && test_bit(i, b))
> >>> - ops->init(i);
> >>> - else if (test_bit(i, a) && !test_bit(i, b))
> >>> - ops->invalidate_area(i);
> >>> - }
> >>> - }
> >>> + static_branch_inc(&frontswap_enabled_key);
> >>> }
> >>> EXPORT_SYMBOL(frontswap_register_ops);
> >>>
> >>> diff --git a/mm/swapfile.c b/mm/swapfile.c
> >>> index 149e77454e3c..ee736533717f 100644
> >>> --- a/mm/swapfile.c
> >>> +++ b/mm/swapfile.c
> >>> @@ -89,7 +89,7 @@ static DEFINE_SPINLOCK(swap_avail_lock);
> >>>
> >>> struct swap_info_struct *swap_info[MAX_SWAPFILES];
> >>>
> >>> -static DEFINE_MUTEX(swapon_mutex);
> >>> +DEFINE_MUTEX(swapon_mutex);
> >>>
> >>> static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
> >>> /* Activity counter to indicate that a swapon or swapoff has occurred */
> >>> @@ -2958,6 +2958,17 @@ __weak unsigned long max_swapfile_size(void)
> >>> return generic_max_swapfile_size();
> >>> }
> >>>
> >>> +struct swap_info_struct *swap_info_get_if_under_swapoff(int type)
> >>> +{
> >>> + struct swap_info_struct *si = swap_type_to_swap_info(type);
> >>> +
> >>> + if (!si || !si->swap_map)
> >>> + return NULL;
> >>> + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
> >>> + return si;
> >>> + return NULL;
> >>> +}
> >>> +
> >>> static unsigned long read_swap_header(struct swap_info_struct *p,
> >>> union swap_header *swap_header,
> >>> struct inode *inode)
> >>>
> >>
> >>
> > .
> >
>
>

2021-04-12 03:17:47

by Miaohe Lin

[permalink] [raw]
Subject: Re: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

On 2021/4/11 4:02, Yu Zhao wrote:
> On Sat, Apr 10, 2021 at 5:01 AM Miaohe Lin <[email protected]> wrote:
>>
>> On 2021/4/10 18:42, Yu Zhao wrote:
>>> On Sat, Apr 10, 2021 at 1:30 AM Miaohe Lin <[email protected]> wrote:
>>>>
>>>> Hi:
>>>> On 2021/4/5 18:20, Miaohe Lin wrote:
>>>>> frontswap_register_ops can race with swapon. Consider the following scene:
>>>>
>>>> Any comment or suggestion? Or is this race window too theoretical to fix?
>>>> Thanks.
>>>
>>> Let me run a stress test and get back to you (within a day or so).
>>
>> That's very kind of you. Many thanks!
>
> I'm still getting the following crash. Probably I should try the other
> series you sent earlier too?
>
> BUG: unable to handle page fault for address: ffffaa7937d82000
> RIP: 0010:scan_swap_map_slots+0x12b/0x7f0
> Call Trace:
> get_swap_pages+0x278/0x590
> get_swap_page+0x1ab/0x280
> add_to_swap+0x7d/0x130
> shrink_page_list+0xf84/0x25f0
> reclaim_pages+0x313/0x430
> madvise_cold_or_pageout_pte_range+0x95c/0xaa0
> walk_p4d_range+0x43f/0x790
> walk_pgd_range+0xf1/0x150
> __walk_page_range+0x6f/0x1b0
> walk_page_range+0xbe/0x1e
> do_madvise+0x271/0x720

Sorry about it! This patch is fix the frontswap_register_ops() race with swapon/swapoff.
And the earlier patch is fix the race I found when I was investigating the swap code. So
this crash may not be included.

>
>>>>> CPU1 CPU2
>>>>> ---- ----
>>>>> frontswap_register_ops
>>>>> fill bitmap a
>>>>> ops->init
>>>>> sys_swapon
>>>>> enable_swap_info
>>>>> frontswap_init without new ops
>>>>> add ops to frontswap_ops list
>>>>> check if swap_active_head changed
>>>>> add to swap_active_head
>>>>>
>>>>> So the frontswap_ops init is missed on the new swap device. Consider the
>>>>> another scene:
>>>>> CPU1 CPU2
>>>>> ---- ----
>>>>> frontswap_register_ops
>>>>> fill bitmap a
>>>>> ops->init
>>>>> add ops to frontswap_ops list
>>>>> sys_swapon
>>>>> enable_swap_info
>>>>> frontswap_init with new ops
>>>>> add to swap_active_head
>>>>> check if swap_active_head changed
>>>>> ops->init for new swap device [twice!]
>>>>>
>>>>> The frontswap_ops init will be called two times on the new swap device this
>>>>> time. frontswap_register_ops can also race with swapoff. Consider the
>>>>> following scene:
>>>>>
>>>>> CPU1 CPU2
>>>>> ---- ----
>>>>> sys_swapoff
>>>>> removed from swap_active_head
>>>>> frontswap_register_ops
>>>>> fill bitmap a
>>>>> ops->init without swap device
>>>>> add ops to frontswap_ops list
>>>>> invalidate_area with new ops
>>>>> check if swap_active_head changed
>>>>>
>>>>> We could call invalidate_area on a swap device under swapoff with frontswap
>>>>> is uninitialized yet. Fix all these by using swapon_mutex to guard against
>>>>> race with swapon and add swap_info_get_if_under_swapoff() to collect swap
>>>>> devices under swapoff.
>>>>>
>>>>> Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
>>>>> Signed-off-by: Miaohe Lin <[email protected]>
>>>>> ---
>>>>> include/linux/swapfile.h | 2 ++
>>>>> mm/frontswap.c | 40 +++++++++++++++++-----------------------
>>>>> mm/swapfile.c | 13 ++++++++++++-
>>>>> 3 files changed, 31 insertions(+), 24 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
>>>>> index e06febf62978..7ae15d917828 100644
>>>>> --- a/include/linux/swapfile.h
>>>>> +++ b/include/linux/swapfile.h
>>>>> @@ -9,8 +9,10 @@
>>>>> extern spinlock_t swap_lock;
>>>>> extern struct plist_head swap_active_head;
>>>>> extern struct swap_info_struct *swap_info[];
>>>>> +extern struct mutex swapon_mutex;
>>>>> extern int try_to_unuse(unsigned int, bool, unsigned long);
>>>>> extern unsigned long generic_max_swapfile_size(void);
>>>>> extern unsigned long max_swapfile_size(void);
>>>>> +extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);
>>>>>
>>>>> #endif /* _LINUX_SWAPFILE_H */
>>>>> diff --git a/mm/frontswap.c b/mm/frontswap.c
>>>>> index 130e301c5ac0..c16bfc7550b5 100644
>>>>> --- a/mm/frontswap.c
>>>>> +++ b/mm/frontswap.c
>>>>> @@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)
>>>>>
>>>>> bitmap_zero(a, MAX_SWAPFILES);
>>>>> bitmap_zero(b, MAX_SWAPFILES);
>>>>> -
>>>>> + mutex_lock(&swapon_mutex);
>>>>> spin_lock(&swap_lock);
>>>>> plist_for_each_entry(si, &swap_active_head, list) {
>>>>> if (!WARN_ON(!si->frontswap_map))
>>>>> set_bit(si->type, a);
>>>>> }
>>>>> + /*
>>>>> + * There might be some swap devices under swapoff, i.e. they are
>>>>> + * removed from swap_active_head but frontswap_invalidate_area()
>>>>> + * is not called yet due to swapon_mutex is held here. We must
>>>>> + * collect these swap devices and call ops->init on them or they
>>>>> + * might invalidate frontswap area while frontswap is uninitialized.
>>>>> + */
>>>>> + for_each_clear_bit(i, a, MAX_SWAPFILES) {
>>>>> + si = swap_info_get_if_under_swapoff(i);
>>>>> + if (!si || !si->frontswap_map)
>>>>> + continue;
>>>>> + set_bit(si->type, b);
>>>>> + }
>>>>> + bitmap_or(a, a, b, MAX_SWAPFILES);
>>>>> spin_unlock(&swap_lock);
>>>>>
>>>>> /* the new ops needs to know the currently active swap devices */
>>>>> @@ -144,29 +158,9 @@ void frontswap_register_ops(struct frontswap_ops *ops)
>>>>> ops->next = frontswap_ops;
>>>>> } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
>>>>>
>>>>> - static_branch_inc(&frontswap_enabled_key);
>>>>> -
>>>>> - spin_lock(&swap_lock);
>>>>> - plist_for_each_entry(si, &swap_active_head, list) {
>>>>> - if (si->frontswap_map)
>>>>> - set_bit(si->type, b);
>>>>> - }
>>>>> - spin_unlock(&swap_lock);
>>>>> + mutex_unlock(&swapon_mutex);
>>>>>
>>>>> - /*
>>>>> - * On the very unlikely chance that a swap device was added or
>>>>> - * removed between setting the "a" list bits and the ops init
>>>>> - * calls, we re-check and do init or invalidate for any changed
>>>>> - * bits.
>>>>> - */
>>>>> - if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
>>>>> - for (i = 0; i < MAX_SWAPFILES; i++) {
>>>>> - if (!test_bit(i, a) && test_bit(i, b))
>>>>> - ops->init(i);
>>>>> - else if (test_bit(i, a) && !test_bit(i, b))
>>>>> - ops->invalidate_area(i);
>>>>> - }
>>>>> - }
>>>>> + static_branch_inc(&frontswap_enabled_key);
>>>>> }
>>>>> EXPORT_SYMBOL(frontswap_register_ops);
>>>>>
>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>> index 149e77454e3c..ee736533717f 100644
>>>>> --- a/mm/swapfile.c
>>>>> +++ b/mm/swapfile.c
>>>>> @@ -89,7 +89,7 @@ static DEFINE_SPINLOCK(swap_avail_lock);
>>>>>
>>>>> struct swap_info_struct *swap_info[MAX_SWAPFILES];
>>>>>
>>>>> -static DEFINE_MUTEX(swapon_mutex);
>>>>> +DEFINE_MUTEX(swapon_mutex);
>>>>>
>>>>> static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
>>>>> /* Activity counter to indicate that a swapon or swapoff has occurred */
>>>>> @@ -2958,6 +2958,17 @@ __weak unsigned long max_swapfile_size(void)
>>>>> return generic_max_swapfile_size();
>>>>> }
>>>>>
>>>>> +struct swap_info_struct *swap_info_get_if_under_swapoff(int type)
>>>>> +{
>>>>> + struct swap_info_struct *si = swap_type_to_swap_info(type);
>>>>> +
>>>>> + if (!si || !si->swap_map)
>>>>> + return NULL;
>>>>> + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
>>>>> + return si;
>>>>> + return NULL;
>>>>> +}
>>>>> +
>>>>> static unsigned long read_swap_header(struct swap_info_struct *p,
>>>>> union swap_header *swap_header,
>>>>> struct inode *inode)
>>>>>
>>>>
>>>>
>>> .
>>>
>>
>>
> .
>