This patch fixes a data race introduced by commit 779750d20b93 ("shmem: split
huge pages beyond i_size under memory pressure").
Here are the call traces:
Call Trace 1:
shmem_unused_huge_shrink+0x3ae/0x410
? __list_lru_walk_one.isra.5+0x33/0x160
super_cache_scan+0x17c/0x190
shrink_slab.part.55+0x1ef/0x3f0
shrink_node+0x10e/0x330
kswapd+0x380/0x740
kthread+0xfc/0x130
? mem_cgroup_shrink_node+0x170/0x170
? kthread_create_on_node+0x70/0x70
ret_from_fork+0x1f/0x30
Call Trace 2:
shmem_evict_inode+0xd8/0x190
evict+0xbe/0x1c0
do_unlinkat+0x137/0x330
do_syscall_64+0x76/0x120
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The simultaneous deletion of adjacent elements in the local list (@list)
by shmem_unused_huge_shrink() and shmem_evict_inode() corrupts the list.
Imagine there are 3 items in the local list (@list).
In the first traversal, A is not deleted from @list.
1) A->B->C
^
|
pos (leave)
In the second traversal, B is deleted from @list. Concurrently, A is
deleted from @list through shmem_evict_inode(), since the last reference
to the inode is dropped by another thread. The @list is then corrupted.
2) A->B->C
^ ^
| |
evict pos (drop)
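For illustration only (not kernel code), here is a minimal userspace C sketch
of the same hazard: two contexts unlink adjacent nodes of one doubly linked
list without a shared lock, and the "evict" side finishes with a stale
snapshot of A's neighbours. The struct and helpers below are simplified
stand-ins for list_head/list_add/list_del:

#include <stdio.h>

struct node { struct node *prev, *next; const char *name; };

static void link_after(struct node *prev, struct node *n)
{
	n->prev = prev;
	n->next = prev->next;
	prev->next->prev = n;
	prev->next = n;
}

static void unlink_between(struct node *prev, struct node *next)
{
	prev->next = next;
	next->prev = prev;
}

int main(void)
{
	struct node head = { &head, &head, "head" };
	struct node A = { NULL, NULL, "A" };
	struct node B = { NULL, NULL, "B" };
	struct node C = { NULL, NULL, "C" };

	link_after(&head, &C);
	link_after(&head, &B);
	link_after(&head, &A);		/* head -> A -> B -> C */

	/* evict(A) starts and reads A's neighbours ... */
	struct node *a_prev = A.prev;	/* head */
	struct node *a_next = A.next;	/* B: about to go stale */

	/* ... the shrinker drops B in the meantime (head -> A -> C) ... */
	unlink_between(B.prev, B.next);

	/* ... and evict(A) completes with its stale snapshot. */
	unlink_between(a_prev, a_next);

	/* head now points at B, a node the shrinker believes it removed. */
	printf("head->next is %s\n", head.next->name);	/* prints "B" */
	return 0;
}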
Fix:
We should make sure the inode is either on the global list or deleted from
any local list before iput().
This is fixed by moving inodes that are on @list and will not be deleted
back to the global list before iput().
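Not part of the patch itself, but as a condensed (non-standalone) view of the
corrected per-inode epilogue from the diff below: an inode that stays alive is
relinked to the global shrinklist under shrinklist_lock before iput(), because
calling iput() while holding the lock could deadlock (shmem_evict_inode()
takes the same lock), and calling it while the inode is still on the local
@list reintroduces the race above.

drop:
	list_del_init(&info->shrinklist);
	goto put;
move_back:
	spin_lock(&sbinfo->shrinklist_lock);
	list_move(&info->shrinklist, &sbinfo->shrinklist);
	sbinfo->shrinklist_len++;
	spin_unlock(&sbinfo->shrinklist_lock);
put:
	iput(inode);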
Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure")
Signed-off-by: Gang Li <[email protected]>
---
Changes in v3:
- Add more comments.
- Use list_move(&info->shrinklist, &sbinfo->shrinklist) instead of
list_move(pos, &sbinfo->shrinklist) for consistency.
Changes in v2: https://lore.kernel.org/all/[email protected]/
- Take the spinlock before iput() instead of changing the lock type, since
  iput() may call evict(), which could deadlock by acquiring
  shrinklist_lock again.
- Add call traces to the commit message.
v1: https://lore.kernel.org/lkml/[email protected]/
---
mm/shmem.c | 35 ++++++++++++++++++++---------------
1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 9023103ee7d8..ab2df692bd58 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -569,7 +569,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
/* inode is about to be evicted */
if (!inode) {
list_del_init(&info->shrinklist);
- removed++;
goto next;
}
@@ -577,15 +576,16 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
if (round_up(inode->i_size, PAGE_SIZE) ==
round_up(inode->i_size, HPAGE_PMD_SIZE)) {
list_move(&info->shrinklist, &to_remove);
- removed++;
goto next;
}
list_move(&info->shrinklist, &list);
next:
+ removed++;
if (!--batch)
break;
}
+ sbinfo->shrinklist_len -= removed;
spin_unlock(&sbinfo->shrinklist_lock);
list_for_each_safe(pos, next, &to_remove) {
@@ -602,7 +602,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split)
- goto leave;
+ goto move_back;
page = find_get_page(inode->i_mapping,
(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
@@ -616,38 +616,43 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
}
/*
- * Leave the inode on the list if we failed to lock
- * the page at this time.
+ * Move the inode on the list back to shrinklist if we failed
+ * to lock the page at this time.
*
* Waiting for the lock may lead to deadlock in the
* reclaim path.
*/
if (!trylock_page(page)) {
put_page(page);
- goto leave;
+ goto move_back;
}
ret = split_huge_page(page);
unlock_page(page);
put_page(page);
- /* If split failed leave the inode on the list */
+ /* If split failed move the inode on the list back to shrinklist */
if (ret)
- goto leave;
+ goto move_back;
split++;
drop:
list_del_init(&info->shrinklist);
- removed++;
-leave:
+ goto put;
+move_back:
+ /* inodes that are on @list and will not be deleted must be moved back to
+ * global list before iput for two reasons:
+ * 1. iput in lock: iput call shmem_evict_inode, then cause deadlock.
+ * 2. iput before lock: shmem_evict_inode may grab the inode on @list,
+ * which will cause race.
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_move(&info->shrinklist, &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ spin_unlock(&sbinfo->shrinklist_lock);
+put:
iput(inode);
}
- spin_lock(&sbinfo->shrinklist_lock);
- list_splice_tail(&list, &sbinfo->shrinklist);
- sbinfo->shrinklist_len -= removed;
- spin_unlock(&sbinfo->shrinklist_lock);
-
return split;
}
--
2.20.1
On Wed, Nov 24, 2021 at 05:43:16PM +0800, Gang Li wrote:
> +move_back:
> + /* inodes that are on @list and will not be deleted must be moved back to
> + * global list before iput for two reasons:
> + * 1. iput in lock: iput call shmem_evict_inode, then cause deadlock.
> + * 2. iput before lock: shmem_evict_inode may grab the inode on @list,
> + * which will cause race.
> + */
> + spin_lock(&sbinfo->shrinklist_lock);
> + list_move(&info->shrinklist, &sbinfo->shrinklist);
> + sbinfo->shrinklist_len++;
> + spin_unlock(&sbinfo->shrinklist_lock);
> +put:
> iput(inode);
> }
>
> - spin_lock(&sbinfo->shrinklist_lock);
> - list_splice_tail(&list, &sbinfo->shrinklist);
> - sbinfo->shrinklist_len -= removed;
> - spin_unlock(&sbinfo->shrinklist_lock);
> -
> return split;
> }
Okay, I guess it works. Locking is not pretty, but well..
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Wed, Nov 24, 2021 at 5:43 PM Gang Li <[email protected]> wrote:
> [...]
> +move_back:
> + /* inodes that are on @list and will not be deleted must be moved back to
> + * global list before iput for two reasons:
> + * 1. iput in lock: iput call shmem_evict_inode, then cause deadlock.
> + * 2. iput before lock: shmem_evict_inode may grab the inode on @list,
> + * which will cause race.
> + */
Multi-line comments should follow this format:
/*
* Comment here.
*/
I also suggest rewording the comment here. Something like:
/*
* Make sure the inode is either on the global list or deleted from
* any local list before iput() since it could be deleted in another
* thread once we put the inode (then the local list is corrupted).
*/
With that,
Reviewed-by: Muchun Song <[email protected]>