When remove_stable_node() races with __mmput() and squeezes in between ksm_exit() and exit_mmap(),
the WARN_ON_ONCE(page_mapped(page)) in remove_stable_node() could be triggered.
Should we just remove the warning? It seems to be safe to do, all callers are able to handle -EBUSY,
or there is a better way to fix this?
It's easily reproducible with the following script:
(ksm_test.c attached)
#!/bin/bash
gcc -lnuma -O2 ksm_test.c -o ksm_test
echo 1 > /sys/kernel/mm/ksm/run
./ksm_test &
sleep 1
echo 2 > /sys/kernel/mm/ksm/run
and the patch bellow which provokes that race.
---
include/linux/ksm.h | 4 +++-
include/linux/mm_types.h | 1 +
kernel/fork.c | 4 ++++
3 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index e48b1e453ff5..18384ea472f8 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -33,8 +33,10 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
static inline void ksm_exit(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
__ksm_exit(mm);
+ mm->ksm_wait = 1;
+ }
}
/*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 270aa8fd2800..3df8290528c2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -463,6 +463,7 @@ struct mm_struct {
/* Architecture-specific MM context */
mm_context_t context;
+ unsigned long ksm_wait;
unsigned long flags; /* Must use atomic bitops to access */
diff --git a/kernel/fork.c b/kernel/fork.c
index 5fb7e1fa0b05..be6ef4e046f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1074,6 +1074,10 @@ static inline void __mmput(struct mm_struct *mm)
uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
+
+ if (mm->ksm_wait)
+ schedule_timeout_uninterruptible(10*HZ);
+
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
mm_put_huge_zero_page(mm);
--
2.23.0
On Wed, 13 Nov 2019, Andrey Ryabinin wrote:
> When remove_stable_node() races with __mmput() and squeezes in between ksm_exit() and exit_mmap(),
> the WARN_ON_ONCE(page_mapped(page)) in remove_stable_node() could be triggered.
Thanks for doing the work on that.
>
> Should we just remove the warning? It seems to be safe to do, all callers are able to handle -EBUSY,
> or there is a better way to fix this?
Please just remove the warning and update the comment.
At first I thought it would still be worth leaving the warning in,
to explain rare EBUSY; but after looking (as you did) at exactly how
remove_stable_node() gets to be called, the places that care about its
failure already expect KSM to be quiesced, and tell the admin EBUSY if
not. This case is not different, just slightly delayed. So there's
no need to try to be any cleverer about it.
Though now that you have found the case to be a real one, I do think
it would be an improvement for remove_stable_node_chain() not to give
up at the first -EBUSY, but remove as much as it can before finishing.
But perhaps there's a subtlety that prevents that - I'm not familiar
with the stable node chains.
Hugh
>
>
>
> It's easily reproducible with the following script:
> (ksm_test.c attached)
>
> #!/bin/bash
>
> gcc -lnuma -O2 ksm_test.c -o ksm_test
> echo 1 > /sys/kernel/mm/ksm/run
> ./ksm_test &
> sleep 1
> echo 2 > /sys/kernel/mm/ksm/run
>
> and the patch bellow which provokes that race.
>
> ---
> include/linux/ksm.h | 4 +++-
> include/linux/mm_types.h | 1 +
> kernel/fork.c | 4 ++++
> 3 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/ksm.h b/include/linux/ksm.h
> index e48b1e453ff5..18384ea472f8 100644
> --- a/include/linux/ksm.h
> +++ b/include/linux/ksm.h
> @@ -33,8 +33,10 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
>
> static inline void ksm_exit(struct mm_struct *mm)
> {
> - if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
> + if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
> __ksm_exit(mm);
> + mm->ksm_wait = 1;
> + }
> }
>
> /*
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 270aa8fd2800..3df8290528c2 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -463,6 +463,7 @@ struct mm_struct {
>
> /* Architecture-specific MM context */
> mm_context_t context;
> + unsigned long ksm_wait;
>
> unsigned long flags; /* Must use atomic bitops to access */
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 5fb7e1fa0b05..be6ef4e046f0 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1074,6 +1074,10 @@ static inline void __mmput(struct mm_struct *mm)
> uprobe_clear_state(mm);
> exit_aio(mm);
> ksm_exit(mm);
> +
> + if (mm->ksm_wait)
> + schedule_timeout_uninterruptible(10*HZ);
> +
> khugepaged_exit(mm); /* must run before exit_mmap */
> exit_mmap(mm);
> mm_put_huge_zero_page(mm);
> --
> 2.23.0
>