khugepaged scans the entire address space in the
background for each given mm, looking for
opportunities to merge sequences of basic pages
into huge pages. However, when an mm is inserted
into the mm_slots list and the MMF_DISABLE_THP
flag is set later, scanning that mm becomes
unnecessary and can be skipped to avoid redundant
operations, especially in scenarios with a large
address space.

This commit introduces a check before each scan
that tests the MMF_DISABLE_THP flag for the given
mm; if the flag is set, the scan is bypassed,
thereby improving the efficiency of khugepaged.
Signed-off-by: Lance Yang <[email protected]>
---
mm/khugepaged.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2b219acb528e..d6a700834edc 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
 	return atomic_read(&mm->mm_users) == 0;
 }
 
+static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
+{
+	return hpage_collapse_test_exit(mm) ||
+	       test_bit(MMF_DISABLE_THP, &mm->flags);
+}
+
 void __khugepaged_enter(struct mm_struct *mm)
 {
 	struct khugepaged_mm_slot *mm_slot;
@@ -1422,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
 
 	lockdep_assert_held(&khugepaged_mm_lock);
 
-	if (hpage_collapse_test_exit(mm)) {
+	if (hpage_collapse_test_exit_or_disable(mm)) {
 		/* free mm_slot */
 		hash_del(&slot->hash);
 		list_del(&slot->mm_node);
@@ -2360,7 +2366,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		goto breakouterloop_mmap_lock;
 
 	progress++;
-	if (unlikely(hpage_collapse_test_exit(mm)))
+	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
 		goto breakouterloop;
 
 	vma_iter_init(&vmi, mm, khugepaged_scan.address);
@@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		unsigned long hstart, hend;
 
 		cond_resched();
-		if (unlikely(hpage_collapse_test_exit(mm))) {
+		if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
 			progress++;
 			break;
 		}
@@ -2390,7 +2396,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			bool mmap_locked = true;
 
 			cond_resched();
-			if (unlikely(hpage_collapse_test_exit(mm)))
+			if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
				goto breakouterloop;
 
 			VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2408,7 +2414,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 				fput(file);
 				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
 					mmap_read_lock(mm);
-					if (hpage_collapse_test_exit(mm))
+					if (hpage_collapse_test_exit_or_disable(mm))
 						goto breakouterloop;
 					*result = collapse_pte_mapped_thp(mm,
 						khugepaged_scan.address, false);
@@ -2450,7 +2456,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 	 * Release the current mm_slot if this mm is about to die, or
 	 * if we scanned all vmas of this mm.
 	 */
-	if (hpage_collapse_test_exit(mm) || !vma) {
+	if (hpage_collapse_test_exit_or_disable(mm) || !vma) {
 		/*
 		 * Make sure that if mm_users is reaching zero while
 		 * khugepaged runs here, khugepaged_exit will find
--
2.33.1
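
For context, MMF_DISABLE_THP is the per-mm flag set by
prctl(PR_SET_THP_DISABLE). A minimal userspace sketch of how the flag
is set and queried (an illustration, not part of the patch; the prctl
has been available since Linux 3.15):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_THP_DISABLE
#define PR_SET_THP_DISABLE 41
#endif
#ifndef PR_GET_THP_DISABLE
#define PR_GET_THP_DISABLE 42
#endif

int main(void)
{
	/* Sets MMF_DISABLE_THP on the calling mm; with the patch above,
	 * khugepaged skips this mm instead of re-checking the flag for
	 * every VMA. */
	if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0))
		perror("PR_SET_THP_DISABLE");

	/* Returns 1 while the flag is set. */
	printf("THP disabled for this mm: %d\n",
	       prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));
	return 0;
}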
On Mon 29-01-24 13:45:51, Lance Yang wrote:
> khugepaged scans the entire address space in the
> background for each given mm, looking for
> opportunities to merge sequences of basic pages
> into huge pages. However, when an mm is inserted
> into the mm_slots list and the MMF_DISABLE_THP
> flag is set later, scanning that mm becomes
> unnecessary and can be skipped to avoid redundant
> operations, especially in scenarios with a large
> address space.
Is this a real problem? I thought that the prctl is called
on the parent before fork/exec. Or are you aware of any
applications which do call prctl late enough that the race
would be actually observable?
--
Michal Hocko
SUSE Labs
On Sun, Jan 28, 2024 at 9:46 PM Lance Yang <[email protected]> wrote:
>
> [...]
> @@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> unsigned long hstart, hend;
>
> cond_resched();
> - if (unlikely(hpage_collapse_test_exit(mm))) {
> + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
The later thp_vma_allowable_order() does check whether MMF_DISABLE_THP
is set or not. And the hugepage_vma_revalidate() after re-acquiring
mmap_lock does the same check too. The checking in khugepaged should
be already serialized with prctl, which takes mmap_lock in write.
> [...]
On Mon, Jan 29, 2024 at 10:53 AM Yang Shi <[email protected]> wrote:
>
> On Sun, Jan 28, 2024 at 9:46 PM Lance Yang <[email protected]> wrote:
> >
> > [...]
> > - if (unlikely(hpage_collapse_test_exit(mm))) {
> > + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
>
> The later thp_vma_allowable_order() does check whether MMF_DISABLE_THP
> is set or not. And the hugepage_vma_revalidate() after re-acquiring
> mmap_lock does the same check too. The checking in khugepaged should
> be already serialized with prctl, which takes mmap_lock in write.
IIUC, there really isn't any correctness race. Claim is just that we
can avoid a number of per-vma checks. AFAICT, any task w/
MMF_DISABLE_THP set will always have each and every vma checked
(albeit, with a very inexpensive ->vm_mm->flags check)
Thanks,
Zach
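
To make the trade-off concrete, here is a hedged sketch of the two
placements of the test (a composite illustration, not verbatim kernel
code; scan_mm_sketch() is a made-up name, and VMA_ITERATOR() /
for_each_vma() are the modern VMA iteration helpers):

static unsigned int scan_mm_sketch(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	unsigned int vmas_visited = 0;
	VMA_ITERATOR(vmi, mm, 0);

	/* With the patch: one mm-wide test short-circuits the walk. */
	if (test_bit(MMF_DISABLE_THP, &mm->flags))
		return 0;

	/* Without it: every VMA is still visited, and each one is
	 * rejected by the same (cheap) mm->flags test inside the
	 * per-VMA eligibility check. */
	for_each_vma(vmi, vma)
		vmas_visited++;

	return vmas_visited;
}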
> > [...]
Hey Michal,
Thanks for taking time to review!
On some servers within our company, we deploy a
daemon responsible for monitoring and updating
local applications. Some applications prefer not to
use THP, so the daemon calls prctl to disable THP
before fork/exec. Conversely, for other applications,
the daemon calls prctl to enable THP before fork/exec.
Ideally, the daemon should invoke prctl after the fork,
but its current implementation follows the described
approach.
BR,
Lance
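
In C terms, the daemon's pattern is roughly the sketch below (an
illustration only; the binary path is hypothetical). Note the window
between the two prctl calls, during which the daemon's own mm carries
MMF_DISABLE_THP while it may already sit on khugepaged's mm_slots
list:

#include <unistd.h>
#include <sys/prctl.h>
#include <sys/wait.h>

#ifndef PR_SET_THP_DISABLE
#define PR_SET_THP_DISABLE 41
#endif

int main(void)
{
	/* Disable THP on the daemon's own mm; the flag is inherited
	 * across fork and preserved across execve. */
	prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);

	pid_t pid = fork();
	if (pid == 0) {
		execl("/usr/local/bin/app", "app", (char *)NULL);
		_exit(127);
	}

	/* Re-enable THP before launching the next application; until
	 * this point, khugepaged sees MMF_DISABLE_THP on the daemon. */
	prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0);
	waitpid(pid, NULL, 0);
	return 0;
}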
On Tue, Jan 30, 2024 at 12:28 AM Michal Hocko <[email protected]> wrote:
>
> On Mon 29-01-24 13:45:51, Lance Yang wrote:
> > [...]
>
> Is this a real problem? I thought that the prctl is called
> on the parent before fork/exec. Or are you aware of any
> applications which do call prctl late enough that the race
> would be actually observable?
> --
> Michal Hocko
> SUSE Labs
Hey Yang,
Thanks for taking time to review!
thp_vma_allowable_order and hugepage_vma_revalidate
do check whether MMF_DISABLE_THP is set. IIRC, even if
MMF_DISABLE_THP is set, khugepaged will still continue to
scan the address space.
BR,
Lance
On Tue, Jan 30, 2024 at 2:53 AM Yang Shi <[email protected]> wrote:
>
> On Sun, Jan 28, 2024 at 9:46 PM Lance Yang <[email protected]> wrote:
> >
> > khugepaged scans the entire address space in the
> > background for each given mm, looking for
> > opportunities to merge sequences of basic pages
> > into huge pages. However, when an mm is inserted
> > to the mm_slots list, and the MMF_DISABLE_THP flag
> > is set later, this scanning process becomes
> > unnecessary for that mm and can be skipped to avoid
> > redundant operations, especially in scenarios with
> > a large address space.
> >
> > This commit introduces a check before each scanning
> > process to test the MMF_DISABLE_THP flag for the
> > given mm; if the flag is set, the scanning process
> > is bypassed, thereby improving the efficiency of
> > khugepaged.
> >
> > Signed-off-by: Lance Yang <[email protected]>
> > ---
> > mm/khugepaged.c | 18 ++++++++++++------
> > 1 file changed, 12 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 2b219acb528e..d6a700834edc 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
> > return atomic_read(&mm->mm_users) == 0;
> > }
> >
> > +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
> > +{
> > + return hpage_collapse_test_exit(mm) ||
> > + test_bit(MMF_DISABLE_THP, &mm->flags);
> > +}
> > +
> > void __khugepaged_enter(struct mm_struct *mm)
> > {
> > struct khugepaged_mm_slot *mm_slot;
> > @@ -1422,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
> >
> > lockdep_assert_held(&khugepaged_mm_lock);
> >
> > - if (hpage_collapse_test_exit(mm)) {
> > + if (hpage_collapse_test_exit_or_disable(mm)) {
> > /* free mm_slot */
> > hash_del(&slot->hash);
> > list_del(&slot->mm_node);
> > @@ -2360,7 +2366,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > goto breakouterloop_mmap_lock;
> >
> > progress++;
> > - if (unlikely(hpage_collapse_test_exit(mm)))
> > + if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> > goto breakouterloop;
> >
> > vma_iter_init(&vmi, mm, khugepaged_scan.address);
> > @@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > unsigned long hstart, hend;
> >
> > cond_resched();
> > - if (unlikely(hpage_collapse_test_exit(mm))) {
> > + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
>
> The later thp_vma_allowable_order() does check whether MMF_DISABLE_THP
> is set or not. And the hugepage_vma_revalidate() after re-acquiring
> mmap_lock does the same check too. The checking in khugepaged should
> be already serialized with prctl, which takes mmap_lock in write.
> > [...]
Hey Zach,
Thanks for taking time to review!
On Tue, Jan 30, 2024 at 3:04 AM Zach O'Keefe <[email protected]> wrote:
[...]
> IIUC, there really isn't any correctness race. Claim is just that we
Yes, there is indeed no correctness race.
> can avoid a number of per-vma checks. AFAICT, any task w/
> MMF_DISABLE_THP set will always have each and every vma checked
> (albeit, with a very inexpensive ->vm_mm->flags check)
[...]
IMO, for any task with MMF_DISABLE_THP set, the check for each VMA
(albeit a very inexpensive ->mm->flags test) can be skipped entirely
to avoid redundant operations, especially in scenarios with a large
address space.
BR,
Lance
On Tue, Jan 30, 2024 at 3:04 AM Zach O'Keefe <[email protected]> wrote:
>
> On Mon, Jan 29, 2024 at 10:53 AM Yang Shi <[email protected]> wrote:
> >
> > On Sun, Jan 28, 2024 at 9:46 PM Lance Yang <[email protected]> wrote:
> > > [...]
> > The later thp_vma_allowable_order() does check whether MMF_DISABLE_THP
> > is set or not. And the hugepage_vma_revalidate() after re-acquiring
> > mmap_lock does the same check too. The checking in khugepaged should
> > be already serialized with prctl, which takes mmap_lock in write.
>
> IIUC, there really isn't any correctness race. Claim is just that we
> can avoid a number of per-vma checks. AFAICT, any task w/
> MMF_DISABLE_THP set will always have each and every vma checked
> (albeit, with a very inexpensive ->vm_mm->flags check)
>
> Thanks,
> Zach
> > > [...]
On Tue, Jan 30, 2024 at 10:12 AM Lance Yang <[email protected]> wrote:
>
> Hey Michal,
>
> Thanks for taking time to review!
>
> On some servers within our company, we deploy a
> daemon responsible for monitoring and updating
> local applications. Some applications prefer not to
> use THP, so the daemon calls prctl to disable THP
> before fork/exec. Conversely, for other applications,
> the daemon calls prctl to enable THP before fork/exec.
>
> Ideally, the daemon should invoke prctl after the fork,
> but its current implementation follows the described
> approach.
In the Go standard library, there is no direct encapsulation
of the fork system call. Instead, fork and execve are
combined into one through syscall.ForkExec.
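
For contrast, the "prctl after fork" placement would look like the
following in C (an illustrative sketch; the binary path is
hypothetical). syscall.ForkExec performs fork and execve as a single
operation, so a Go caller has no point at which to run the marked
prctl in the child:

#include <unistd.h>
#include <sys/prctl.h>
#include <sys/wait.h>

#ifndef PR_SET_THP_DISABLE
#define PR_SET_THP_DISABLE 41
#endif

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		/* Child-only prctl: sets MMF_DISABLE_THP on the child's
		 * mm without ever touching the parent's mm. */
		prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);
		execl("/usr/local/bin/app", "app", (char *)NULL);
		_exit(127);
	}
	waitpid(pid, NULL, 0);
	return 0;
}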
>
> BR,
> Lance
>
> On Tue, Jan 30, 2024 at 12:28 AM Michal Hocko <[email protected]> wrote:
> > [...]
On Tue 30-01-24 11:08:10, Lance Yang wrote:
> On Tue, Jan 30, 2024 at 10:12 AM Lance Yang <[email protected]> wrote:
> > [...]
>
> In the Go standard library, there is no direct encapsulation
> of the fork system call. Instead, fork and execve are
> combined into one through syscall.ForkExec.
OK, this is an important detail. Something that should be part
of the changelog. It is also important to note that this is not
a correctness issue but rather an optimization that saves expensive
checks on each VMA when userspace cannot prctl itself before
spawning the new process.
--
Michal Hocko
SUSE Labs
On Tue, Jan 30, 2024 at 5:35 PM Michal Hocko <[email protected]> wrote:
>
> On Tue 30-01-24 11:08:10, Lance Yang wrote:
> > [...]
> > In the Go standard library, there is no direct encapsulation
> > of the fork system call. Instead, fork and execve are
> > combined into one through syscall.ForkExec.
>
> OK, this is an important detail. Something that should be part
> of the changelog. It is also important to note that this is not
> a correctness issue but rather an optimization that saves expensive
> checks on each VMA when userspace cannot prctl itself before
> spawning the new process.
Thanks for pointing that out!
I'll include it in the changelog. Good to know it's an optimization
rather than a correctness issue.
Thanks,
Lance
> --
> Michal Hocko
> SUSE Labs
Hey Zach and Yang,
Could I start a new version?
Thanks,
Lance
On Tue, Jan 30, 2024 at 5:46 PM Lance Yang <[email protected]> wrote:
> [...]
Updating the changelog.

khugepaged scans the entire address space in the
background for each given mm, looking for
opportunities to merge sequences of basic pages
into huge pages. However, when an mm is inserted
into the mm_slots list and the MMF_DISABLE_THP
flag is set later, scanning that mm becomes
unnecessary and can be skipped to avoid redundant
operations, especially in scenarios with a large
address space.

This commit introduces a check before each scan
that tests the MMF_DISABLE_THP flag for the given
mm; if the flag is set, the scan is bypassed,
thereby improving the efficiency of khugepaged.

This is not a correctness fix but an optimization:
it saves expensive checks on each VMA when userspace
cannot prctl itself before spawning the new process.
On some servers within our company, we deploy a
daemon responsible for monitoring and updating local
applications. Some applications prefer not to use THP,
so the daemon calls prctl to disable THP before fork/exec.
Conversely, for other applications, the daemon calls prctl
to enable THP before fork/exec.
Ideally, the daemon should invoke prctl after the fork,
but its current implementation follows the described
approach. In the Go standard library, there is no direct
encapsulation of the fork system call; instead, fork and
execve are combined into one through syscall.ForkExec.
Thanks,
Lance
On Mon, Jan 29, 2024 at 1:46 PM Lance Yang <[email protected]> wrote:
> [...]
On Wed, Jan 31, 2024 at 1:30 AM Lance Yang <[email protected]> wrote:
>
> Updating the changelog.
>
> [...]
>
> This is not a correctness fix but an optimization:
> it saves expensive checks on each VMA when userspace
> cannot prctl itself before spawning the new process.
If this is an optimization, you'd better show some real numbers to help justify.
>
> On some servers within our company, we deploy a
> daemon responsible for monitoring and updating local
> applications. Some applications prefer not to use THP,
> so the daemon calls prctl to disable THP before fork/exec.
> Conversely, for other applications, the daemon calls prctl
> to enable THP before fork/exec.
If your daemon calls prctl with MMF_DISABLE_THP before fork and you
still end up having the child mm on the hash list in the first place,
I think that would be a bug in khugepaged_fork() IIUC. khugepaged_fork()
should check this flag and bail out if it is set. Did I miss
something?
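
A sketch of the bail-out being suggested (a hypothetical modification,
assuming the usual khugepaged_fork() shape from
include/linux/khugepaged.h; not merged code):

static inline void khugepaged_fork(struct mm_struct *mm,
				   struct mm_struct *oldmm)
{
	/* Hypothetical: skip registration when the parent has THP
	 * disabled, so the child mm never reaches the hash list. */
	if (test_bit(MMF_DISABLE_THP, &oldmm->flags))
		return;
	if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
		__khugepaged_enter(mm);
}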
> [...]
Hey Yang,
Thank you for the clarification.
You're correct. If the daemon calls prctl with
MMF_DISABLE_THP before fork, the child
mm won't be on the hash list.
What I meant is that the daemon mm might
already be on the hash list before fork.
Therefore, khugepaged might still scan the
address space for the daemon.
Thanks,
Lance
On Thu, Feb 1, 2024 at 4:06 AM Yang Shi <[email protected]> wrote:
> [...]
On Wed, Jan 31, 2024 at 5:13 PM Lance Yang <[email protected]> wrote:
>
> [...]
> What I meant is that the daemon mm might
> already be on the hash list before fork.
> Therefore, khugepaged might still scan the
> address space for the daemon.
OK, I thought you don't care about the daemon since you mentioned the
daemon would call prctl to disable THP or enable THP for different
children, so the daemon's THP preference may be constantly changed or
doesn't matter at all.
So the actual cost is traversing the maple tree for the daemon's
VMAs. Does the daemon have an excessive number of vmas? I'm not
sure whether the improvement is noticeable or not.
>
> Thanks,
> Lance
>
> On Thu, Feb 1, 2024 at 4:06 AM Yang Shi <[email protected]> wrote:
> > [...]
I asked the maintainer of the daemon.
Currently, the daemon is not optimized for huge
pages. It has over 200 VMAs, with the majority
being file-mapped.
On Fri, Feb 2, 2024 at 2:56 AM Yang Shi <[email protected]> wrote:
> [...]
On Wed, 31 Jan 2024 17:30:11 +0800 Lance Yang <[email protected]> wrote:
> Updating the change log.
>
> khugepaged scans the entire address space in the
> background for each given mm, looking for
> opportunities to merge sequences of basic pages
> into huge pages. However, when an mm is inserted
> to the mm_slots list, and the MMF_DISABLE_THP
> flag is set later, this scanning process becomes
> unnecessary for that mm and can be skipped to
> avoid redundant operations, especially in scenarios
> with a large address space.
>
> This commit introduces a check before each scanning
> process to test the MMF_DISABLE_THP flag for the
> given mm; if the flag is set, the scanning process is
> bypassed, thereby improving the efficiency of khugepaged.
>
> This is not a correctness fix but an optimization: it saves
> expensive per-VMA checks when userspace cannot run prctl
> in the child before it execs into the new process.
>
> On some servers within our company, we deploy a
> daemon responsible for monitoring and updating local
> applications. Some applications prefer not to use THP,
> so the daemon calls prctl to disable THP before fork/exec.
> Conversely, for other applications, the daemon calls prctl
> to enable THP before fork/exec.
>
> Ideally, the daemon should invoke prctl after the fork,
> but its current implementation follows the described
> approach. In the Go standard library, there is no direct
> encapsulation of the fork system call; instead, fork and
> execve are combined into one through syscall.ForkExec.
I pasted the above into the v1 patch's changelog.
However I'm not seeing a good level of reviewer enthusiasm. Partially
because of the lack of quantitative testing results. Is it possible to
generate such results, to give people an overall feel of the
desirability of this change?
Hey Andrew,
Thanks for taking the time to review!
I appreciate your suggestion and will follow
up with test results shortly.
Best,
Lance
On Thu, Feb 22, 2024 at 6:12 AM Andrew Morton <[email protected]> wrote:
>
> On Wed, 31 Jan 2024 17:30:11 +0800 Lance Yang <[email protected]> wrote:
>
> > Updating the change log.
> >
> > khugepaged scans the entire address space in the
> > background for each given mm, looking for
> > opportunities to merge sequences of basic pages
> > into huge pages. However, when an mm is inserted
> > to the mm_slots list, and the MMF_DISABLE_THP
> > flag is set later, this scanning process becomes
> > unnecessary for that mm and can be skipped to
> > avoid redundant operations, especially in scenarios
> > with a large address space.
> >
> > This commit introduces a check before each scanning
> > process to test the MMF_DISABLE_THP flag for the
> > given mm; if the flag is set, the scanning process is
> > bypassed, thereby improving the efficiency of khugepaged.
> >
> > This is not a correctness fix but an optimization: it saves
> > expensive per-VMA checks when userspace cannot run prctl
> > in the child before it execs into the new process.
> >
> > On some servers within our company, we deploy a
> > daemon responsible for monitoring and updating local
> > applications. Some applications prefer not to use THP,
> > so the daemon calls prctl to disable THP before fork/exec.
> > Conversely, for other applications, the daemon calls prctl
> > to enable THP before fork/exec.
> >
> > Ideally, the daemon should invoke prctl after the fork,
> > but its current implementation follows the described
> > approach. In the Go standard library, there is no direct
> > encapsulation of the fork system call; instead, fork and
> > execve are combined into one through syscall.ForkExec.
>
> I pasted the above into the v1 patch's changelog.
>
> However I'm not seeing a good level of reviewer enthusiasm. Partially
> because of the lack of quantitative testing results. Is it possible to
> generate such results, to give people an overall feel of the
> desirability of this change?
>
Hey,
On an Intel Core i5 CPU, the time taken by
khugepaged to scan the address space of
the process, which has been set with the
MMF_DISABLE_THP flag after being added
to the mm_slots list, is as follows (shorter is better):
VMA Count | Old | New | Change
---------------------------------------
50 | 23us | 9us | -60.9%
100 | 32us | 9us | -71.9%
200 | 44us | 9us | -79.5%
400 | 75us | 9us | -88.0%
800 | 98us | 9us | -90.8%
IIUC, once the count of VMAs for the process
exceeds pages_to_scan, khugepaged needs to
wait for scan_sleep_millisecs ms before scanning
the next process. IMO, unnecessary scans could
actually be skipped with a very inexpensive
mm->flags check in this case.
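A condensed sketch of that control flow (simplified from
khugepaged_do_scan(); the loop structure and arguments are
abridged, not verbatim kernel code):

	unsigned int progress = 0;
	int result = SCAN_SUCCEED;

	/* one round: spend the whole budget on the current mm(s)... */
	while (progress < khugepaged_pages_to_scan)
		progress += khugepaged_scan_mm_slot(
				khugepaged_pages_to_scan - progress,
				&result, &khugepaged_collapse_control);

	/* ...then sleep before the next round reaches the next mm */
	schedule_timeout_interruptible(
		msecs_to_jiffies(khugepaged_scan_sleep_millisecs));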
Best,
Lance
On Wed, Jan 31, 2024 at 5:30 PM Lance Yang <[email protected]> wrote:
>
> Updating the change log.
[...]
> On Mon, Jan 29, 2024 at 1:46 PM Lance Yang <[email protected]> wrote:
> >
> > khugepaged scans the entire address space in the
> > background for each given mm, looking for
> > opportunities to merge sequences of basic pages
> > into huge pages. However, when an mm is inserted
> > to the mm_slots list, and the MMF_DISABLE_THP flag
> > is set later, this scanning process becomes
> > unnecessary for that mm and can be skipped to avoid
> > redundant operations, especially in scenarios with
> > a large address space.
[...]
> > Signed-off-by: Lance Yang <[email protected]>
On 22.02.24 08:51, Lance Yang wrote:
> Hey,
>
> On an Intel Core i5 CPU, the time taken by
> khugepaged to scan the address space of
> the process, which has been set with the
> MMF_DISABLE_THP flag after being added
> to the mm_slots list, is as follows (shorter is better):
>
> VMA Count | Old | New | Change
> ---------------------------------------
> 50 | 23us | 9us | -60.9%
> 100 | 32us | 9us | -71.9%
> 200 | 44us | 9us | -79.5%
> 400 | 75us | 9us | -88.0%
> 800 | 98us | 9us | -90.8%
>
> IIUC, once the count of VMAs for the process
> > exceeds pages_to_scan, khugepaged needs to
> wait for scan_sleep_millisecs ms before scanning
> the next process. IMO, unnecessary scans could
> actually be skipped with a very inexpensive
> mm->flags check in this case.
FWIW
Acked-by: David Hildenbrand <[email protected]>
--
Cheers,
David / dhildenb
Thanks, David!
On Thu, Feb 22, 2024 at 4:51 PM David Hildenbrand <[email protected]> wrote:
>
> On 22.02.24 08:51, Lance Yang wrote:
> > Hey,
> >
> > On an Intel Core i5 CPU, the time taken by
> > khugepaged to scan the address space of
> > the process, which has been set with the
> > MMF_DISABLE_THP flag after being added
> > to the mm_slots list, is as follows (shorter is better):
> >
> > VMA Count | Old | New | Change
> > ---------------------------------------
> > 50 | 23us | 9us | -60.9%
> > 100 | 32us | 9us | -71.9%
> > 200 | 44us | 9us | -79.5%
> > 400 | 75us | 9us | -88.0%
> > 800 | 98us | 9us | -90.8%
> >
> > IIUC, once the count of VMAs for the process
> > exceeds pages_to_scan, khugepaged needs to
> > wait for scan_sleep_millisecs ms before scanning
> > the next process. IMO, unnecessary scans could
> > actually be skipped with a very inexpensive
> > mm->flags check in this case.
>
> FWIW
>
> Acked-by: David Hildenbrand <[email protected]>
>
> --
> Cheers,
>
> David / dhildenb
>
On Wed, Feb 21, 2024 at 11:51 PM Lance Yang <[email protected]> wrote:
>
> Hey,
>
> On an Intel Core i5 CPU, the time taken by
> khugepaged to scan the address space of
> the process, which has been set with the
> MMF_DISABLE_THP flag after being added
> to the mm_slots list, is as follows (shorter is better):
>
> VMA Count | Old | New | Change
> ---------------------------------------
> 50 | 23us | 9us | -60.9%
> 100 | 32us | 9us | -71.9%
> 200 | 44us | 9us | -79.5%
> 400 | 75us | 9us | -88.0%
> 800 | 98us | 9us | -90.8%
>
> IIUC, once the count of VMAs for the process
> exceeds pages_to_scan, khugepaged needs to
> wait for scan_sleep_millisecs ms before scanning
> the next process. IMO, unnecessary scans could
> actually be skipped with a very inexpensive
> mm->flags check in this case.
Thanks for following up on this, can you please capture all the
information in the commit log?
>
> Best,
> Lance
>
> On Wed, Jan 31, 2024 at 5:30 PM Lance Yang <[email protected]> wrote:
> >
> > Updating the change log.
> [...]
>
> > On Mon, Jan 29, 2024 at 1:46 PM Lance Yang <[email protected]> wrote:
> > >
> > > khugepaged scans the entire address space in the
> > > background for each given mm, looking for
> > > opportunities to merge sequences of basic pages
> > > into huge pages. However, when an mm is inserted
> > > to the mm_slots list, and the MMF_DISABLE_THP flag
> > > is set later, this scanning process becomes
> > > unnecessary for that mm and can be skipped to avoid
> > > redundant operations, especially in scenarios with
> > > a large address space.
> [...]
> > > Signed-off-by: Lance Yang <[email protected]>
On Thu, 22 Feb 2024 12:23:21 -0800 Yang Shi <[email protected]> wrote:
> > VMA Count | Old | New | Change
> > ---------------------------------------
> > 50 | 23us | 9us | -60.9%
> > 100 | 32us | 9us | -71.9%
> > 200 | 44us | 9us | -79.5%
> > 400 | 75us | 9us | -88.0%
> > 800 | 98us | 9us | -90.8%
> >
> > IIUC, once the count of VMAs for the process
> > exceeds pages_to_scan, khugepaged needs to
> > wait for scan_sleep_millisecs ms before scanning
> > the next process. IMO, unnecessary scans could
> > actually be skipped with a very inexpensive
> > mm->flags check in this case.
>
> Thanks for following up on this, can you please capture all the
> information in the commit log?
I added it.
--- a/txt/mm-khugepaged-bypassing-unnecessary-scans-with-mmf_disable_thp-check.txt
+++ b/txt/mm-khugepaged-bypassing-unnecessary-scans-with-mmf_disable_thp-check.txt
@@ -9,6 +9,24 @@ and the MMF_DISABLE_THP flag is set later, this scanning process
becomes unnecessary for that mm and can be skipped to avoid redundant
operations, especially in scenarios with a large address space.
+On an Intel Core i5 CPU, the time taken by khugepaged to scan the
+address space of the process, which has been set with the
+MMF_DISABLE_THP flag after being added to the mm_slots list, is as
+follows (shorter is better):
+
+VMA Count | Old | New | Change
+---------------------------------------
+ 50 | 23us | 9us | -60.9%
+ 100 | 32us | 9us | -71.9%
+ 200 | 44us | 9us | -79.5%
+ 400 | 75us | 9us | -88.0%
+ 800 | 98us | 9us | -90.8%
+
+Once the count of VMAs for the process exceeds pages_to_scan, khugepaged
+needs to wait for scan_sleep_millisecs ms before scanning the next
+process. IMO, unnecessary scans could actually be skipped with a very
+inexpensive mm->flags check in this case.
+
This commit introduces a check before each scanning process to test the
MMF_DISABLE_THP flag for the given mm; if the flag is set, the scanning
process is bypassed, thereby improving the efficiency of khugepaged.
Thanks for taking the time to look into this!
Thanks, Yang and Andrew!
Best,
Lance
On Fri, Feb 23, 2024 at 5:11 AM Andrew Morton <[email protected]> wrote:
>
> On Thu, 22 Feb 2024 12:23:21 -0800 Yang Shi <[email protected]> wrote:
>
> > > VMA Count | Old | New | Change
> > > ---------------------------------------
> > > 50 | 23us | 9us | -60.9%
> > > 100 | 32us | 9us | -71.9%
> > > 200 | 44us | 9us | -79.5%
> > > 400 | 75us | 9us | -88.0%
> > > 800 | 98us | 9us | -90.8%
> > >
> > > IIUC, once the count of VMAs for the process
> > > exceeds pages_to_scan, khugepaged needs to
> > > wait for scan_sleep_millisecs ms before scanning
> > > the next process. IMO, unnecessary scans could
> > > actually be skipped with a very inexpensive
> > > mm->flags check in this case.
> >
> > Thanks for following up on this, can you please capture all the
> > information in the commit log?
>
> I added it.
>
> --- a/txt/mm-khugepaged-bypassing-unnecessary-scans-with-mmf_disable_thp-check.txt
> +++ b/txt/mm-khugepaged-bypassing-unnecessary-scans-with-mmf_disable_thp-check.txt
> @@ -9,6 +9,24 @@ and the MMF_DISABLE_THP flag is set later, this scanning process
> becomes unnecessary for that mm and can be skipped to avoid redundant
> operations, especially in scenarios with a large address space.
>
> +On an Intel Core i5 CPU, the time taken by khugepaged to scan the
> +address space of the process, which has been set with the
> +MMF_DISABLE_THP flag after being added to the mm_slots list, is as
> +follows (shorter is better):
> +
> +VMA Count | Old | New | Change
> +---------------------------------------
> + 50 | 23us | 9us | -60.9%
> + 100 | 32us | 9us | -71.9%
> + 200 | 44us | 9us | -79.5%
> + 400 | 75us | 9us | -88.0%
> + 800 | 98us | 9us | -90.8%
> +
> +Once the count of VMAs for the process exceeds pages_to_scan, khugepaged
> +needs to wait for scan_sleep_millisecs ms before scanning the next
> +process. IMO, unnecessary scans could actually be skipped with a very
> +inexpensive mm->flags check in this case.
> +
> This commit introduces a check before each scanning process to test the
> MMF_DISABLE_THP flag for the given mm; if the flag is set, the scanning
> process is bypassed, thereby improving the efficiency of khugepaged.
>
On Sun, Jan 28, 2024 at 9:46 PM Lance Yang <[email protected]> wrote:
>
> khugepaged scans the entire address space in the
> background for each given mm, looking for
> opportunities to merge sequences of basic pages
> into huge pages. However, when an mm is inserted
> to the mm_slots list, and the MMF_DISABLE_THP flag
> is set later, this scanning process becomes
> unnecessary for that mm and can be skipped to avoid
> redundant operations, especially in scenarios with
> a large address space.
>
> This commit introduces a check before each scanning
> process to test the MMF_DISABLE_THP flag for the
> given mm; if the flag is set, the scanning process
> is bypassed, thereby improving the efficiency of
> khugepaged.
>
> Signed-off-by: Lance Yang <[email protected]>
> ---
> mm/khugepaged.c | 18 ++++++++++++------
> 1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 2b219acb528e..d6a700834edc 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
> return atomic_read(&mm->mm_users) == 0;
> }
>
> +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
> +{
> + return hpage_collapse_test_exit(mm) ||
> + test_bit(MMF_DISABLE_THP, &mm->flags);
> +}
> +
> void __khugepaged_enter(struct mm_struct *mm)
> {
> struct khugepaged_mm_slot *mm_slot;
> @@ -1422,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
>
> lockdep_assert_held(&khugepaged_mm_lock);
>
> - if (hpage_collapse_test_exit(mm)) {
> + if (hpage_collapse_test_exit_or_disable(mm)) {
I'm not quite sure whether we should remove the mm from mm_slot and
drop mm_count or not, since clearing the MMF_DISABLE_THP flag doesn't add
the mm back. Creating a new vma may add the mm back, but I don't think
the behavior should depend on this.
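To spell out the concern (a paraphrase; the re-registration path
named below is the existing khugepaged_enter_vma() hook):
deregistration here would be one-way.

	/*
	 * prctl(PR_SET_THP_DISABLE, 1, ...): collect_mm_slot() now
	 * frees the slot and drops the mm reference.
	 * prctl(PR_SET_THP_DISABLE, 0, ...): nothing re-registers
	 * the mm; only a later khugepaged_enter_vma() on a newly
	 * created VMA would add it back.
	 */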
> /* free mm_slot */
> hash_del(&slot->hash);
> list_del(&slot->mm_node);
> @@ -2360,7 +2366,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> goto breakouterloop_mmap_lock;
>
> progress++;
> - if (unlikely(hpage_collapse_test_exit(mm)))
> + if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> goto breakouterloop;
>
> vma_iter_init(&vmi, mm, khugepaged_scan.address);
> @@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> unsigned long hstart, hend;
>
> cond_resched();
> - if (unlikely(hpage_collapse_test_exit(mm))) {
> + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
> progress++;
> break;
> }
> @@ -2390,7 +2396,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> bool mmap_locked = true;
>
> cond_resched();
> - if (unlikely(hpage_collapse_test_exit(mm)))
> + if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> goto breakouterloop;
>
> VM_BUG_ON(khugepaged_scan.address < hstart ||
> @@ -2408,7 +2414,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> fput(file);
> if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> mmap_read_lock(mm);
> - if (hpage_collapse_test_exit(mm))
> + if (hpage_collapse_test_exit_or_disable(mm))
> goto breakouterloop;
> *result = collapse_pte_mapped_thp(mm,
> khugepaged_scan.address, false);
> @@ -2450,7 +2456,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> * Release the current mm_slot if this mm is about to die, or
> * if we scanned all vmas of this mm.
> */
> - if (hpage_collapse_test_exit(mm) || !vma) {
> + if (hpage_collapse_test_exit_or_disable(mm) || !vma) {
Same like the above comment.
In addition, don't you need to change hpage_collapse_test_exit() to
hpage_collapse_test_exit_or_disable() in hugepage_vma_revalidate()?
> /*
> * Make sure that if mm_users is reaching zero while
> * khugepaged runs here, khugepaged_exit will find
> --
> 2.33.1
>
On Thu, Feb 22, 2024 at 1:11 PM Andrew Morton <[email protected]> wrote:
>
> On Thu, 22 Feb 2024 12:23:21 -0800 Yang Shi <[email protected]> wrote:
>
> > > VMA Count | Old | New | Change
> > > ---------------------------------------
> > > 50 | 23us | 9us | -60.9%
> > > 100 | 32us | 9us | -71.9%
> > > 200 | 44us | 9us | -79.5%
> > > 400 | 75us | 9us | -88.0%
> > > 800 | 98us | 9us | -90.8%
> > >
> > > IIUC, once the count of VMAs for the process
> > > exceeds pages_to_scan, khugepaged needs to
> > > wait for scan_sleep_millisecs ms before scanning
> > > the next process. IMO, unnecessary scans could
> > > actually be skipped with a very inexpensive
> > > mm->flags check in this case.
> >
> > Thanks for following up on this, can you please capture all the
> > information in the commit log?
>
> I added it.
>
> --- a/txt/mm-khugepaged-bypassing-unnecessary-scans-with-mmf_disable_thp-check.txt
> +++ b/txt/mm-khugepaged-bypassing-unnecessary-scans-with-mmf_disable_thp-check.txt
> @@ -9,6 +9,24 @@ and the MMF_DISABLE_THP flag is set later, this scanning process
> becomes unnecessary for that mm and can be skipped to avoid redundant
> operations, especially in scenarios with a large address space.
>
> +On an Intel Core i5 CPU, the time taken by khugepaged to scan the
> +address space of the process, which has been set with the
> +MMF_DISABLE_THP flag after being added to the mm_slots list, is as
> +follows (shorter is better):
> +
> +VMA Count | Old | New | Change
> +---------------------------------------
> + 50 | 23us | 9us | -60.9%
> + 100 | 32us | 9us | -71.9%
> + 200 | 44us | 9us | -79.5%
> + 400 | 75us | 9us | -88.0%
> + 800 | 98us | 9us | -90.8%
> +
> +Once the count of VMAs for the process exceeds pages_to_scan, khugepaged
> +needs to wait for scan_sleep_millisecs ms before scanning the next
> +process. IMO, unnecessary scans could actually be skipped with a very
> +inexpensive mm->flags check in this case.
> +
Thanks, Andrew.
> This commit introduces a check before each scanning process to test the
> MMF_DISABLE_THP flag for the given mm; if the flag is set, the scanning
> process is bypassed, thereby improving the efficiency of khugepaged.
>
Hey Yang,
Thanks for taking the time to review!
On Sat, Feb 24, 2024 at 9:46 AM Yang Shi <[email protected]> wrote:
>
> On Sun, Jan 28, 2024 at 9:46 PM Lance Yang <[email protected]> wrote:
> >
> > khugepaged scans the entire address space in the
> > background for each given mm, looking for
> > opportunities to merge sequences of basic pages
> > into huge pages. However, when an mm is inserted
> > to the mm_slots list, and the MMF_DISABLE_THP flag
> > is set later, this scanning process becomes
> > unnecessary for that mm and can be skipped to avoid
> > redundant operations, especially in scenarios with
> > a large address space.
> >
> > This commit introduces a check before each scanning
> > process to test the MMF_DISABLE_THP flag for the
> > given mm; if the flag is set, the scanning process
> > is bypassed, thereby improving the efficiency of
> > khugepaged.
> >
> > Signed-off-by: Lance Yang <[email protected]>
> > ---
> > mm/khugepaged.c | 18 ++++++++++++------
> > 1 file changed, 12 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 2b219acb528e..d6a700834edc 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
> > return atomic_read(&mm->mm_users) == 0;
> > }
> >
> > +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
> > +{
> > + return hpage_collapse_test_exit(mm) ||
> > + test_bit(MMF_DISABLE_THP, &mm->flags);
> > +}
> > +
> > void __khugepaged_enter(struct mm_struct *mm)
> > {
> > struct khugepaged_mm_slot *mm_slot;
> > @@ -1422,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
> >
> > lockdep_assert_held(&khugepaged_mm_lock);
> >
> > - if (hpage_collapse_test_exit(mm)) {
> > + if (hpage_collapse_test_exit_or_disable(mm)) {
>
> I'm not quite sure whether we should remove the mm from mm_slot and
> drop mm_count or not, since clearing the MMF_DISABLE_THP flag doesn't add
> the mm back. Creating a new vma may add the mm back, but I don't think
> the behavior should depend on this.
Thanks for bringing this up! I agree with your point.
I'll remove the MMF_DISABLE_THP flag check in
collect_mm_slot().
>
> > /* free mm_slot */
> > hash_del(&slot->hash);
> > list_del(&slot->mm_node);
> > @@ -2360,7 +2366,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > goto breakouterloop_mmap_lock;
> >
> > progress++;
> > - if (unlikely(hpage_collapse_test_exit(mm)))
> > + if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> > goto breakouterloop;
> >
> > vma_iter_init(&vmi, mm, khugepaged_scan.address);
> > @@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > unsigned long hstart, hend;
> >
> > cond_resched();
> > - if (unlikely(hpage_collapse_test_exit(mm))) {
> > + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
> > progress++;
> > break;
> > }
> > @@ -2390,7 +2396,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > bool mmap_locked = true;
> >
> > cond_resched();
> > - if (unlikely(hpage_collapse_test_exit(mm)))
> > + if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> > goto breakouterloop;
> >
> > VM_BUG_ON(khugepaged_scan.address < hstart ||
> > @@ -2408,7 +2414,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > fput(file);
> > if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> > mmap_read_lock(mm);
> > - if (hpage_collapse_test_exit(mm))
> > + if (hpage_collapse_test_exit_or_disable(mm))
> > goto breakouterloop;
> > *result = collapse_pte_mapped_thp(mm,
> > khugepaged_scan.address, false);
> > @@ -2450,7 +2456,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > * Release the current mm_slot if this mm is about to die, or
> > * if we scanned all vmas of this mm.
> > */
> > - if (hpage_collapse_test_exit(mm) || !vma) {
> > + if (hpage_collapse_test_exit_or_disable(mm) || !vma) {
>
> Same like the above comment.
I'll also remove the MMF_DISABLE_THP flag check here.
>
> In addition, don't you need to change hpage_collapse_test_exit() to
> hpage_collapse_test_exit_or_disable() in hugepage_vma_revalidate()?
Thanks for your suggestion! It makes sense to change
hpage_collapse_test_exit() to hpage_collapse_test_exit_or_disable()
in hugepage_vma_revalidate().
Thanks again for your time!
Lance
>
> > /*
> > * Make sure that if mm_users is reaching zero while
> > * khugepaged runs here, khugepaged_exit will find
> > --
> > 2.33.1
> >
As a diff against mm-stable:
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2771fc043b3b..1c0073daad82 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -920,7 +920,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
{
struct vm_area_struct *vma;
- if (unlikely(hpage_collapse_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
@@ -1428,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
lockdep_assert_held(&khugepaged_mm_lock);
- if (hpage_collapse_test_exit_or_disable(mm)) {
+ if (hpage_collapse_test_exit(mm)) {
/* free mm_slot */
hash_del(&slot->hash);
list_del(&slot->mm_node);
@@ -2456,7 +2456,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
*/
- if (hpage_collapse_test_exit_or_disable(mm) || !vma) {
+ if (hpage_collapse_test_exit(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find