2023-09-26 03:21:58

by Rik van Riel

Subject: [PATCH v4 0/3] hugetlbfs: close race between MADV_DONTNEED and page fault

v4: fix unmap_vmas locking issue pointed out by Mike Kravetz, and resulting lockdep fallout
v3: fix compile error w/ lockdep and test case errors with patch 3
v2: fix the locking bug found with the libhugetlbfs tests.

Malloc libraries, like jemalloc and tcmalloc, make decisions on when
to call madvise independently from the code in the main application.

This sometimes results in the application page faulting on an address
right after the malloc library has shot down the backing memory with
MADV_DONTNEED.
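
As an illustration only (not part of this series), the pattern that
triggers this boils down to the sequence below; the 2MB size and the
explicit MAP_HUGETLB mapping are just stand-ins for whatever the
malloc library does internally:

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

#define LEN (2UL * 1024 * 1024)		/* one 2MB huge page */

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 1, LEN);		/* fault the huge page in */
	madvise(p, LEN, MADV_DONTNEED);	/* "library" frees the backing page */
	p[0] = 2;			/* application faults right behind it */

	munmap(p, LEN);
	return 0;
}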

Usually this is harmless, because we always have some 4kB pages
sitting around to satisfy a page fault. However, with hugetlbfs,
systems often allocate only the exact number of huge pages that
the application wants.

Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
any lock taken on the page fault path, which can open up the following
race condition:

    CPU 1                           CPU 2

    MADV_DONTNEED
      unmap page
      shoot down TLB entry
                                    page fault
                                    fail to allocate a huge page
                                    killed with SIGBUS
      free page

Fix that race by extending the hugetlb_vma_lock locking scheme to also
cover private hugetlb mappings (with resv_map), and pulling the locking
from __unmap_hugepage_range_final into helper functions called from
zap_page_range_single. This ensures page faults stay locked out of
the MADV_DONTNEED VMA until the huge pages have actually been freed.
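
In code terms, the ordering on the MADV_DONTNEED side ends up looking
roughly like the sketch below. This is an illustration only: the
function name is made up, the mmu notifier and other details are
omitted, and the exact helper placement differs in the actual patches.

/* Ordering sketch only, not the real diff. */
static void madvise_dontneed_hugetlb_sketch(struct vm_area_struct *vma,
					    unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, vma->vm_mm);
	hugetlb_vma_lock_write(vma);		/* page faults now wait on the VMA lock */
	__unmap_hugepage_range(&tlb, vma, start, end, NULL, 0);
	tlb_finish_mmu(&tlb);			/* the huge pages are actually freed here */
	hugetlb_vma_unlock_write(vma);		/* only now can a fault allocate again */
}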

The third patch in the series is more of an RFC. Using the
invalidate_lock instead of the hugetlb_vma_lock greatly simplifies
the code, but at the cost of turning a per-VMA lock into a lock
per backing hugetlbfs file, which could slow things down when
multiple processes are mapping the same hugetlbfs file.
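
For comparison, that RFC variant has roughly the shape below (sketch
only, using the existing filemap_invalidate_lock helpers; see the
actual patch for the real changes). The lock lives in the hugetlbfs
inode's address_space, so it is shared by every mapping of that file:

/* MADV_DONTNEED / truncate side: one lock per hugetlbfs file */
struct address_space *mapping = vma->vm_file->f_mapping;

filemap_invalidate_lock(mapping);
/* unmap the range and free the huge pages */
filemap_invalidate_unlock(mapping);

/* hugetlb page fault side */
filemap_invalidate_lock_shared(mapping);
/* allocate and map a huge page */
filemap_invalidate_unlock_shared(mapping);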



2023-09-26 03:22:12

by Rik van Riel

Subject: [PATCH 1/3] hugetlbfs: extend hugetlb_vma_lock to private VMAs

From: Rik van Riel <[email protected]>

Extend the locking scheme used to protect shared hugetlb mappings
from truncate vs page fault races, in order to protect private
hugetlb mappings (with resv_map) against MADV_DONTNEED.

Add a read-write semaphore to the resv_map data structure, and
use that from the hugetlb_vma_(un)lock_* functions, in preparation
for closing the race between MADV_DONTNEED and page faults.

Signed-off-by: Rik van Riel <[email protected]>
Reviewed-by: Mike Kravetz <[email protected]>
---
 include/linux/hugetlb.h |  6 ++++++
 mm/hugetlb.c            | 41 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5b2626063f4f..694928fa06a3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -60,6 +60,7 @@ struct resv_map {
 	long adds_in_progress;
 	struct list_head region_cache;
 	long region_cache_count;
+	struct rw_semaphore rw_sema;
 #ifdef CONFIG_CGROUP_HUGETLB
 	/*
 	 * On private mappings, the counter to uncharge reservations is stored
@@ -1231,6 +1232,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
 	return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
 }
 
+static inline bool __vma_private_lock(struct vm_area_struct *vma)
+{
+	return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data;
+}
+
 /*
  * Safe version of huge_pte_offset() to check the locks. See comments
  * above huge_pte_offset().
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ba6d39b71cb1..e859fba5bc7d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end);
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		down_read(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		down_read(&resv_map->rw_sema);
 	}
 }

@@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		up_read(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		up_read(&resv_map->rw_sema);
 	}
 }

@@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		down_write(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		down_write(&resv_map->rw_sema);
 	}
 }

@@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		up_write(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		up_write(&resv_map->rw_sema);
 	}
 }
 
 int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
 {
-	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
-	if (!__vma_shareable_lock(vma))
-		return 1;
+	if (__vma_shareable_lock(vma)) {
+		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
-	return down_write_trylock(&vma_lock->rw_sema);
+		return down_write_trylock(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		return down_write_trylock(&resv_map->rw_sema);
+	}
+
+	return 1;
 }

 void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
@@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		lockdep_assert_held(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		lockdep_assert_held(&resv_map->rw_sema);
 	}
 }

@@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		__hugetlb_vma_unlock_write_put(vma_lock);
+	} else {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		/* no free for anon vmas, but still need to unlock */
+		up_write(&resv_map->rw_sema);
 	}
 }

@@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void)
 	kref_init(&resv_map->refs);
 	spin_lock_init(&resv_map->lock);
 	INIT_LIST_HEAD(&resv_map->regions);
+	init_rwsem(&resv_map->rw_sema);
 
 	resv_map->adds_in_progress = 0;
 	/*
--
2.41.0

2023-09-30 10:23:30

by Mike Kravetz

Subject: Re: [PATCH 1/3] hugetlbfs: extend hugetlb_vma_lock to private VMAs

On 09/25/23 23:10, [email protected] wrote:
> From: Rik van Riel <[email protected]>
>
> Extend the locking scheme used to protect shared hugetlb mappings
> from truncate vs page fault races, in order to protect private
> hugetlb mappings (with resv_map) against MADV_DONTNEED.
>
> Add a read-write semaphore to the resv_map data structure, and
> use that from the hugetlb_vma_(un)lock_* functions, in preparation
> for closing the race between MADV_DONTNEED and page faults.
>
> Signed-off-by: Rik van Riel <[email protected]>
> Reviewed-by: Mike Kravetz <[email protected]>
> ---
>  include/linux/hugetlb.h |  6 ++++++
>  mm/hugetlb.c            | 41 +++++++++++++++++++++++++++++++++++++----
>  2 files changed, 43 insertions(+), 4 deletions(-)

My bad during the review of patch 2!

In reply to patch 1, I suggested the changes:

> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index f906c5fa4d09..8f3d5895fffc 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -372,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
>  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>
>  		__hugetlb_vma_unlock_write_put(vma_lock);
> +	} else if (__vma_private_lock(vma)) {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		/* no free for anon vmas, but still need to unlock */
> +		up_write(&resv_map->rw_sema);
>  	}
>  }

However, the check for 'if (__vma_private_lock(vma))' was dropped.

> @@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
>  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>
>  		__hugetlb_vma_unlock_write_put(vma_lock);
> +	} else {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		/* no free for anon vmas, but still need to unlock */
> +		up_write(&resv_map->rw_sema);
>  	}
>  }

So, the map_high_truncate_2 (2M: 32) libhugetlbfs test still BUGs with:

BUG: kernel NULL pointer dereference
--
Mike Kravetz

2023-09-30 19:49:07

by Rik van Riel

Subject: Re: [PATCH 1/3] hugetlbfs: extend hugetlb_vma_lock to private VMAs

On Fri, 2023-09-29 at 19:28 -0700, Mike Kravetz wrote:
> On 09/25/23 23:10, [email protected] wrote:
>
>
> In reply to patch 1, I suggested the changes:
>
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index f906c5fa4d09..8f3d5895fffc 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -372,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
> >  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
> >
> >  		__hugetlb_vma_unlock_write_put(vma_lock);
> > +	} else if (__vma_private_lock(vma)) {
> > +		struct resv_map *resv_map = vma_resv_map(vma);
> > +
> > +		/* no free for anon vmas, but still need to unlock */
> > +		up_write(&resv_map->rw_sema);
> >  	}
> >  }
>
> However, the check for 'if (__vma_private_lock(vma))' was dropped.

Oh ugh. I definitely added that in somewhere, but must
have committed that to the wrong git branch :(

Let me send out a new version with that change.

Sorry about that.

--
All Rights Reversed.