2022-11-29 16:23:40

by Jann Horn

Subject: [PATCH v5 1/3] mm/khugepaged: Take the right locks for page table retraction

Page table walks on address ranges mapped by VMAs can be done under the mmap
lock, the lock of an anon_vma attached to the VMA, or the lock of the VMA's
address_space. Only one of these locks needs to be held, and it does not need
to be held in exclusive mode.

Under those circumstances, the rules for concurrent access to page table
entries are:

- Terminal page table entries (entries that don't point to another page
  table) can be arbitrarily changed under the page table lock, with the
  exception that they always need to be consistent for hardware page
  table walks and lockless_pages_from_mm(). They can even be changed
  into non-terminal entries.
- Non-terminal page table entries (which point to another page table)
  cannot be modified; readers are allowed to READ_ONCE() an entry,
  verify that it is non-terminal, and then assume that its value will
  stay as-is (see the sketch below).
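
A rough sketch, not taken from the patch, of how a walker holding one of
those locks can rely on the second rule. The helper name is made up and
the checks are simplified; real walkers also have to deal with huge,
devmap and swap entries:

  /*
   * Illustration only: return the PTE slot under @pmdp for @addr,
   * assuming the caller holds the mmap lock, the anon_vma lock or the
   * i_mmap lock covering this range.
   */
  static pte_t *example_walk_pte(pmd_t *pmdp, unsigned long addr)
  {
      pmd_t pmdval = READ_ONCE(*pmdp);

      /* Empty or terminal entries may change at any time; bail out. */
      if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
          return NULL;

      /*
       * Non-terminal entry: its value can be assumed to stay as-is,
       * so the PTE page it points to can be walked via the local
       * copy without re-reading *pmdp.
       */
      return pte_offset_map(&pmdval, addr);
  }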

Retracting a page table involves modifying a non-terminal entry, so
page-table-level locks are insufficient to protect against concurrent
page table traversal; it requires taking all the higher-level locks under
which it is possible to start a page walk in the relevant range in
exclusive mode.
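
For a file-backed VMA, the locking around the actual retraction
therefore ends up looking roughly like this (condensed from the
collapse_pte_mapped_thp() changes below, not new code):

  mmap_assert_write_locked(mm);
  /* block rmap-based page table walks that go through the file mapping */
  i_mmap_lock_write(vma->vm_file->f_mapping);

  /* ... pmdp_collapse_flush(), free the detached PTE page table ... */

  i_mmap_unlock_write(vma->vm_file->f_mapping);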

The collapse_huge_page() path for anonymous THP already follows this rule,
but the shmem/file THP path was getting it wrong, making it possible for
concurrent rmap-based operations to cause corruption.

Cc: [email protected]
Fixes: 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP")
Acked-by: David Hildenbrand <[email protected]>
Reviewed-by: Yang Shi <[email protected]>
Signed-off-by: Jann Horn <[email protected]>
---

Notes:
v4: added ack by David Hildenbrand
v5: added reviewed-by by Yang Shi

mm/khugepaged.c | 55 +++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4734315f79407..674b111a24fa7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1384,16 +1384,37 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
return SCAN_SUCCEED;
}

+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ * - modifying terminal entries (ones that point to a data page, not to another
+ * page table)
+ * - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- spinlock_t *ptl;
pmd_t pmd;

mmap_assert_write_locked(mm);
- ptl = pmd_lock(vma->vm_mm, pmdp);
+ if (vma->vm_file)
+ lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+ /*
+ * All anon_vmas attached to the VMA have the same root and are
+ * therefore locked by the same lock.
+ */
+ if (vma->anon_vma)
+ lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
pmd = pmdp_collapse_flush(vma, addr, pmdp);
- spin_unlock(ptl);
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pmd);
pte_free(mm, pmd_pgtable(pmd));
@@ -1444,6 +1465,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
return SCAN_VMA_CHECK;

+ /*
+ * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+ * that got written to. Without this, we'd have to also lock the
+ * anon_vma if one exists.
+ */
+ if (vma->anon_vma)
+ return SCAN_VMA_CHECK;
+
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
@@ -1477,6 +1506,20 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}

+ /*
+ * We need to lock the mapping so that from here on, only GUP-fast and
+ * hardware page walks can access the parts of the page tables that
+ * we're operating on.
+ * See collapse_and_free_pmd().
+ */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ /*
+ * This spinlock should be unnecessary: Nobody else should be accessing
+ * the page tables under spinlock protection here, only
+ * lockless_pages_from_mm() and the hardware page walker can access page
+ * tables while all the high-level locks are held in write mode.
+ */
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
result = SCAN_FAIL;

@@ -1531,6 +1574,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 4: remove pte entries */
collapse_and_free_pmd(mm, vma, haddr, pmd);

+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
@@ -1544,6 +1589,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,

abort:
pte_unmap_unlock(start_pte, ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}

@@ -1600,7 +1646,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* An alternative would be drop the check, but check that page
* table is clear before calling pmdp_collapse_flush() under
* ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too.
+ * has higher cost too. It would also probably require locking
+ * the anon_vma.
*/
if (vma->anon_vma) {
result = SCAN_PAGE_ANON;

base-commit: eb7081409f94a9a8608593d0fb63a1aa3d6f95d8
--
2.38.1.584.g0f3c55d4c2-goog


2022-11-29 16:25:37

by Jann Horn

Subject: [PATCH v5 2/3] mm/khugepaged: Fix GUP-fast interaction by sending IPI

The khugepaged paths that remove page tables have to be careful to
synchronize against the lockless_pages_from_mm() path, which traverses
page tables while only being protected by disabled IRQs.
lockless_pages_from_mm() must not:

1. interpret the contents of freed memory as page tables (and once a
page table has been deposited, it can be freed)
2. interpret the contents of deposited page tables as PTEs, since some
architectures will store non-PTE data inside deposited page tables
(see radix__pgtable_trans_huge_deposit())
3. create new page references from PTEs after the containing page
table has been detached and:
3a. __collapse_huge_page_isolate() has checked the page refcount
3b. the page table has been reused at another virtual address and
populated with new PTEs

("new page references" here refer to stable references returned to the
caller; speculative references that are dropped on an error path are
fine)
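
For context, lockless_pages_from_mm() only disables IRQs around the
walk; heavily simplified (the real code in mm/gup.c also handles
seqcount-based retries and many more entry types):

  unsigned long flags;
  int nr_pinned = 0;

  local_irq_save(flags);
  gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
  local_irq_restore(flags);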

commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP
collapse") addressed issue 3 by making the lockless_pages_from_mm()
fastpath recheck the pmd_t to ensure that the page table was not
removed by khugepaged in between (under the assumption that the page
table is not repeatedly moving back and forth between two addresses,
with one PTE repeatedly being populated with the same value).
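
Paraphrasing that recheck as it sits in the GUP-fast PTE walk
(simplified from gup_pte_range() in mm/gup.c; surrounding code and
error handling omitted):

  /* after taking a reference on the folio the PTE points to: */
  if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
      unlikely(pte_val(pte) != pte_val(*ptep))) {
      /* the page table or the PTE changed under us: back off */
      gup_put_folio(folio, 1, flags);
      goto pte_unmap;
  }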

But to address issues 1 and 2, we need to send IPIs before
freeing/reusing page tables. By doing that, issue 3 is also
automatically addressed, so the fix from commit 70cbc3cc78a99 ("mm: gup:
fix the fast GUP race against THP collapse") becomes redundant.

We can ensure that the necessary IPI is sent by calling
tlb_remove_table_sync_one() because, as noted in mm/gup.c, under
configurations that define CONFIG_HAVE_FAST_GUP, there are two possible
cases:

1. CONFIG_MMU_GATHER_RCU_TABLE_FREE is set, causing
tlb_remove_table_sync_one() to send an IPI to synchronize with
lockless_pages_from_mm().
2. CONFIG_MMU_GATHER_RCU_TABLE_FREE is unset, indicating that all
TLB flushes are already guaranteed to send IPIs.
tlb_remove_table_sync_one() will do nothing, but we've already
run pmdp_collapse_flush(), which did a TLB flush, which must have
involved IPIs.
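
In the first case above, the synchronization boils down to a synchronous
empty IPI broadcast, which cannot complete while any other CPU is still
inside an IRQs-off lockless walk; roughly what mm/mmu_gather.c does:

  static void tlb_remove_table_smp_sync(void *arg)
  {
      /* Simply deliver the interrupt */
  }

  void tlb_remove_table_sync_one(void)
  {
      /*
       * Wait until every other CPU has handled the IPI; any
       * IRQs-disabled lockless page table walk that started before
       * the PMD was cleared must have finished by then.
       */
      smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
  }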

Cc: [email protected]
Fixes: ba76149f47d8 ("thp: khugepaged")
Reviewed-by: Yang Shi <[email protected]>
Acked-by: David Hildenbrand <[email protected]>
Signed-off-by: Jann Horn <[email protected]>
---

Notes:
v4:
- added ack from David Hildenbrand
- made commit message more verbose
v5:
- added reviewed-by from Yang Shi
- rewrote commit message based on feedback from Yang Shi

include/asm-generic/tlb.h | 4 ++++
mm/khugepaged.c | 2 ++
mm/mmu_gather.c | 4 +---
3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 492dce43236ea..cab7cfebf40bd 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -222,12 +222,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
#define tlb_needs_table_invalidate() (true)
#endif

+void tlb_remove_table_sync_one(void);
+
#else

#ifdef tlb_needs_table_invalidate
#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
#endif

+static inline void tlb_remove_table_sync_one(void) { }
+
#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */


diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 674b111a24fa7..c3d3ce596bff7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1057,6 +1057,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();

spin_lock(pte_ptl);
result = __collapse_huge_page_isolate(vma, address, pte, cc,
@@ -1415,6 +1416,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
lockdep_assert_held_write(&vma->anon_vma->root->rwsem);

pmd = pmdp_collapse_flush(vma, addr, pmdp);
+ tlb_remove_table_sync_one();
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pmd);
pte_free(mm, pmd_pgtable(pmd));
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index add4244e5790d..3a2c3f8cad2fe 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -153,7 +153,7 @@ static void tlb_remove_table_smp_sync(void *arg)
/* Simply deliver the interrupt */
}

-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
@@ -177,8 +177,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

-static void tlb_remove_table_sync_one(void) { }
-
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
__tlb_remove_table_free(batch);
--
2.38.1.584.g0f3c55d4c2-goog

2022-11-29 17:49:56

by Yang Shi

Subject: Re: [PATCH v5 2/3] mm/khugepaged: Fix GUP-fast interaction by sending IPI

On Tue, Nov 29, 2022 at 7:47 AM Jann Horn <[email protected]> wrote:
>
> The khugepaged paths that remove page tables have to be careful to
> synchronize against the lockless_pages_from_mm() path, which traverses
> page tables while only being protected by disabled IRQs.
> lockless_pages_from_mm() must not:
>
> 1. interpret the contents of freed memory as page tables (and once a
> page table has been deposited, it can be freed)
> 2. interpret the contents of deposited page tables as PTEs, since some
> architectures will store non-PTE data inside deposited page tables
> (see radix__pgtable_trans_huge_deposit())
> 3. create new page references from PTEs after the containing page
> table has been detached and:
> 3a. __collapse_huge_page_isolate() has checked the page refcount
> 3b. the page table has been reused at another virtual address and
> populated with new PTEs
>
> ("new page references" here refer to stable references returned to the
> caller; speculative references that are dropped on an error path are
> fine)
>
> commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP
> collapse") addressed issue 3 by making the lockless_pages_from_mm()
> fastpath recheck the pmd_t to ensure that the page table was not
> removed by khugepaged in between (under the assumption that the page
> table is not repeatedly moving back and forth between two addresses,
> with one PTE repeatedly being populated with the same value).
>
> But to address issues 1 and 2, we need to send IPIs before
> freeing/reusing page tables. By doing that, issue 3 is also
> automatically addressed, so the fix from commit 70cbc3cc78a99 ("mm: gup:
> fix the fast GUP race against THP collapse") becomes redundant.
>
> We can ensure that the necessary IPI is sent by calling
> tlb_remove_table_sync_one() because, as noted in mm/gup.c, under
> configurations that define CONFIG_HAVE_FAST_GUP, there are two possible
> cases:
>
> 1. CONFIG_MMU_GATHER_RCU_TABLE_FREE is set, causing
> tlb_remove_table_sync_one() to send an IPI to synchronize with
> lockless_pages_from_mm().
> 2. CONFIG_MMU_GATHER_RCU_TABLE_FREE is unset, indicating that all
> TLB flushes are already guaranteed to send IPIs.
> tlb_remove_table_sync_one() will do nothing, but we've already
> run pmdp_collapse_flush(), which did a TLB flush, which must have
> involved IPIs.
>
> Cc: [email protected]
> Fixes: ba76149f47d8 ("thp: khugepaged")
> Reviewed-by: Yang Shi <[email protected]>
> Acked-by: David Hildenbrand <[email protected]>
> Signed-off-by: Jann Horn <[email protected]>
> ---
>
> Notes:
> v4:
> - added ack from David Hildenbrand
> - made commit message more verbose
> v5:
> - added reviewed-by from Yang Shi
> - rewrote commit message based on feedback from Yang Shi

Thanks, Jann. Looks good to me.
