LinuxLists.cc - [PATCH v4 6/8] mm/gup: Retire follow_hugetlb

2023-06-28 22:08:16

Subject: [PATCH v4 6/8] mm/gup: Retire follow_hugetlb_page()

Now __get_user_pages() should be well prepared to handle thp completely,
as long as hugetlb gup requests even without the hugetlb's special path.

Time to retire follow_hugetlb_page().

Tweak misc comments to reflect reality of follow_hugetlb_page()'s removal.

Acked-by: David Hildenbrand <[email protected]>
Signed-off-by: Peter Xu <[email protected]>
---
fs/userfaultfd.c | 2 +-
include/linux/hugetlb.h | 12 ---
mm/gup.c | 19 ----
mm/hugetlb.c | 224 ----------------------------------------
4 files changed, 1 insertion(+), 256 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..ae711f1d7a83 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*
* We also don't do userfault handling during
* coredumping. hugetlbfs has the special
- * follow_hugetlb_page() to skip missing pages in the
+ * hugetlb_follow_page_mask() to skip missing pages in the
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
* the no_page_table() helper in follow_page_mask(), but the
* shmem_vm_ops->fault method is invoked even during
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 9f282f370d96..9bc3c2d71b71 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned int *page_mask);
-long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
- struct page **, unsigned long *, unsigned long *,
- long, unsigned int, int *);
void unmap_hugepage_range(struct vm_area_struct *,
unsigned long, unsigned long, struct page *,
zap_flags_t);
@@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask(
BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
}

-static inline long follow_hugetlb_page(struct mm_struct *mm,
- struct vm_area_struct *vma, struct page **pages,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *nonblocking)
-{
- BUG();
- return 0;
-}
-
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
struct mm_struct *src,
struct vm_area_struct *dst_vma,
diff --git a/mm/gup.c b/mm/gup.c
index 0e2b0ff1143a..a7c294de6ae5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -775,9 +775,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
* Call hugetlb_follow_page_mask for hugetlb vmas as it will use
* special hugetlb page table walking code. This eliminates the
* need to check for hugetlb entries in the general walking code.
- *
- * hugetlb_follow_page_mask is only for follow_page() handling here.
- * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
*/
if (is_vm_hugetlb_page(vma))
return hugetlb_follow_page_mask(vma, address, flags,
@@ -1138,22 +1135,6 @@ static long __get_user_pages(struct mm_struct *mm,
ret = check_vma_flags(vma, gup_flags);
if (ret)
goto out;
-
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages,
- &start, &nr_pages, i,
- gup_flags, locked);
- if (!*locked) {
- /*
- * We've got a VM_FAULT_RETRY
- * and we've lost mmap_lock.
- * We must stop here.
- */
- BUG_ON(gup_flags & FOLL_NOWAIT);
- goto out;
- }
- continue;
- }
}
retry:
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 15e82a8a2b76..2f12da409a19 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5721,7 +5721,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,

/*
* Return whether there is a pagecache page to back given address within VMA.
- * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
*/
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
@@ -6422,37 +6421,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
#endif /* CONFIG_USERFAULTFD */

-static void record_subpages(struct page *page, struct vm_area_struct *vma,
- int refs, struct page **pages)
-{
- int nr;
-
- for (nr = 0; nr < refs; nr++) {
- if (likely(pages))
- pages[nr] = nth_page(page, nr);
- }
-}
-
-static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
- unsigned int flags, pte_t *pte,
- bool *unshare)
-{
- pte_t pteval = huge_ptep_get(pte);
-
- *unshare = false;
- if (is_swap_pte(pteval))
- return true;
- if (huge_pte_write(pteval))
- return false;
- if (flags & FOLL_WRITE)
- return true;
- if (gup_must_unshare(vma, flags, pte_page(pteval))) {
- *unshare = true;
- return true;
- }
- return false;
-}
-
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned int *page_mask)
@@ -6524,198 +6492,6 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
return page;
}

-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, unsigned long *position,
- unsigned long *nr_pages, long i, unsigned int flags,
- int *locked)
-{
- unsigned long pfn_offset;
- unsigned long vaddr = *position;
- unsigned long remainder = *nr_pages;
- struct hstate *h = hstate_vma(vma);
- int err = -EFAULT, refs;
-
- while (vaddr < vma->vm_end && remainder) {
- pte_t *pte;
- spinlock_t *ptl = NULL;
- bool unshare = false;
- int absent;
- struct page *page;
-
- /*
- * If we have a pending SIGKILL, don't keep faulting pages and
- * potentially allocating memory.
- */
- if (fatal_signal_pending(current)) {
- remainder = 0;
- break;
- }
-
- hugetlb_vma_lock_read(vma);
- /*
- * Some archs (sparc64, sh*) have multiple pte_ts to
- * each hugepage. We have to make sure we get the
- * first, for the page indexing below to work.
- *
- * Note that page table lock is not held when pte is null.
- */
- pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
- huge_page_size(h));
- if (pte)
- ptl = huge_pte_lock(h, mm, pte);
- absent = !pte || huge_pte_none(huge_ptep_get(pte));
-
- /*
- * When coredumping, it suits get_dump_page if we just return
- * an error where there's an empty slot with no huge pagecache
- * to back it. This way, we avoid allocating a hugepage, and
- * the sparse dumpfile avoids allocating disk blocks, but its
- * huge holes still show up with zeroes where they need to be.
- */
- if (absent && (flags & FOLL_DUMP) &&
- !hugetlbfs_pagecache_present(h, vma, vaddr)) {
- if (pte)
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- remainder = 0;
- break;
- }
-
- /*
- * We need call hugetlb_fault for both hugepages under migration
- * (in which case hugetlb_fault waits for the migration,) and
- * hwpoisoned hugepages (in which case we need to prevent the
- * caller from accessing to them.) In order to do this, we use
- * here is_swap_pte instead of is_hugetlb_entry_migration and
- * is_hugetlb_entry_hwpoisoned. This is because it simply covers
- * both cases, and because we can't follow correct pages
- * directly from any kind of swap entries.
- */
- if (absent ||
- __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
- vm_fault_t ret;
- unsigned int fault_flags = 0;
-
- if (pte)
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
-
- if (flags & FOLL_WRITE)
- fault_flags |= FAULT_FLAG_WRITE;
- else if (unshare)
- fault_flags |= FAULT_FLAG_UNSHARE;
- if (locked) {
- fault_flags |= FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_KILLABLE;
- if (flags & FOLL_INTERRUPTIBLE)
- fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
- }
- if (flags & FOLL_NOWAIT)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT;
- if (flags & FOLL_TRIED) {
- /*
- * Note: FAULT_FLAG_ALLOW_RETRY and
- * FAULT_FLAG_TRIED can co-exist
- */
- fault_flags |= FAULT_FLAG_TRIED;
- }
- ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
- if (ret & VM_FAULT_ERROR) {
- err = vm_fault_to_errno(ret, flags);
- remainder = 0;
- break;
- }
- if (ret & VM_FAULT_RETRY) {
- if (locked &&
- !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
- *locked = 0;
- *nr_pages = 0;
- /*
- * VM_FAULT_RETRY must not return an
- * error, it will return zero
- * instead.
- *
- * No need to update "position" as the
- * caller will not check it after
- * *nr_pages is set to 0.
- */
- return i;
- }
- continue;
- }
-
- pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
- page = pte_page(huge_ptep_get(pte));
-
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
-
- /*
- * If subpage information not requested, update counters
- * and skip the same_page loop below.
- */
- if (!pages && !pfn_offset &&
- (vaddr + huge_page_size(h) < vma->vm_end) &&
- (remainder >= pages_per_huge_page(h))) {
- vaddr += huge_page_size(h);
- remainder -= pages_per_huge_page(h);
- i += pages_per_huge_page(h);
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- continue;
- }
-
- /* vaddr may not be aligned to PAGE_SIZE */
- refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
- (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
-
- if (pages)
- record_subpages(nth_page(page, pfn_offset),
- vma, refs,
- likely(pages) ? pages + i : NULL);
-
- if (pages) {
- /*
- * try_grab_folio() should always succeed here,
- * because: a) we hold the ptl lock, and b) we've just
- * checked that the huge page is present in the page
- * tables. If the huge page is present, then the tail
- * pages must also be present. The ptl prevents the
- * head page and tail pages from being rearranged in
- * any way. As this is hugetlb, the pages will never
- * be p2pdma or not longterm pinable. So this page
- * must be available at this point, unless the page
- * refcount overflowed:
- */
- if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
- flags))) {
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- remainder = 0;
- err = -ENOMEM;
- break;
- }
- }
-
- vaddr += (refs << PAGE_SHIFT);
- remainder -= refs;
- i += refs;
-
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- }
- *nr_pages = remainder;
- /*
- * setting position is actually required only if remainder is
- * not zero but it's faster not to add a "if (remainder)"
- * branch.
- */
- *position = vaddr;
-
- return i ? i : err;
-}
-
long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
--
2.41.0