2024-04-01 20:27:05

by Vishal Moola

Subject: [PATCH v2 0/3] Hugetlb fault path to use struct vm_fault

This patchset converts the hugetlb fault path to use struct vm_fault.
This helps make the code more readable, and alleviates the stack by
allowing us to consolidate many fault-related variables into an
individual pointer.
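
For readers less familiar with the pattern, here is a minimal userspace
sketch of what that consolidation means in practice. This is not kernel
code: struct fault_ctx and the no_page_old()/no_page_new() helpers below
are made-up stand-ins for struct vm_fault and the hugetlb fault helpers,
kept only to show a long argument list collapsing into one pointer.

#include <stdio.h>

struct fault_ctx {			/* stand-in for struct vm_fault */
	unsigned long address;		/* huge-page-aligned fault address */
	unsigned long real_address;	/* original faulting address */
	unsigned long pgoff;		/* offset within the mapping */
	unsigned int flags;		/* FAULT_FLAG_*-style flags */
};

/* Before: every per-fault value is a separate argument (and local). */
static int no_page_old(unsigned long address, unsigned long real_address,
		       unsigned long pgoff, unsigned int flags)
{
	printf("old: addr=%#lx real=%#lx pgoff=%lu flags=%#x\n",
	       address, real_address, pgoff, flags);
	return 0;
}

/* After: one pointer carries the whole fault state down the call chain. */
static int no_page_new(const struct fault_ctx *ctx)
{
	printf("new: addr=%#lx real=%#lx pgoff=%lu flags=%#x\n",
	       ctx->address, ctx->real_address, ctx->pgoff, ctx->flags);
	return 0;
}

int main(void)
{
	struct fault_ctx ctx = {
		.address	= 0x200000,
		.real_address	= 0x200123,
		.pgoff		= 1,
		.flags		= 0x01,
	};

	no_page_old(ctx.address, ctx.real_address, ctx.pgoff, ctx.flags);
	return no_page_new(&ctx);
}

Each callee now takes a single pointer instead of several scalars, which
is where the stack relief described above comes from.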
----
v2:
- renamed patchset from 'Define struct vm_fault in handle_mm_fault()'
- Dropped patches 4/5 - These allowed vmf->{address,pgoff} to be
  modified, but that allows misuse of these fields. Converting hugetlb
  to use the same address/pgoff as generic mm is not simple, so that
  can be done later.

Vishal Moola (Oracle) (3):
hugetlb: Convert hugetlb_fault() to use struct vm_fault
hugetlb: Convert hugetlb_no_page() to use struct vm_fault
hugetlb: Convert hugetlb_wp() to use struct vm_fault

mm/hugetlb.c | 194 +++++++++++++++++++++++++--------------------------
1 file changed, 95 insertions(+), 99 deletions(-)

--
2.43.0



2024-04-01 20:27:27

by Vishal Moola

Subject: [PATCH v2 2/3] hugetlb: Convert hugetlb_no_page() to use struct vm_fault

hugetlb_no_page() can use the struct vm_fault passed in from
hugetlb_fault(). This alleviates the stack by consolidating 7
variables into a single struct.

Signed-off-by: Vishal Moola (Oracle) <[email protected]>
---
mm/hugetlb.c | 59 ++++++++++++++++++++++++++--------------------------
1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 360b82374a89..aca2f11b4138 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6189,9 +6189,7 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,

static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
- struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep,
- pte_t old_pte, unsigned int flags,
+ struct address_space *mapping,
struct vm_fault *vmf)
{
struct hstate *h = hstate_vma(vma);
@@ -6200,10 +6198,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
unsigned long size;
struct folio *folio;
pte_t new_pte;
- spinlock_t *ptl;
- unsigned long haddr = address & huge_page_mask(h);
bool new_folio, new_pagecache_folio = false;
- u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
+ u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);

/*
* Currently, we are forced to kill the process in the event the
@@ -6222,10 +6218,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* before we get page_table_lock.
*/
new_folio = false;
- folio = filemap_lock_hugetlb_folio(h, mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);
if (IS_ERR(folio)) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
+ if (vmf->pgoff >= size)
goto out;
/* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
@@ -6246,7 +6242,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* never happen on the page after UFFDIO_COPY has
* correctly installed the page and returned.
*/
- if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {
ret = 0;
goto out;
}
@@ -6255,7 +6251,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
VM_UFFD_MISSING);
}

- folio = alloc_hugetlb_folio(vma, haddr, 0);
+ folio = alloc_hugetlb_folio(vma, vmf->address, 0);
if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
@@ -6269,18 +6265,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
- if (hugetlb_pte_stable(h, mm, ptep, old_pte))
+ if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte))
ret = vmf_error(PTR_ERR(folio));
else
ret = 0;
goto out;
}
- clear_huge_page(&folio->page, address, pages_per_huge_page(h));
+ clear_huge_page(&folio->page, vmf->real_address,
+ pages_per_huge_page(h));
__folio_mark_uptodate(folio);
new_folio = true;

if (vma->vm_flags & VM_MAYSHARE) {
- int err = hugetlb_add_to_page_cache(folio, mapping, idx);
+ int err = hugetlb_add_to_page_cache(folio, mapping,
+ vmf->pgoff);
if (err) {
/*
* err can't be -EEXIST which implies someone
@@ -6289,7 +6287,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* to the page cache. So it's safe to call
* restore_reserve_on_error() here.
*/
- restore_reserve_on_error(h, vma, haddr, folio);
+ restore_reserve_on_error(h, vma, vmf->address,
+ folio);
folio_put(folio);
goto out;
}
@@ -6319,7 +6318,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
folio_unlock(folio);
folio_put(folio);
/* See comment in userfaultfd_missing() block above */
- if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {
ret = 0;
goto out;
}
@@ -6334,23 +6333,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* any allocations necessary to record that reservation occur outside
* the spinlock.
*/
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
- if (vma_needs_reservation(h, vma, haddr) < 0) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ if (vma_needs_reservation(h, vma, vmf->address) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, haddr);
+ vma_end_reservation(h, vma, vmf->address);
}

- ptl = huge_pte_lock(h, mm, ptep);
+ vmf->ptl = huge_pte_lock(h, mm, vmf->pte);
ret = 0;
/* If pte changed from under us, retry */
- if (!pte_same(huge_ptep_get(ptep), old_pte))
+ if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte))
goto backout;

if (anon_rmap)
- hugetlb_add_new_anon_rmap(folio, vma, haddr);
+ hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
@@ -6359,17 +6358,18 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
*/
- if (unlikely(pte_marker_uffd_wp(old_pte)))
+ if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
new_pte = huge_pte_mkuffd_wp(new_pte);
- set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h));
+ set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));

hugetlb_count_add(pages_per_huge_page(h), mm);
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf);
+ ret = hugetlb_wp(mm, vma, vmf->real_address, vmf->pte,
+ vmf->flags, folio, vmf->ptl, vmf);
}

- spin_unlock(ptl);
+ spin_unlock(vmf->ptl);

/*
* Only set hugetlb_migratable in newly allocated pages. Existing pages
@@ -6386,10 +6386,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
return ret;

backout:
- spin_unlock(ptl);
+ spin_unlock(vmf->ptl);
backout_unlocked:
if (new_folio && !new_pagecache_folio)
- restore_reserve_on_error(h, vma, haddr, folio);
+ restore_reserve_on_error(h, vma, vmf->address, folio);

folio_unlock(folio);
folio_put(folio);
@@ -6485,8 +6485,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* hugetlb_no_page will drop vma lock and hugetlb fault
* mutex internally, which make us return immediately.
*/
- return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
- vmf.pte, vmf.orig_pte, flags, &vmf);
+ return hugetlb_no_page(mm, vma, mapping, &vmf);
}

ret = 0;
--
2.43.0


2024-04-01 20:27:30

by Vishal Moola

Subject: [PATCH v2 1/3] hugetlb: Convert hugetlb_fault() to use struct vm_fault

Now that hugetlb_fault() has a vm_fault available for fault tracking, use
it throughout. This cleans up the code by removing 2 variables, and
prepares hugetlb_fault() to take in a struct vm_fault argument.

Signed-off-by: Vishal Moola (Oracle) <[email protected]>
---
mm/hugetlb.c | 84 +++++++++++++++++++++++++---------------------------
1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8267e221ca5d..360b82374a89 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6423,8 +6423,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
- pte_t *ptep, entry;
- spinlock_t *ptl;
vm_fault_t ret;
u32 hash;
struct folio *folio = NULL;
@@ -6432,13 +6430,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
- unsigned long haddr = address & huge_page_mask(h);
struct vm_fault vmf = {
.vma = vma,
- .address = haddr,
+ .address = address & huge_page_mask(h),
.real_address = address,
.flags = flags,
- .pgoff = vma_hugecache_offset(h, vma, haddr),
+ .pgoff = vma_hugecache_offset(h, vma,
+ address & huge_page_mask(h)),
/* TODO: Track hugetlb faults using vm_fault */

/*
@@ -6458,22 +6456,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,

/*
* Acquire vma lock before calling huge_pte_alloc and hold
- * until finished with ptep. This prevents huge_pmd_unshare from
- * being called elsewhere and making the ptep no longer valid.
+ * until finished with vmf.pte. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the vmf.pte no longer valid.
*/
hugetlb_vma_lock_read(vma);
- ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
- if (!ptep) {
+ vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
+ if (!vmf.pte) {
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return VM_FAULT_OOM;
}

- entry = huge_ptep_get(ptep);
- if (huge_pte_none_mostly(entry)) {
- if (is_pte_marker(entry)) {
+ vmf.orig_pte = huge_ptep_get(vmf.pte);
+ if (huge_pte_none_mostly(vmf.orig_pte)) {
+ if (is_pte_marker(vmf.orig_pte)) {
pte_marker marker =
- pte_marker_get(pte_to_swp_entry(entry));
+ pte_marker_get(pte_to_swp_entry(vmf.orig_pte));

if (marker & PTE_MARKER_POISONED) {
ret = VM_FAULT_HWPOISON_LARGE;
@@ -6488,20 +6486,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* mutex internally, which make us return immediately.
*/
return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
- ptep, entry, flags, &vmf);
+ vmf.pte, vmf.orig_pte, flags, &vmf);
}

ret = 0;

/*
- * entry could be a migration/hwpoison entry at this point, so this
- * check prevents the kernel from going below assuming that we have
- * an active hugepage in pagecache. This goto expects the 2nd page
- * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
- * properly handle it.
+ * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
+ * point, so this check prevents the kernel from going below assuming
+ * that we have an active hugepage in pagecache. This goto expects
+ * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
+ * check will properly handle it.
*/
- if (!pte_present(entry)) {
- if (unlikely(is_hugetlb_entry_migration(entry))) {
+ if (!pte_present(vmf.orig_pte)) {
+ if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
@@ -6510,9 +6508,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* be released there.
*/
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- migration_entry_wait_huge(vma, ptep);
+ migration_entry_wait_huge(vma, vmf.pte);
return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
@@ -6526,13 +6524,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* determine if a reservation has been consumed.
*/
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
- !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
- if (vma_needs_reservation(h, vma, haddr) < 0) {
+ !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
+ if (vma_needs_reservation(h, vma, vmf.address) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, haddr);
+ vma_end_reservation(h, vma, vmf.address);

pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
vmf.pgoff);
@@ -6540,17 +6538,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pagecache_folio = NULL;
}

- ptl = huge_pte_lock(h, mm, ptep);
+ vmf.ptl = huge_pte_lock(h, mm, vmf.pte);

/* Check for a racing update before calling hugetlb_wp() */
- if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))
goto out_ptl;

/* Handle userfault-wp first, before trying to lock more pages */
- if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
- (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) &&
+ (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
if (!userfaultfd_wp_async(vma)) {
- spin_unlock(ptl);
+ spin_unlock(vmf.ptl);
if (pagecache_folio) {
folio_unlock(pagecache_folio);
folio_put(pagecache_folio);
@@ -6560,18 +6558,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return handle_userfault(&vmf, VM_UFFD_WP);
}

- entry = huge_pte_clear_uffd_wp(entry);
- set_huge_pte_at(mm, haddr, ptep, entry,
+ vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
+ set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
huge_page_size(hstate_vma(vma)));
/* Fallthrough to CoW */
}

/*
- * hugetlb_wp() requires page locks of pte_page(entry) and
+ * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
* pagecache_folio, so here we need take the former one
* when folio != pagecache_folio or !pagecache_folio.
*/
- folio = page_folio(pte_page(entry));
+ folio = page_folio(pte_page(vmf.orig_pte));
if (folio != pagecache_folio)
if (!folio_trylock(folio)) {
need_wait_lock = 1;
@@ -6581,24 +6579,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
folio_get(folio);

if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
- if (!huge_pte_write(entry)) {
- ret = hugetlb_wp(mm, vma, address, ptep, flags,
- pagecache_folio, ptl, &vmf);
+ if (!huge_pte_write(vmf.orig_pte)) {
+ ret = hugetlb_wp(mm, vma, address, vmf.pte, flags,
+ pagecache_folio, vmf.ptl, &vmf);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
- entry = huge_pte_mkdirty(entry);
+ vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
}
}
- entry = pte_mkyoung(entry);
- if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+ vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
+ if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
flags & FAULT_FLAG_WRITE))
- update_mmu_cache(vma, haddr, ptep);
+ update_mmu_cache(vma, vmf.address, vmf.pte);
out_put_page:
if (folio != pagecache_folio)
folio_unlock(folio);
folio_put(folio);
out_ptl:
- spin_unlock(ptl);
+ spin_unlock(vmf.ptl);

if (pagecache_folio) {
folio_unlock(pagecache_folio);
--
2.43.0


2024-04-01 20:28:12

by Vishal Moola

Subject: [PATCH v2 3/3] hugetlb: Convert hugetlb_wp() to use struct vm_fault

hugetlb_wp() can use the struct vm_fault passed in from hugetlb_fault().
This alleviates the stack by consolidating 5 variables into a single
struct.

Signed-off-by: Vishal Moola (Oracle) <[email protected]>
---
mm/hugetlb.c | 61 ++++++++++++++++++++++++++--------------------------
1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aca2f11b4138..d4f26947173e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5918,18 +5918,16 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, unsigned int flags,
- struct folio *pagecache_folio, spinlock_t *ptl,
+ struct folio *pagecache_folio,
struct vm_fault *vmf)
{
- const bool unshare = flags & FAULT_FLAG_UNSHARE;
- pte_t pte = huge_ptep_get(ptep);
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+ pte_t pte = huge_ptep_get(vmf->pte);
struct hstate *h = hstate_vma(vma);
struct folio *old_folio;
struct folio *new_folio;
int outside_reserve = 0;
vm_fault_t ret = 0;
- unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;

/*
@@ -5952,7 +5950,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,

/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
- set_huge_ptep_writable(vma, haddr, ptep);
+ set_huge_ptep_writable(vma, vmf->address, vmf->pte);
return 0;
}

@@ -5971,7 +5969,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
SetPageAnonExclusive(&old_folio->page);
}
if (likely(!unshare))
- set_huge_ptep_writable(vma, haddr, ptep);
+ set_huge_ptep_writable(vma, vmf->address, vmf->pte);

delayacct_wpcopy_end();
return 0;
@@ -5998,8 +5996,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
* Drop page table lock as buddy allocator may be called. It will
* be acquired again before returning to the caller, as expected.
*/
- spin_unlock(ptl);
- new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
+ spin_unlock(vmf->ptl);
+ new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);

if (IS_ERR(new_folio)) {
/*
@@ -6024,19 +6022,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Reacquire both after unmap operation.
*/
- idx = vma_hugecache_offset(h, vma, haddr);
+ idx = vma_hugecache_offset(h, vma, vmf->address);
hash = hugetlb_fault_mutex_hash(mapping, idx);
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);

- unmap_ref_private(mm, vma, &old_folio->page, haddr);
+ unmap_ref_private(mm, vma, &old_folio->page,
+ vmf->address);

mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
- spin_lock(ptl);
- ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
- if (likely(ptep &&
- pte_same(huge_ptep_get(ptep), pte)))
+ spin_lock(vmf->ptl);
+ vmf->pte = hugetlb_walk(vma, vmf->address,
+ huge_page_size(h));
+ if (likely(vmf->pte &&
+ pte_same(huge_ptep_get(vmf->pte), pte)))
goto retry_avoidcopy;
/*
* race occurs while re-acquiring page table
@@ -6058,37 +6058,38 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(ret))
goto out_release_all;

- if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
+ if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
ret = VM_FAULT_HWPOISON_LARGE;
goto out_release_all;
}
__folio_mark_uptodate(new_folio);

- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
- haddr + huge_page_size(h));
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
+ vmf->address + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);

/*
* Retake the page table lock to check for racing updates
* before the page tables are altered
*/
- spin_lock(ptl);
- ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
- if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
+ spin_lock(vmf->ptl);
+ vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
+ if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) {
pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);

/* Break COW or unshare */
- huge_ptep_clear_flush(vma, haddr, ptep);
+ huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
hugetlb_remove_rmap(old_folio);
- hugetlb_add_new_anon_rmap(new_folio, vma, haddr);
+ hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
if (huge_pte_uffd_wp(pte))
newpte = huge_pte_mkuffd_wp(newpte);
- set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
+ set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
+ huge_page_size(h));
folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
new_folio = old_folio;
}
- spin_unlock(ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
/*
@@ -6096,12 +6097,12 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
* unshare)
*/
if (new_folio != old_folio)
- restore_reserve_on_error(h, vma, haddr, new_folio);
+ restore_reserve_on_error(h, vma, vmf->address, new_folio);
folio_put(new_folio);
out_release_old:
folio_put(old_folio);

- spin_lock(ptl); /* Caller expects lock to be held */
+ spin_lock(vmf->ptl); /* Caller expects lock to be held */

delayacct_wpcopy_end();
return ret;
@@ -6365,8 +6366,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(mm, vma, vmf->real_address, vmf->pte,
- vmf->flags, folio, vmf->ptl, vmf);
+ ret = hugetlb_wp(mm, vma, folio, vmf);
}

spin_unlock(vmf->ptl);
@@ -6579,8 +6579,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,

if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(vmf.orig_pte)) {
- ret = hugetlb_wp(mm, vma, address, vmf.pte, flags,
- pagecache_folio, vmf.ptl, &vmf);
+ ret = hugetlb_wp(mm, vma, pagecache_folio, &vmf);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
--
2.43.0


2024-04-04 02:07:35

by Andrew Morton

Subject: Re: [PATCH v2 0/3] Hugetlb fault path to use struct vm_fault

On Mon, 1 Apr 2024 13:26:48 -0700 "Vishal Moola (Oracle)" <[email protected]> wrote:

> This patchset converts the hugetlb fault path to use struct vm_fault.
> This helps make the code more readable, and alleviates the stack by
> allowing us to consolidate many fault-related variables into an
> individual pointer.

The .text shrunk a little. x86_64 defconfig:

   text    data     bss     dec     hex filename
  52873    4015   13796   70684   1141c mm/hugetlb.o-before
  52617    4015   13796   70428   1131c mm/hugetlb.o-after


2024-04-04 12:49:27

by Oscar Salvador

Subject: Re: [PATCH v2 2/3] hugetlb: Convert hugetlb_no_page() to use struct vm_fault

On Mon, Apr 01, 2024 at 01:26:50PM -0700, Vishal Moola (Oracle) wrote:
> hugetlb_no_page() can use the struct vm_fault passed in from
> hugetlb_fault(). This alleviates the stack by consolidating 7
> variables into a single struct.
>
> Signed-off-by: Vishal Moola (Oracle) <[email protected]>
> ---
> mm/hugetlb.c | 59 ++++++++++++++++++++++++++--------------------------
> 1 file changed, 29 insertions(+), 30 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 360b82374a89..aca2f11b4138 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6189,9 +6189,7 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
>
> static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> struct vm_area_struct *vma,
> - struct address_space *mapping, pgoff_t idx,
> - unsigned long address, pte_t *ptep,
> - pte_t old_pte, unsigned int flags,
> + struct address_space *mapping,

AFAICS all this can be self-contained in vm_fault struct.
vmf->vma->mm and vmf->vma.
I mean, if we want to convert this interface, why not go all the way?

Looks a bit odd that some fields move into the struct while some
others remain as parameters.

Or am I missing something?

--
Oscar Salvador
SUSE Labs

2024-04-04 12:49:59

by Oscar Salvador

Subject: Re: [PATCH v2 1/3] hugetlb: Convert hugetlb_fault() to use struct vm_fault

On Mon, Apr 01, 2024 at 01:26:49PM -0700, Vishal Moola (Oracle) wrote:
> Now that hugetlb_fault() has a vm_fault available for fault tracking, use
> it throughout. This cleans up the code by removing 2 variables, and
> prepares hugetlb_fault() to take in a struct vm_fault argument.
>
> Signed-off-by: Vishal Moola (Oracle) <[email protected]>

Reviewed-by: Oscar Salvador <[email protected]>

A question below:

> mm/hugetlb.c | 84 +++++++++++++++++++++++++---------------------------
> 1 file changed, 41 insertions(+), 43 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 8267e221ca5d..360b82374a89 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
..
> /*
> - * entry could be a migration/hwpoison entry at this point, so this
> - * check prevents the kernel from going below assuming that we have
> - * an active hugepage in pagecache. This goto expects the 2nd page
> - * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
> - * properly handle it.
> + * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this

"vmf.orig_pte could be a migration/hwpoison entry at ..."

> - entry = pte_mkyoung(entry);
> - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
> + vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
> + if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
> flags & FAULT_FLAG_WRITE))

Would it make sense to teach huge_ptep_set_access_flags/set_huge_pte_at() to use
vm_fault struct as well? All info we are passing is stored there.
Maybe it is not worth the trouble though, just asking.


--
Oscar Salvador
SUSE Labs

2024-04-04 19:32:31

by Vishal Moola

Subject: Re: [PATCH v2 1/3] hugetlb: Convert hugetlb_fault() to use struct vm_fault

On Thu, Apr 4, 2024 at 5:26 AM Oscar Salvador <[email protected]> wrote:
>
> On Mon, Apr 01, 2024 at 01:26:49PM -0700, Vishal Moola (Oracle) wrote:
> > Now that hugetlb_fault() has a vm_fault available for fault tracking, use
> > it throughout. This cleans up the code by removing 2 variables, and
> > prepares hugetlb_fault() to take in a struct vm_fault argument.
> >
> > Signed-off-by: Vishal Moola (Oracle) <[email protected]>
>
> Reviewed-by: Oscar Salvador <[email protected]>
>
> A question below:
>
> > mm/hugetlb.c | 84 +++++++++++++++++++++++++---------------------------
> > 1 file changed, 41 insertions(+), 43 deletions(-)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 8267e221ca5d..360b82374a89 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> ...
> > /*
> > - * entry could be a migration/hwpoison entry at this point, so this
> > - * check prevents the kernel from going below assuming that we have
> > - * an active hugepage in pagecache. This goto expects the 2nd page
> > - * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
> > - * properly handle it.
> > + * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
>
> "vmf.orig_pte could be a migration/hwpoison entry at ..."
>
> > - entry = pte_mkyoung(entry);
> > - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
> > + vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
> > + if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
> > flags & FAULT_FLAG_WRITE))
>
> Would it make sense to teach huge_ptep_set_access_flags/set_huge_pte_at() to use
> vm_fault struct as well? All info we are passing is stored there.
> Maybe it is not worth the trouble though, just asking.

Yeah, it makes sense. There are actually many function calls in the
hugetlb_fault() and __handle_mm_fault() pathways that could make use
of vm_fault to clean up the stack.

It's not particularly complicated either, aside from reorganizing some
variables for every implementation of each function. I'm not really
sure if it's worth dedicated effort and churn though (at least I'm not
focused on that for now).

2024-04-04 19:58:37

by Vishal Moola

Subject: Re: [PATCH v2 2/3] hugetlb: Convert hugetlb_no_page() to use struct vm_fault

On Thu, Apr 4, 2024 at 5:49 AM Oscar Salvador <[email protected]> wrote:
>
> On Mon, Apr 01, 2024 at 01:26:50PM -0700, Vishal Moola (Oracle) wrote:
> > hugetlb_no_page() can use the struct vm_fault passed in from
> > hugetlb_fault(). This alleviates the stack by consolidating 7
> > variables into a single struct.
> >
> > Signed-off-by: Vishal Moola (Oracle) <[email protected]>
> > ---
> > mm/hugetlb.c | 59 ++++++++++++++++++++++++++--------------------------
> > 1 file changed, 29 insertions(+), 30 deletions(-)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 360b82374a89..aca2f11b4138 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -6189,9 +6189,7 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
> >
> > static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> > struct vm_area_struct *vma,
> > - struct address_space *mapping, pgoff_t idx,
> > - unsigned long address, pte_t *ptep,
> > - pte_t old_pte, unsigned int flags,
> > + struct address_space *mapping,
>
> AFAICS all this can be self-contained in vm_fault struct.
> vmf->vma->mm and vmf->vma.
> I mean, if we want to convert this interface, why not go all the way?
>
> Looks a bit odd that some fields move into the struct while some
> others remain as parameters.
>
> Or am I missing something?

Mainly just minimizing code churn, we would either unnecessarily
change multiple lines using vma or have to declare the variables
again anyways (or have extra churn I didn't like).

2024-04-05 03:10:51

by Oscar Salvador

Subject: Re: [PATCH v2 2/3] hugetlb: Convert hugetlb_no_page() to use struct vm_fault

On Mon, Apr 01, 2024 at 01:26:50PM -0700, Vishal Moola (Oracle) wrote:
> hugetlb_no_page() can use the struct vm_fault passed in from
> hugetlb_fault(). This alleviates the stack by consolidating 7
> variables into a single struct.
>
> Signed-off-by: Vishal Moola (Oracle) <[email protected]>


Reviewed-by: Oscar Salvador <[email protected]>


--
Oscar Salvador
SUSE Labs

2024-04-05 03:22:42

by Oscar Salvador

Subject: Re: [PATCH v2 3/3] hugetlb: Convert hugetlb_wp() to use struct vm_fault

On Mon, Apr 01, 2024 at 01:26:51PM -0700, Vishal Moola (Oracle) wrote:
> hugetlb_wp() can use the struct vm_fault passed in from hugetlb_fault().
> This alleviates the stack by consolidating 5 variables into a single
> struct.
>
> Signed-off-by: Vishal Moola (Oracle) <[email protected]>

Reviewed-by: Oscar Salvador <[email protected]>


--
Oscar Salvador
SUSE Labs

2024-04-07 07:18:52

by Muchun Song

Subject: Re: [PATCH v2 1/3] hugetlb: Convert hugetlb_fault() to use struct vm_fault



> On Apr 2, 2024, at 04:26, Vishal Moola (Oracle) <[email protected]> wrote:
>
> Now that hugetlb_fault() has a vm_fault available for fault tracking, use
> it throughout. This cleans up the code by removing 2 variables, and
> prepares hugetlb_fault() to take in a struct vm_fault argument.
>
> Signed-off-by: Vishal Moola (Oracle) <[email protected]>

Reviewed-by: Muchun Song <[email protected]>

Thanks.


2024-04-07 07:37:24

by Muchun Song

Subject: Re: [PATCH v2 1/3] hugetlb: Convert hugetlb_fault() to use struct vm_fault



> On Apr 5, 2024, at 03:32, Vishal Moola <[email protected]> wrote:
>
> On Thu, Apr 4, 2024 at 5:26 AM Oscar Salvador <[email protected]> wrote:
>>
>> On Mon, Apr 01, 2024 at 01:26:49PM -0700, Vishal Moola (Oracle) wrote:
>>> Now that hugetlb_fault() has a vm_fault available for fault tracking, use
>>> it throughout. This cleans up the code by removing 2 variables, and
>>> prepares hugetlb_fault() to take in a struct vm_fault argument.
>>>
>>> Signed-off-by: Vishal Moola (Oracle) <[email protected]>
>>
>> Reviewed-by: Oscar Salvador <[email protected]>
>>
>> A question below:
>>
>>> mm/hugetlb.c | 84 +++++++++++++++++++++++++---------------------------
>>> 1 file changed, 41 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 8267e221ca5d..360b82374a89 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>> ...
>>> /*
>>> - * entry could be a migration/hwpoison entry at this point, so this
>>> - * check prevents the kernel from going below assuming that we have
>>> - * an active hugepage in pagecache. This goto expects the 2nd page
>>> - * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
>>> - * properly handle it.
>>> + * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
>>
>> "vmf.orig_pte could be a migration/hwpoison entry at ..."
>>
>>> - entry = pte_mkyoung(entry);
>>> - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
>>> + vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
>>> + if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
>>> flags & FAULT_FLAG_WRITE))
>>
>> Would it make sense to teach huge_ptep_set_access_flags/set_huge_pte_at() to use
>> vm_fault struct as well? All info we are passing is stored there.
>> Maybe it is not worth the trouble though, just asking.
>
> Yeah, it makes sense. There are actually many function calls in the
> hugetlb_fault() and
> __handle_mm_fault() pathways that could make use of vm_fault to clean
> up the stack.
>
> It's not particularly complicated either, aside from reorganizing some
> variables for every
> implementation of each function. I'm not really sure if it's worth
> dedicated effort
> and churn though (at least I'm not focused on that for now).

Not all the users of set_huge_pte_at() have a vmf structure, so I do not
think it is a good idea to change it. And huge_ptep_set_access_flags() is
a variant of ptep_set_access_flags(), so it's better to keep them
consistent. Otherwise, I think both of them should be adapted if you want
that cleanup. My tendency is to leave them unchanged.

Muchun,
Thanks.




2024-04-07 09:00:05

by Muchun Song

Subject: Re: [PATCH v2 2/3] hugetlb: Convert hugetlb_no_page() to use struct vm_fault



> On Apr 5, 2024, at 03:58, Vishal Moola <[email protected]> wrote:
>
> On Thu, Apr 4, 2024 at 5:49 AM Oscar Salvador <[email protected]> wrote:
>>
>> On Mon, Apr 01, 2024 at 01:26:50PM -0700, Vishal Moola (Oracle) wrote:
>>> hugetlb_no_page() can use the struct vm_fault passed in from
>>> hugetlb_fault(). This alleviates the stack by consolidating 7
>>> variables into a single struct.
>>>
>>> Signed-off-by: Vishal Moola (Oracle) <[email protected]>
>>> ---
>>> mm/hugetlb.c | 59 ++++++++++++++++++++++++++--------------------------
>>> 1 file changed, 29 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 360b82374a89..aca2f11b4138 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -6189,9 +6189,7 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
>>>
>>> static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>>> struct vm_area_struct *vma,
>>> - struct address_space *mapping, pgoff_t idx,
>>> - unsigned long address, pte_t *ptep,
>>> - pte_t old_pte, unsigned int flags,
>>> + struct address_space *mapping,
>>
>> AFAICS all this can be self-contained in vm_fault struct.
>> vmf->vma->mm and vmf->vma.
>> I mean, if we want to convert this interface, why not go all the way?
>>
>> Looks a bit odd that some fields move into the struct while some
>> others remain as parameters.
>>
>> Or am I missing something?
>
> Mainly just minimizing code churn, we would either unnecessarily
> change multiple lines using vma or have to declare the variables
> again anyways (or have extra churn I didn't like).

I don't think adding some variables is a problem. I suppose the compiler
could do some optimization for us. So I think it is better to pass
only one argument vmf to hugetlb_no_page(). Otherwise, LGTM.

Muchun,
Thanks.


2024-04-07 09:12:59

by Muchun Song

Subject: Re: [PATCH v2 3/3] hugetlb: Convert hugetlb_wp() to use struct vm_fault



On 2024/4/2 04:26, Vishal Moola (Oracle) wrote:
> hugetlb_wp() can use the struct vm_fault passed in from hugetlb_fault().
> This alleviates the stack by consolidating 5 variables into a single
> struct.
>
> Signed-off-by: Vishal Moola (Oracle) <[email protected]>
> ---
> mm/hugetlb.c | 61 ++++++++++++++++++++++++++--------------------------
> 1 file changed, 30 insertions(+), 31 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index aca2f11b4138..d4f26947173e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5918,18 +5918,16 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
> * Keep the pte_same checks anyway to make transition from the mutex easier.
> */
> static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> - unsigned long address, pte_t *ptep, unsigned int flags,
> - struct folio *pagecache_folio, spinlock_t *ptl,
> + struct folio *pagecache_folio,

The same as my comment in the previous thread.

Muchun,
Thanks.

> struct vm_fault *vmf)
> {
> - const bool unshare = flags & FAULT_FLAG_UNSHARE;
> - pte_t pte = huge_ptep_get(ptep);
> + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
> + pte_t pte = huge_ptep_get(vmf->pte);
> struct hstate *h = hstate_vma(vma);
> struct folio *old_folio;
> struct folio *new_folio;
> int outside_reserve = 0;
> vm_fault_t ret = 0;
> - unsigned long haddr = address & huge_page_mask(h);
> struct mmu_notifier_range range;
>
> /*
> @@ -5952,7 +5950,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
>
> /* Let's take out MAP_SHARED mappings first. */
> if (vma->vm_flags & VM_MAYSHARE) {
> - set_huge_ptep_writable(vma, haddr, ptep);
> + set_huge_ptep_writable(vma, vmf->address, vmf->pte);
> return 0;
> }
>
> @@ -5971,7 +5969,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> SetPageAnonExclusive(&old_folio->page);
> }
> if (likely(!unshare))
> - set_huge_ptep_writable(vma, haddr, ptep);
> + set_huge_ptep_writable(vma, vmf->address, vmf->pte);
>
> delayacct_wpcopy_end();
> return 0;
> @@ -5998,8 +5996,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> * Drop page table lock as buddy allocator may be called. It will
> * be acquired again before returning to the caller, as expected.
> */
> - spin_unlock(ptl);
> - new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
> + spin_unlock(vmf->ptl);
> + new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);
>
> if (IS_ERR(new_folio)) {
> /*
> @@ -6024,19 +6022,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> *
> * Reacquire both after unmap operation.
> */
> - idx = vma_hugecache_offset(h, vma, haddr);
> + idx = vma_hugecache_offset(h, vma, vmf->address);
> hash = hugetlb_fault_mutex_hash(mapping, idx);
> hugetlb_vma_unlock_read(vma);
> mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>
> - unmap_ref_private(mm, vma, &old_folio->page, haddr);
> + unmap_ref_private(mm, vma, &old_folio->page,
> + vmf->address);
>
> mutex_lock(&hugetlb_fault_mutex_table[hash]);
> hugetlb_vma_lock_read(vma);
> - spin_lock(ptl);
> - ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
> - if (likely(ptep &&
> - pte_same(huge_ptep_get(ptep), pte)))
> + spin_lock(vmf->ptl);
> + vmf->pte = hugetlb_walk(vma, vmf->address,
> + huge_page_size(h));
> + if (likely(vmf->pte &&
> + pte_same(huge_ptep_get(vmf->pte), pte)))
> goto retry_avoidcopy;
> /*
> * race occurs while re-acquiring page table
> @@ -6058,37 +6058,38 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> if (unlikely(ret))
> goto out_release_all;
>
> - if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
> + if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
> ret = VM_FAULT_HWPOISON_LARGE;
> goto out_release_all;
> }
> __folio_mark_uptodate(new_folio);
>
> - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
> - haddr + huge_page_size(h));
> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
> + vmf->address + huge_page_size(h));
> mmu_notifier_invalidate_range_start(&range);
>
> /*
> * Retake the page table lock to check for racing updates
> * before the page tables are altered
> */
> - spin_lock(ptl);
> - ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
> - if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
> + spin_lock(vmf->ptl);
> + vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
> + if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) {
> pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
>
> /* Break COW or unshare */
> - huge_ptep_clear_flush(vma, haddr, ptep);
> + huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
> hugetlb_remove_rmap(old_folio);
> - hugetlb_add_new_anon_rmap(new_folio, vma, haddr);
> + hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
> if (huge_pte_uffd_wp(pte))
> newpte = huge_pte_mkuffd_wp(newpte);
> - set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
> + set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
> + huge_page_size(h));
> folio_set_hugetlb_migratable(new_folio);
> /* Make the old page be freed below */
> new_folio = old_folio;
> }
> - spin_unlock(ptl);
> + spin_unlock(vmf->ptl);
> mmu_notifier_invalidate_range_end(&range);
> out_release_all:
> /*
> @@ -6096,12 +6097,12 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> * unshare)
> */
> if (new_folio != old_folio)
> - restore_reserve_on_error(h, vma, haddr, new_folio);
> + restore_reserve_on_error(h, vma, vmf->address, new_folio);
> folio_put(new_folio);
> out_release_old:
> folio_put(old_folio);
>
> - spin_lock(ptl); /* Caller expects lock to be held */
> + spin_lock(vmf->ptl); /* Caller expects lock to be held */
>
> delayacct_wpcopy_end();
> return ret;
> @@ -6365,8 +6366,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> hugetlb_count_add(pages_per_huge_page(h), mm);
> if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
> /* Optimization, do the COW without a second fault */
> - ret = hugetlb_wp(mm, vma, vmf->real_address, vmf->pte,
> - vmf->flags, folio, vmf->ptl, vmf);
> + ret = hugetlb_wp(mm, vma, folio, vmf);
> }
>
> spin_unlock(vmf->ptl);
> @@ -6579,8 +6579,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>
> if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
> if (!huge_pte_write(vmf.orig_pte)) {
> - ret = hugetlb_wp(mm, vma, address, vmf.pte, flags,
> - pagecache_folio, vmf.ptl, &vmf);
> + ret = hugetlb_wp(mm, vma, pagecache_folio, &vmf);
> goto out_put_page;
> } else if (likely(flags & FAULT_FLAG_WRITE)) {
> vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);


2024-04-08 17:45:41

by Vishal Moola

Subject: Re: [PATCH v2 2/3] hugetlb: Convert hugetlb_no_page() to use struct vm_fault

On Sun, Apr 07, 2024 at 04:59:13PM +0800, Muchun Song wrote:
>
>
> > On Apr 5, 2024, at 03:58, Vishal Moola <[email protected]> wrote:
> >
> > On Thu, Apr 4, 2024 at 5:49 AM Oscar Salvador <[email protected]> wrote:
> >>
> >> On Mon, Apr 01, 2024 at 01:26:50PM -0700, Vishal Moola (Oracle) wrote:
> >>> hugetlb_no_page() can use the struct vm_fault passed in from
> >>> hugetlb_fault(). This alleviates the stack by consolidating 7
> >>> variables into a single struct.
> >>>
> >>> Signed-off-by: Vishal Moola (Oracle) <[email protected]>
> >>> ---
> >>> mm/hugetlb.c | 59 ++++++++++++++++++++++++++--------------------------
> >>> 1 file changed, 29 insertions(+), 30 deletions(-)
> >>>
> >>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> >>> index 360b82374a89..aca2f11b4138 100644
> >>> --- a/mm/hugetlb.c
> >>> +++ b/mm/hugetlb.c
> >>> @@ -6189,9 +6189,7 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
> >>>
> >>> static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> >>> struct vm_area_struct *vma,
> >>> - struct address_space *mapping, pgoff_t idx,
> >>> - unsigned long address, pte_t *ptep,
> >>> - pte_t old_pte, unsigned int flags,
> >>> + struct address_space *mapping,
> >>
> >> AFAICS all this can be self-contained in vm_fault struct.
> >> vmf->vma->mm and vmf->vma.
> >> I mean, if we want to convert this interface, why not go all the way?
> >>
> >> Looks a bit odd that some fields move into the struct while some
> >> others remain as parameters.
> >>
> >> Or am I missing something?
> >
> > Mainly just minimizing code churn, we would either unnecessarily
> > change multiple lines using vma or have to declare the variables
> > again anyways (or have extra churn I didn't like).
>
> I don't think adding some variables is a problem. I suppose the compiler
> could do some optimization for us. So I think it is better to pass
> only one argument vmf to hugetlb_no_page(). Otherwise, LGTM.

Alright we can get rid of the vm_area_struct and mm_struct arguments as
well.

Andrew, could you please fold the attached patch into this one?


Attachments:
0001-hugetlb-Simplify-hugetlb_no_page-arguments.patch (1.52 kB)
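
The attached patch itself is not inlined in the archive. Going by the
discussion above (drop the mm and vma arguments too, deriving them from
vmf), the simplified prologue presumably ends up looking roughly like
the sketch below. This is an illustration of the direction agreed on in
this thread, not the attachment's exact contents:

static vm_fault_t hugetlb_no_page(struct address_space *mapping,
		struct vm_fault *vmf)
{
	/* sketch only: derive mm and vma from vmf, as discussed above */
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct hstate *h = hstate_vma(vma);

	/* ... rest of the body as in patch 2/3, using the locals above ... */
}

with the call site in hugetlb_fault() correspondingly reduced to:

	return hugetlb_no_page(mapping, &vmf);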

2024-04-08 17:47:28

by Vishal Moola

Subject: Re: [PATCH v2 3/3] hugetlb: Convert hugetlb_wp() to use struct vm_fault

On Sun, Apr 07, 2024 at 05:12:42PM +0800, Muchun Song wrote:
>
>
> On 2024/4/2 04:26, Vishal Moola (Oracle) wrote:
> > hugetlb_wp() can use the struct vm_fault passed in from hugetlb_fault().
> > This alleviates the stack by consolidating 5 variables into a single
> > struct.
> >
> > Signed-off-by: Vishal Moola (Oracle) <[email protected]>
> > ---
> > mm/hugetlb.c | 61 ++++++++++++++++++++++++++--------------------------
> > 1 file changed, 30 insertions(+), 31 deletions(-)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index aca2f11b4138..d4f26947173e 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -5918,18 +5918,16 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
> > * Keep the pte_same checks anyway to make transition from the mutex easier.
> > */
> > static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> > - unsigned long address, pte_t *ptep, unsigned int flags,
> > - struct folio *pagecache_folio, spinlock_t *ptl,
> > + struct folio *pagecache_folio,
>
> The same as my comment in the previous thread.

And could you please fold the attached patch in here as well, Andrew?


Attachments:
0002-hugetlb-Simplyfy-hugetlb_wp-arguments.patch (2.13 kB)

2024-04-08 17:55:45

by Matthew Wilcox

Subject: Re: [PATCH v2 3/3] hugetlb: Convert hugetlb_wp() to use struct vm_fault

On Mon, Apr 08, 2024 at 10:47:12AM -0700, Vishal Moola wrote:
> -static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> - struct folio *pagecache_folio,
> +static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
> struct vm_fault *vmf)
> {
> + struct vm_area_struct *vma = vmf->vma;
> + struct mm_struct *mm = vma->vm_mm;

I think 'vmf' should be the first parameter, not the second.
Compare:

static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
struct page *page, unsigned int nr, unsigned long addr)
int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
unsigned long addr, int page_nid, int *flags)
static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
unsigned long fault_addr, pte_t *fault_pte,
bool writable)
static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
struct folio *folio, pte_t fault_pte,
bool ignore_writable, bool pte_write_upgrade)
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)

numa_migrate_prep() is the only one which doesn't have vmf as the first
param. It's a subtle inconsistency, but one you notice after a while
... then wish you'd done it right the first time, but can't quite bring
yourself to submit a patch to fix it ;-)
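
If the parameters were swapped as suggested, the prototype would
presumably read as follows, matching the vmf-first helpers listed above
(a sketch, not a quote from the attached patch):

static vm_fault_t hugetlb_wp(struct vm_fault *vmf,
		struct folio *pagecache_folio)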