When per-VMA locks were introduced in [1], several types of page faults
would still fall back to mmap_lock to keep the patchset simple. Among them
are swap and userfault page faults. The main reason for skipping those
cases was that mmap_lock could be dropped while handling these faults,
which would have required additional logic to be implemented.
Implement the mechanism to allow per-VMA locks to be dropped for these
cases.
First, change handle_mm_fault to drop the per-VMA lock when returning
VM_FAULT_RETRY or VM_FAULT_COMPLETED, to be consistent with the way
mmap_lock is handled. Then change folio_lock_or_retry (and rename it to
folio_lock_fault) to accept vm_fault, which will be used to indicate the
state of the mmap_lock/per-VMA lock upon exit. Finally, allow swap and
uffd page faults to be handled under per-VMA locks by dropping the
per-VMA lock when waiting for a folio, the same way it's done under
mmap_lock. Naturally, once the VMA lock is dropped, that VMA should be
assumed unstable and can't be used.
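For reference, the caller-side contract after this series looks roughly
like the sketch below (simplified from the arch fault handlers touched by
patch 3/8; accounting and error handling omitted):

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;	/* fall back to the mmap_lock path */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	/*
	 * handle_mm_fault() now drops the per-VMA lock itself when it
	 * returns VM_FAULT_RETRY or VM_FAULT_COMPLETED, so only unlock
	 * in the remaining cases. After that point the VMA must be
	 * considered unstable and can't be touched.
	 */
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);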
Changes since v2 posted at [2]
- Moved prerequisite patches to the beginning (first 2 patches)
- Added a new patch 3/8 to make per-VMA lock handling consistent with
  mmap_lock by dropping the lock on VM_FAULT_RETRY or VM_FAULT_COMPLETED.
- Implemented folio_lock_fault in 4/8, per Matthew Wilcox
- Replaced VM_FAULT_VMA_UNLOCKED with FAULT_FLAG_LOCK_DROPPED vmf_flag in
5/8.
- Merged swap page fault handling patch with the one implementing wait for
a folio into 6/8, per Peter Xu
Note: patch 3/8 will cause a trivial merge conflict in arch/arm64/mm/fault.c
when applied over mm-unstable branch due to a patch from ARM64 tree [3]
which is missing in mm-unstable.
[1] https://lore.kernel.org/all/[email protected]/
[2] https://lore.kernel.org/all/[email protected]/
[3] https://lore.kernel.org/all/[email protected]/
Suren Baghdasaryan (8):
swap: remove remnants of polling from read_swap_cache_async
mm: add missing VM_FAULT_RESULT_TRACE name for VM_FAULT_COMPLETED
mm: drop per-VMA lock in handle_mm_fault if retrying or when finished
mm: replace folio_lock_or_retry with folio_lock_fault
mm: make folio_lock_fault indicate the state of mmap_lock upon return
mm: handle swap page faults under per-VMA lock
mm: drop VMA lock before waiting for migration
mm: handle userfaults under VMA lock
arch/arm64/mm/fault.c | 3 +-
arch/powerpc/mm/fault.c | 3 +-
arch/s390/mm/fault.c | 3 +-
arch/x86/mm/fault.c | 3 +-
fs/userfaultfd.c | 42 +++++++++++++------------
include/linux/mm_types.h | 4 ++-
include/linux/pagemap.h | 13 ++++----
mm/filemap.c | 55 +++++++++++++++++++--------------
mm/madvise.c | 4 +--
mm/memory.c | 66 +++++++++++++++++++++++++---------------
mm/swap.h | 1 -
mm/swap_state.c | 12 +++-----
12 files changed, 120 insertions(+), 89 deletions(-)
--
2.41.0.178.g377b9f9a00-goog
folio_lock_fault might drop mmap_lock before returning, and to extend it
to work with per-VMA locks, the callers will need to know whether the
lock was dropped or is still held. Introduce a new fault_flag to indicate
whether the lock got dropped and store it inside vm_fault flags.
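The flag is consumed later in this series; a minimal sketch of the
intended caller-side check looks like:

	/*
	 * Drop the per-VMA lock only if a callee (e.g. __folio_lock_fault)
	 * did not already drop it on our behalf.
	 */
	if ((vmf.flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_LOCK_DROPPED)) ==
							FAULT_FLAG_VMA_LOCK)
		vma_end_read(vma);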
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
include/linux/mm_types.h | 1 +
mm/filemap.c | 2 ++
2 files changed, 3 insertions(+)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 79765e3dd8f3..6f0dbef7aa1f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1169,6 +1169,7 @@ enum fault_flag {
FAULT_FLAG_UNSHARE = 1 << 10,
FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
FAULT_FLAG_VMA_LOCK = 1 << 12,
+ FAULT_FLAG_LOCK_DROPPED = 1 << 13,
};
typedef unsigned int __bitwise zap_flags_t;
diff --git a/mm/filemap.c b/mm/filemap.c
index 87b335a93530..8ad06d69895b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1723,6 +1723,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
return VM_FAULT_RETRY;
mmap_read_unlock(mm);
+ vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
if (vmf->flags & FAULT_FLAG_KILLABLE)
folio_wait_locked_killable(folio);
else
@@ -1735,6 +1736,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
ret = __folio_lock_killable(folio);
if (ret) {
mmap_read_unlock(mm);
+ vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
return VM_FAULT_RETRY;
}
} else {
--
2.41.0.178.g377b9f9a00-goog
Enable handle_userfault to operate under VMA lock by releasing VMA lock
instead of mmap_lock and retrying.
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
fs/userfaultfd.c | 42 ++++++++++++++++++++++--------------------
mm/memory.c | 9 ---------
2 files changed, 22 insertions(+), 29 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 4e800bb7d2ab..b88632c404b6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -277,17 +277,17 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* hugepmd ranges.
*/
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long address,
- unsigned long flags,
- unsigned long reason)
+ struct vm_fault *vmf,
+ unsigned long reason)
{
+ struct vm_area_struct *vma = vmf->vma;
pte_t *ptep, pte;
bool ret = true;
- mmap_assert_locked(ctx->mm);
+ if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
+ mmap_assert_locked(ctx->mm);
- ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
+ ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
if (!ptep)
goto out;
@@ -308,10 +308,8 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long address,
- unsigned long flags,
- unsigned long reason)
+ struct vm_fault *vmf,
+ unsigned long reason)
{
return false; /* should never get here */
}
@@ -325,11 +323,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
* threads.
*/
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
- unsigned long address,
- unsigned long flags,
+ struct vm_fault *vmf,
unsigned long reason)
{
struct mm_struct *mm = ctx->mm;
+ unsigned long address = vmf->address;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -337,7 +335,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte_t *pte;
bool ret = true;
- mmap_assert_locked(mm);
+ if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
+ mmap_assert_locked(mm);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -445,7 +444,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* Coredumping runs without mmap_lock so we can only check that
* the mmap_lock is held, if PF_DUMPCORE was not set.
*/
- mmap_assert_locked(mm);
+ if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
+ mmap_assert_locked(mm);
ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
@@ -561,15 +561,17 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vma))
- must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
- reason);
+ must_wait = userfaultfd_must_wait(ctx, vmf, reason);
else
- must_wait = userfaultfd_huge_must_wait(ctx, vma,
- vmf->address,
- vmf->flags, reason);
+ must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
if (is_vm_hugetlb_page(vma))
hugetlb_vma_unlock_read(vma);
- mmap_read_unlock(mm);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ /* WARNING: VMA can't be used after this */
+ vma_end_read(vma);
+ } else
+ mmap_read_unlock(mm);
+ vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
if (likely(must_wait && !READ_ONCE(ctx->released))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
diff --git a/mm/memory.c b/mm/memory.c
index bdf46fdc58d6..923c1576bd14 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5316,15 +5316,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
if (!vma_start_read(vma))
goto inval;
- /*
- * Due to the possibility of userfault handler dropping mmap_lock, avoid
- * it for now and fall back to page fault handling under mmap_lock.
- */
- if (userfaultfd_armed(vma)) {
- vma_end_read(vma);
- goto inval;
- }
-
/* Check since vm_start/vm_end might change before we lock the VMA */
if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
vma_end_read(vma);
--
2.41.0.178.g377b9f9a00-goog
When a page fault is handled under per-VMA lock protection, all swap page
faults are currently retried with mmap_lock because folio_lock_fault
(formerly known as folio_lock_or_retry) had to drop and reacquire
mmap_lock if the folio could not be immediately locked.
Follow the same pattern as with mmap_lock: drop the per-VMA lock when
waiting for the folio in folio_lock_fault and retry once the folio is
available.
With this obstacle removed, enable do_swap_page to operate under
per-VMA lock protection. Drivers implementing ops->migrate_to_ram might
still rely on mmap_lock, therefore we have to fall back to mmap_lock in
that particular case.
Note that the only time do_swap_page calls synchronous swap_readpage
is when SWP_SYNCHRONOUS_IO is set, which is only set for
QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and
pmem). Therefore we don't sleep in this path, and there's no need to
drop the mmap or per-VMA lock.
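For context, the synchronous swapin branch in do_swap_page() looks
roughly like the following (paraphrased, not the exact upstream code):

	/*
	 * SWP_SYNCHRONOUS_IO devices complete the read without sleeping,
	 * so neither mmap_lock nor the per-VMA lock needs to be dropped
	 * around this call.
	 */
	if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
	    __swap_count(entry) == 1) {
		/* skip the swapcache and read the page synchronously */
		swap_readpage(page, true, NULL);
	}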
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
mm/filemap.c | 24 ++++++++++++++++--------
mm/memory.c | 21 ++++++++++++++-------
2 files changed, 30 insertions(+), 15 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 8ad06d69895b..683f11f244cd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1703,12 +1703,14 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
* Return values:
* 0 - folio is locked.
* VM_FAULT_RETRY - folio is not locked.
- * mmap_lock has been released (mmap_read_unlock(), unless flags had both
- * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
- * which case mmap_lock is still held.
+ * FAULT_FLAG_LOCK_DROPPED bit in vmf flags will be set if mmap_lock or
+ * per-VMA lock got dropped. mmap_lock/per-VMA lock is dropped when
+ * function fails to lock the folio, unless flags had both
+ * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in which case
+ * the lock is still held.
*
* If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
- * with the folio locked and the mmap_lock unperturbed.
+ * with the folio locked and the mmap_lock/per-VMA lock unperturbed.
*/
vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
{
@@ -1716,13 +1718,16 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
if (fault_flag_allow_retry_first(vmf->flags)) {
/*
- * CAUTION! In this case, mmap_lock is not released
- * even though return VM_FAULT_RETRY.
+ * CAUTION! In this case, mmap_lock/per-VMA lock is not
+ * released even though returning VM_FAULT_RETRY.
*/
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
return VM_FAULT_RETRY;
- mmap_read_unlock(mm);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+ vma_end_read(vmf->vma);
+ else
+ mmap_read_unlock(mm);
vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
if (vmf->flags & FAULT_FLAG_KILLABLE)
folio_wait_locked_killable(folio);
@@ -1735,7 +1740,10 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
ret = __folio_lock_killable(folio);
if (ret) {
- mmap_read_unlock(mm);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+ vma_end_read(vmf->vma);
+ else
+ mmap_read_unlock(mm);
vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
return VM_FAULT_RETRY;
}
diff --git a/mm/memory.c b/mm/memory.c
index 3c2acafcd7b6..5caaa4c66ea2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3712,11 +3712,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!pte_unmap_same(vmf))
goto out;
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- ret = VM_FAULT_RETRY;
- goto out;
- }
-
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
@@ -3726,6 +3721,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ /*
+ * migrate_to_ram is not yet ready to operate
+ * under VMA lock.
+ */
+ ret |= VM_FAULT_RETRY;
+ goto out;
+ }
+
vmf->page = pfn_swap_entry_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
@@ -5089,9 +5093,12 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
/*
* In case of VM_FAULT_RETRY or VM_FAULT_COMPLETED we might
* be still holding per-VMA lock to keep the vma stable as long
- * as possible. Drop it before returning.
+ * as possible. In this situation vmf.flags has
+ * FAULT_FLAG_VMA_LOCK set and FAULT_FLAG_LOCK_DROPPED unset.
+ * Drop the lock before returning when this happens.
*/
- if (vmf.flags & FAULT_FLAG_VMA_LOCK)
+ if ((vmf.flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_LOCK_DROPPED)) ==
+ FAULT_FLAG_VMA_LOCK)
vma_end_read(vma);
}
return ret;
--
2.41.0.178.g377b9f9a00-goog
Commit [1] introduced IO polling support during swapin to reduce
swap read latency for block devices that can be polled. However, the later
commit [2] removed polling support. Therefore it seems safe to remove the
do_poll parameter in read_swap_cache_async and always call swap_readpage
with synchronous=false, waiting for IO completion in folio_lock_or_retry.
[1] commit 23955622ff8d ("swap: add block io poll in swapin path")
[2] commit 9650b453a3d4 ("block: ignore RWF_HIPRI hint for sync dio")
Suggested-by: "Huang, Ying" <[email protected]>
Signed-off-by: Suren Baghdasaryan <[email protected]>
Reviewed-by: "Huang, Ying" <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
mm/madvise.c | 4 ++--
mm/swap.h | 1 -
mm/swap_state.c | 12 +++++-------
3 files changed, 7 insertions(+), 10 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index b5ffbaf616f5..b1e8adf1234e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -215,7 +215,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index, false, &splug);
+ vma, index, &splug);
if (page)
put_page(page);
}
@@ -252,7 +252,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
rcu_read_unlock();
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0, false, &splug);
+ NULL, 0, &splug);
if (page)
put_page(page);
diff --git a/mm/swap.h b/mm/swap.h
index 7c033d793f15..8a3c7a0ace4f 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -46,7 +46,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
unsigned long addr,
- bool do_poll,
struct swap_iocb **plug);
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b76a65ac28b3..a3839de71f3f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -517,15 +517,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
- unsigned long addr, bool do_poll,
- struct swap_iocb **plug)
+ unsigned long addr, struct swap_iocb **plug)
{
bool page_was_allocated;
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
vma, addr, &page_was_allocated);
if (page_was_allocated)
- swap_readpage(retpage, do_poll, plug);
+ swap_readpage(retpage, false, plug);
return retpage;
}
@@ -620,7 +619,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
struct swap_iocb *splug = NULL;
- bool do_poll = true, page_allocated;
+ bool page_allocated;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
@@ -628,7 +627,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!mask)
goto skip;
- do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -660,7 +658,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
/* The page was likely read above, so no need for plugging here */
- return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -825,7 +823,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
skip:
/* The page was likely read above, so no need for plugging here */
return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- ra_info.win == 1, NULL);
+ NULL);
}
/**
--
2.41.0.178.g377b9f9a00-goog
handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means
mmap_lock has been released. However, with per-VMA locks the behavior is
different and the caller should still release the lock. To make the rules
consistent for the caller, drop the per-VMA lock before returning from
handle_mm_fault when the page fault should be retried or is completed.
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
arch/arm64/mm/fault.c | 3 ++-
arch/powerpc/mm/fault.c | 3 ++-
arch/s390/mm/fault.c | 3 ++-
arch/x86/mm/fault.c | 3 ++-
mm/memory.c | 12 +++++++++++-
5 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 6045a5117ac1..89f84e9ea1ff 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -601,7 +601,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
goto lock_mmap;
}
fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 531177a4ee08..4697c5dca31c 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -494,7 +494,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index b65144c392b0..cccefe41038b 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -418,7 +418,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
goto lock_mmap;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
goto out;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e4399983c50c..d69c85c1c04e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1347,7 +1347,8 @@ void do_user_addr_fault(struct pt_regs *regs,
goto lock_mmap;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
diff --git a/mm/memory.c b/mm/memory.c
index f69fbc251198..9011ad63c41b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5086,7 +5086,17 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
}
}
- return handle_pte_fault(&vmf);
+ ret = handle_pte_fault(&vmf);
+ if (ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) {
+ /*
+ * In case of VM_FAULT_RETRY or VM_FAULT_COMPLETED we might
+ * be still holding per-VMA lock to keep the vma stable as long
+ * as possible. Drop it before returning.
+ */
+ if (vmf.flags & FAULT_FLAG_VMA_LOCK)
+ vma_end_read(vma);
+ }
+ return ret;
}
/**
--
2.41.0.178.g377b9f9a00-goog
migration_entry_wait does not need the VMA lock, therefore the lock can
be dropped before waiting.
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
mm/memory.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 5caaa4c66ea2..bdf46fdc58d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3715,8 +3715,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(vma->vm_mm, vmf->pmd,
- vmf->address);
+ /* Save mm in case VMA lock is dropped */
+ struct mm_struct *mm = vma->vm_mm;
+
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ /*
+ * No need to hold VMA lock for migration.
+ * WARNING: vma can't be used after this!
+ */
+ vma_end_read(vma);
+ ret |= VM_FAULT_COMPLETED;
+ }
+ migration_entry_wait(mm, vmf->pmd, vmf->address);
} else if (is_device_exclusive_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
--
2.41.0.178.g377b9f9a00-goog
VM_FAULT_RESULT_TRACE should contain an element for every vm_fault_reason
to be used as the flag_array inside trace_print_flags_seq(). The element
for VM_FAULT_COMPLETED is missing; add it.
Signed-off-by: Suren Baghdasaryan <[email protected]>
Reviewed-by: Peter Xu <[email protected]>
---
include/linux/mm_types.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 306a3d1a0fa6..79765e3dd8f3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1070,7 +1070,8 @@ enum vm_fault_reason {
{ VM_FAULT_RETRY, "RETRY" }, \
{ VM_FAULT_FALLBACK, "FALLBACK" }, \
{ VM_FAULT_DONE_COW, "DONE_COW" }, \
- { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
+ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
+ { VM_FAULT_COMPLETED, "COMPLETED" }
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
--
2.41.0.178.g377b9f9a00-goog
Change folio_lock_or_retry to accept a vm_fault struct and return a
vm_fault_t directly. This will be used later to return additional
information about the state of the mmap_lock upon return from this
function.
Suggested-by: Matthew Wilcox <[email protected]>
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
include/linux/pagemap.h | 13 ++++++-------
mm/filemap.c | 29 +++++++++++++++--------------
mm/memory.c | 14 ++++++--------
3 files changed, 27 insertions(+), 29 deletions(-)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a4..0bc206c6f62c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -896,8 +896,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
void __folio_lock(struct folio *folio);
int __folio_lock_killable(struct folio *folio);
-bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
- unsigned int flags);
+vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf);
void unlock_page(struct page *page);
void folio_unlock(struct folio *folio);
@@ -995,17 +994,17 @@ static inline int folio_lock_killable(struct folio *folio)
}
/*
- * folio_lock_or_retry - Lock the folio, unless this would block and the
+ * folio_lock_fault - Lock the folio, unless this would block and the
* caller indicated that it can handle a retry.
*
* Return value and mmap_lock implications depend on flags; see
- * __folio_lock_or_retry().
+ * __folio_lock_fault().
*/
-static inline bool folio_lock_or_retry(struct folio *folio,
- struct mm_struct *mm, unsigned int flags)
+static inline vm_fault_t folio_lock_fault(struct folio *folio,
+ struct vm_fault *vmf)
{
might_sleep();
- return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags);
+ return folio_trylock(folio) ? 0 : __folio_lock_fault(folio, vmf);
}
/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 00f01d8ead47..87b335a93530 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1701,46 +1701,47 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
/*
* Return values:
- * true - folio is locked; mmap_lock is still held.
- * false - folio is not locked.
+ * 0 - folio is locked.
+ * VM_FAULT_RETRY - folio is not locked.
* mmap_lock has been released (mmap_read_unlock(), unless flags had both
* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
* which case mmap_lock is still held.
*
- * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
* with the folio locked and the mmap_lock unperturbed.
*/
-bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
- unsigned int flags)
+vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
{
- if (fault_flag_allow_retry_first(flags)) {
+ struct mm_struct *mm = vmf->vma->vm_mm;
+
+ if (fault_flag_allow_retry_first(vmf->flags)) {
/*
* CAUTION! In this case, mmap_lock is not released
- * even though return 0.
+ * even though return VM_FAULT_RETRY.
*/
- if (flags & FAULT_FLAG_RETRY_NOWAIT)
- return false;
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ return VM_FAULT_RETRY;
mmap_read_unlock(mm);
- if (flags & FAULT_FLAG_KILLABLE)
+ if (vmf->flags & FAULT_FLAG_KILLABLE)
folio_wait_locked_killable(folio);
else
folio_wait_locked(folio);
- return false;
+ return VM_FAULT_RETRY;
}
- if (flags & FAULT_FLAG_KILLABLE) {
+ if (vmf->flags & FAULT_FLAG_KILLABLE) {
bool ret;
ret = __folio_lock_killable(folio);
if (ret) {
mmap_read_unlock(mm);
- return false;
+ return VM_FAULT_RETRY;
}
} else {
__folio_lock(folio);
}
- return true;
+ return 0;
}
/**
diff --git a/mm/memory.c b/mm/memory.c
index 9011ad63c41b..3c2acafcd7b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3568,6 +3568,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
struct folio *folio = page_folio(vmf->page);
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
+ vm_fault_t ret;
/*
* We need a reference to lock the folio because we don't hold
@@ -3580,9 +3581,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
if (!folio_try_get(folio))
return 0;
- if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+ ret = folio_lock_fault(folio, vmf);
+ if (ret) {
folio_put(folio);
- return VM_FAULT_RETRY;
+ return ret;
}
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
@@ -3704,7 +3706,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
- int locked;
vm_fault_t ret = 0;
void *shadow = NULL;
@@ -3825,12 +3826,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
-
- if (!locked) {
- ret |= VM_FAULT_RETRY;
+ ret |= folio_lock_fault(folio, vmf);
+ if (ret & VM_FAULT_RETRY)
goto out_release;
- }
if (swapcache) {
/*
--
2.41.0.178.g377b9f9a00-goog
Suren Baghdasaryan <[email protected]> writes:
> folio_lock_fault might drop mmap_lock before returning and to extend it
> to work with per-VMA locks, the callers will need to know whether the
> lock was dropped or is still held. Introduce new fault_flag to indicate
> whether the lock got dropped and store it inside vm_fault flags.
>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> ---
> include/linux/mm_types.h | 1 +
> mm/filemap.c | 2 ++
> 2 files changed, 3 insertions(+)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 79765e3dd8f3..6f0dbef7aa1f 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1169,6 +1169,7 @@ enum fault_flag {
> FAULT_FLAG_UNSHARE = 1 << 10,
> FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
> FAULT_FLAG_VMA_LOCK = 1 << 12,
> + FAULT_FLAG_LOCK_DROPPED = 1 << 13,
Minor nit but this should also be added to the enum documentation
comment above this.
> };
>
> typedef unsigned int __bitwise zap_flags_t;
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 87b335a93530..8ad06d69895b 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1723,6 +1723,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> return VM_FAULT_RETRY;
>
> mmap_read_unlock(mm);
> + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> if (vmf->flags & FAULT_FLAG_KILLABLE)
> folio_wait_locked_killable(folio);
> else
> @@ -1735,6 +1736,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> ret = __folio_lock_killable(folio);
> if (ret) {
> mmap_read_unlock(mm);
> + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> return VM_FAULT_RETRY;
> }
> } else {
Suren Baghdasaryan <[email protected]> writes:
> migration_entry_wait does not need VMA lock, therefore it can be
> dropped before waiting.
>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> ---
> mm/memory.c | 14 ++++++++++++--
> 1 file changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 5caaa4c66ea2..bdf46fdc58d6 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3715,8 +3715,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> entry = pte_to_swp_entry(vmf->orig_pte);
> if (unlikely(non_swap_entry(entry))) {
> if (is_migration_entry(entry)) {
> - migration_entry_wait(vma->vm_mm, vmf->pmd,
> - vmf->address);
> + /* Save mm in case VMA lock is dropped */
> + struct mm_struct *mm = vma->vm_mm;
> +
> + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> + /*
> + * No need to hold VMA lock for migration.
> + * WARNING: vma can't be used after this!
> + */
> + vma_end_read(vma);
> + ret |= VM_FAULT_COMPLETED;
Doesn't this need to also set FAULT_FLAG_LOCK_DROPPED to ensure we don't
call vma_end_read() again in __handle_mm_fault()?
> + }
> + migration_entry_wait(mm, vmf->pmd, vmf->address);
> } else if (is_device_exclusive_entry(entry)) {
> vmf->page = pfn_swap_entry_to_page(entry);
> ret = remove_device_exclusive_entry(vmf);
On Mon, Jun 26, 2023 at 09:23:16PM -0700, Suren Baghdasaryan wrote:
> handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means
> mmap_lock has been released. However with per-VMA locks behavior is
> different and the caller should still release it. To make the
> rules consistent for the caller, drop the per-VMA lock before returning
> from handle_mm_fault when page fault should be retried or is completed.
>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> ---
> arch/arm64/mm/fault.c | 3 ++-
> arch/powerpc/mm/fault.c | 3 ++-
> arch/s390/mm/fault.c | 3 ++-
> arch/x86/mm/fault.c | 3 ++-
> mm/memory.c | 12 +++++++++++-
> 5 files changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index 6045a5117ac1..89f84e9ea1ff 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -601,7 +601,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
> goto lock_mmap;
> }
> fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
> - vma_end_read(vma);
> + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> + vma_end_read(vma);
>
> if (!(fault & VM_FAULT_RETRY)) {
> count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 531177a4ee08..4697c5dca31c 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -494,7 +494,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
> }
>
> fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
> - vma_end_read(vma);
> + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> + vma_end_read(vma);
>
> if (!(fault & VM_FAULT_RETRY)) {
> count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
> index b65144c392b0..cccefe41038b 100644
> --- a/arch/s390/mm/fault.c
> +++ b/arch/s390/mm/fault.c
> @@ -418,7 +418,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
> goto lock_mmap;
> }
> fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
> - vma_end_read(vma);
> + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> + vma_end_read(vma);
> if (!(fault & VM_FAULT_RETRY)) {
> count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> goto out;
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index e4399983c50c..d69c85c1c04e 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1347,7 +1347,8 @@ void do_user_addr_fault(struct pt_regs *regs,
> goto lock_mmap;
> }
> fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
> - vma_end_read(vma);
> + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> + vma_end_read(vma);
>
> if (!(fault & VM_FAULT_RETRY)) {
> count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> diff --git a/mm/memory.c b/mm/memory.c
> index f69fbc251198..9011ad63c41b 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5086,7 +5086,17 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> }
> }
>
> - return handle_pte_fault(&vmf);
> + ret = handle_pte_fault(&vmf);
> + if (ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) {
> + /*
> + * In case of VM_FAULT_RETRY or VM_FAULT_COMPLETED we might
> + * be still holding per-VMA lock to keep the vma stable as long
> + * as possible. Drop it before returning.
> + */
> + if (vmf.flags & FAULT_FLAG_VMA_LOCK)
> + vma_end_read(vma);
> + }
This smells hackish.. I'd think better we just release the lock at the
place where we'll return RETRY, and AFAIU swap is the only place vma lock
returns a RETRY with current code base?
do_swap_page():
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
ret = VM_FAULT_RETRY;
goto out;
}
I.e., I don't think VM_FAULT_COMPLETED can even be returned with vma lock
paths yet as it doesn't yet support VM_SHARED.
--
Peter Xu
On Mon, Jun 26, 2023 at 09:23:18PM -0700, Suren Baghdasaryan wrote:
> folio_lock_fault might drop mmap_lock before returning and to extend it
> to work with per-VMA locks, the callers will need to know whether the
> lock was dropped or is still held. Introduce new fault_flag to indicate
> whether the lock got dropped and store it inside vm_fault flags.
>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> ---
> include/linux/mm_types.h | 1 +
> mm/filemap.c | 2 ++
> 2 files changed, 3 insertions(+)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 79765e3dd8f3..6f0dbef7aa1f 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1169,6 +1169,7 @@ enum fault_flag {
> FAULT_FLAG_UNSHARE = 1 << 10,
> FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
> FAULT_FLAG_VMA_LOCK = 1 << 12,
> + FAULT_FLAG_LOCK_DROPPED = 1 << 13,
> };
>
> typedef unsigned int __bitwise zap_flags_t;
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 87b335a93530..8ad06d69895b 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1723,6 +1723,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> return VM_FAULT_RETRY;
>
> mmap_read_unlock(mm);
> + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> if (vmf->flags & FAULT_FLAG_KILLABLE)
> folio_wait_locked_killable(folio);
> else
> @@ -1735,6 +1736,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> ret = __folio_lock_killable(folio);
> if (ret) {
> mmap_read_unlock(mm);
> + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> return VM_FAULT_RETRY;
> }
> } else {
IIRC we've discussed about this bits in previous version, and the consensus
was that we don't need yet another flag? Just to recap: I think relying on
RETRY|COMPLETE would be enough for vma lock, as NOWAIT is only used by gup
while not affecting vma lockings, no?
As mentioned in the other reply, even COMPLETE won't appear for vma lock
path yet afaict, so mostly only RETRY matters here and it can 100% imply a
lock release happened. It's just that it's very easy to still cover
COMPLETE altogether in this case, being prepared for any possible shared
support on vma locks, IMHO.
Thanks,
--
Peter Xu
On Tue, Jun 27, 2023 at 1:06 AM Alistair Popple <[email protected]> wrote:
>
>
> Suren Baghdasaryan <[email protected]> writes:
>
> > migration_entry_wait does not need VMA lock, therefore it can be
> > dropped before waiting.
> >
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > ---
> > mm/memory.c | 14 ++++++++++++--
> > 1 file changed, 12 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 5caaa4c66ea2..bdf46fdc58d6 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3715,8 +3715,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > entry = pte_to_swp_entry(vmf->orig_pte);
> > if (unlikely(non_swap_entry(entry))) {
> > if (is_migration_entry(entry)) {
> > - migration_entry_wait(vma->vm_mm, vmf->pmd,
> > - vmf->address);
> > + /* Save mm in case VMA lock is dropped */
> > + struct mm_struct *mm = vma->vm_mm;
> > +
> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > + /*
> > + * No need to hold VMA lock for migration.
> > + * WARNING: vma can't be used after this!
> > + */
> > + vma_end_read(vma);
> > + ret |= VM_FAULT_COMPLETED;
>
> Doesn't this need to also set FAULT_FLAG_LOCK_DROPPED to ensure we don't
> call vma_end_read() again in __handle_mm_fault()?
Uh, right. Got lost during the last refactoring. Thanks for flagging!
>
> > + }
> > + migration_entry_wait(mm, vmf->pmd, vmf->address);
> > } else if (is_device_exclusive_entry(entry)) {
> > vmf->page = pfn_swap_entry_to_page(entry);
> > ret = remove_device_exclusive_entry(vmf);
>
On Mon, Jun 26, 2023 at 09:23:20PM -0700, Suren Baghdasaryan wrote:
> migration_entry_wait does not need VMA lock, therefore it can be
> dropped before waiting.
Hmm, I'm not sure..
Note that we're still dereferencing *vmf->pmd when waiting, while *pmd is
on the page table and IIUC only be guaranteed if the vma is still there.
If without both mmap / vma lock I don't see what makes sure the pgtable is
always there. E.g. IIUC a race can happen where unmap() runs right after
vma_end_read() below but before pmdp_get_lockless() (inside
migration_entry_wait()), then pmdp_get_lockless() can read some random
things if the pgtable is freed.
>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> ---
> mm/memory.c | 14 ++++++++++++--
> 1 file changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 5caaa4c66ea2..bdf46fdc58d6 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3715,8 +3715,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> entry = pte_to_swp_entry(vmf->orig_pte);
> if (unlikely(non_swap_entry(entry))) {
> if (is_migration_entry(entry)) {
> - migration_entry_wait(vma->vm_mm, vmf->pmd,
> - vmf->address);
> + /* Save mm in case VMA lock is dropped */
> + struct mm_struct *mm = vma->vm_mm;
> +
> + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> + /*
> + * No need to hold VMA lock for migration.
> + * WARNING: vma can't be used after this!
> + */
> + vma_end_read(vma);
> + ret |= VM_FAULT_COMPLETED;
> + }
> + migration_entry_wait(mm, vmf->pmd, vmf->address);
> } else if (is_device_exclusive_entry(entry)) {
> vmf->page = pfn_swap_entry_to_page(entry);
> ret = remove_device_exclusive_entry(vmf);
> --
> 2.41.0.178.g377b9f9a00-goog
>
--
Peter Xu
On Mon, Jun 26, 2023 at 09:23:19PM -0700, Suren Baghdasaryan wrote:
> When page fault is handled under per-VMA lock protection, all swap page
> faults are retried with mmap_lock because folio_lock_fault (formerly
> known as folio_lock_or_retry) had to drop and reacquire mmap_lock
> if folio could not be immediately locked.
> Follow the same pattern as mmap_lock to drop per-VMA lock when waiting
> for folio in folio_lock_fault and retrying once folio is available.
> With this obstacle removed, enable do_swap_page to operate under
> per-VMA lock protection. Drivers implementing ops->migrate_to_ram might
> still rely on mmap_lock, therefore we have to fall back to mmap_lock in
> that particular case.
> Note that the only time do_swap_page calls synchronous swap_readpage
> is when SWP_SYNCHRONOUS_IO is set, which is only set for
> QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and
> pmem). Therefore we don't sleep in this path, and there's no need to
> drop the mmap or per-VMA lock.
>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> ---
> mm/filemap.c | 24 ++++++++++++++++--------
> mm/memory.c | 21 ++++++++++++++-------
> 2 files changed, 30 insertions(+), 15 deletions(-)
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 8ad06d69895b..683f11f244cd 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1703,12 +1703,14 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
> * Return values:
> * 0 - folio is locked.
> * VM_FAULT_RETRY - folio is not locked.
> - * mmap_lock has been released (mmap_read_unlock(), unless flags had both
> - * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
> - * which case mmap_lock is still held.
> + * FAULT_FLAG_LOCK_DROPPED bit in vmf flags will be set if mmap_lock or
This "FAULT_FLAG_LOCK_DROPPED" should belong to that patch when introduced.
But again I still think this flag as a whole with that patch is not needed
and should be dropped, unless I miss something important..
> + * per-VMA lock got dropped. mmap_lock/per-VMA lock is dropped when
> + * function fails to lock the folio, unless flags had both
> + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in which case
> + * the lock is still held.
> *
> * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
> - * with the folio locked and the mmap_lock unperturbed.
> + * with the folio locked and the mmap_lock/per-VMA lock unperturbed.
> */
> vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> {
> @@ -1716,13 +1718,16 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
>
> if (fault_flag_allow_retry_first(vmf->flags)) {
> /*
> - * CAUTION! In this case, mmap_lock is not released
> - * even though return VM_FAULT_RETRY.
> + * CAUTION! In this case, mmap_lock/per-VMA lock is not
> + * released even though returning VM_FAULT_RETRY.
> */
> if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
> return VM_FAULT_RETRY;
>
> - mmap_read_unlock(mm);
> + if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> + vma_end_read(vmf->vma);
> + else
> + mmap_read_unlock(mm);
> vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> if (vmf->flags & FAULT_FLAG_KILLABLE)
> folio_wait_locked_killable(folio);
> @@ -1735,7 +1740,10 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
>
> ret = __folio_lock_killable(folio);
> if (ret) {
> - mmap_read_unlock(mm);
> + if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> + vma_end_read(vmf->vma);
> + else
> + mmap_read_unlock(mm);
> vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> return VM_FAULT_RETRY;
> }
> diff --git a/mm/memory.c b/mm/memory.c
> index 3c2acafcd7b6..5caaa4c66ea2 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3712,11 +3712,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> if (!pte_unmap_same(vmf))
> goto out;
>
> - if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
So if with my imagination, here we'll already have the vma_read_end() and
this patch will remove it, which makes sense. Then...
> - ret = VM_FAULT_RETRY;
> - goto out;
> - }
> -
> entry = pte_to_swp_entry(vmf->orig_pte);
> if (unlikely(non_swap_entry(entry))) {
> if (is_migration_entry(entry)) {
> @@ -3726,6 +3721,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> vmf->page = pfn_swap_entry_to_page(entry);
> ret = remove_device_exclusive_entry(vmf);
> } else if (is_device_private_entry(entry)) {
> + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> + /*
> + * migrate_to_ram is not yet ready to operate
> + * under VMA lock.
> + */
... here we probably just do vma_read_end(), then...
> + ret |= VM_FAULT_RETRY;
> + goto out;
> + }
> +
> vmf->page = pfn_swap_entry_to_page(entry);
> vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> vmf->address, &vmf->ptl);
> @@ -5089,9 +5093,12 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> /*
> * In case of VM_FAULT_RETRY or VM_FAULT_COMPLETED we might
> * be still holding per-VMA lock to keep the vma stable as long
> - * as possible. Drop it before returning.
> + * as possible. In this situation vmf.flags has
> + * FAULT_FLAG_VMA_LOCK set and FAULT_FLAG_LOCK_DROPPED unset.
> + * Drop the lock before returning when this happens.
> */
> - if (vmf.flags & FAULT_FLAG_VMA_LOCK)
> + if ((vmf.flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_LOCK_DROPPED)) ==
> + FAULT_FLAG_VMA_LOCK)
> vma_end_read(vma);
This whole chunk should have been dropped altogether with my comment in
previous patch, iiuc, and it should be no-op anyway for swap case. For the
real "waiting for page lock during swapin" phase we should always 100%
release the vma lock in folio_lock_or_retry() - just like mmap lock.
Thanks,
> }
> return ret;
> --
> 2.41.0.178.g377b9f9a00-goog
>
--
Peter Xu
On Tue, Jun 27, 2023 at 1:09 AM Alistair Popple <[email protected]> wrote:
>
>
> Suren Baghdasaryan <[email protected]> writes:
>
> > folio_lock_fault might drop mmap_lock before returning and to extend it
> > to work with per-VMA locks, the callers will need to know whether the
> > lock was dropped or is still held. Introduce new fault_flag to indicate
> > whether the lock got dropped and store it inside vm_fault flags.
> >
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > ---
> > include/linux/mm_types.h | 1 +
> > mm/filemap.c | 2 ++
> > 2 files changed, 3 insertions(+)
> >
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 79765e3dd8f3..6f0dbef7aa1f 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -1169,6 +1169,7 @@ enum fault_flag {
> > FAULT_FLAG_UNSHARE = 1 << 10,
> > FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
> > FAULT_FLAG_VMA_LOCK = 1 << 12,
> > + FAULT_FLAG_LOCK_DROPPED = 1 << 13,
>
> Minor nit but this should also be added to the enum documentation
> comment above this.
Thanks! Sounds like we will be dropping the new flag, so hopefully I
won't need to document it :)
>
> > };
> >
> > typedef unsigned int __bitwise zap_flags_t;
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 87b335a93530..8ad06d69895b 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -1723,6 +1723,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> > return VM_FAULT_RETRY;
> >
> > mmap_read_unlock(mm);
> > + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> > if (vmf->flags & FAULT_FLAG_KILLABLE)
> > folio_wait_locked_killable(folio);
> > else
> > @@ -1735,6 +1736,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> > ret = __folio_lock_killable(folio);
> > if (ret) {
> > mmap_read_unlock(mm);
> > + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> > return VM_FAULT_RETRY;
> > }
> > } else {
>
On Mon, Jun 26, 2023 at 09:23:17PM -0700, Suren Baghdasaryan wrote:
> Change folio_lock_or_retry to accept vm_fault struct and return the
> vm_fault_t directly. This will be used later to return additional
> information about the state of the mmap_lock upon return from this
> function.
>
> Suggested-by: Matthew Wilcox <[email protected]>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
The patch looks all fine to me except on the renaming..
*_fault() makes me think of a fault handler, while *_lock_or_retry() was
there for years and it still sounds better than the new one to me.
Can we still come up with a better renaming, or just keep the name?
Thanks,
--
Peter Xu
On Tue, Jun 27, 2023 at 8:32 AM Peter Xu <[email protected]> wrote:
>
> On Mon, Jun 26, 2023 at 09:23:18PM -0700, Suren Baghdasaryan wrote:
> > folio_lock_fault might drop mmap_lock before returning and to extend it
> > to work with per-VMA locks, the callers will need to know whether the
> > lock was dropped or is still held. Introduce new fault_flag to indicate
> > whether the lock got dropped and store it inside vm_fault flags.
> >
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > ---
> > include/linux/mm_types.h | 1 +
> > mm/filemap.c | 2 ++
> > 2 files changed, 3 insertions(+)
> >
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 79765e3dd8f3..6f0dbef7aa1f 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -1169,6 +1169,7 @@ enum fault_flag {
> > FAULT_FLAG_UNSHARE = 1 << 10,
> > FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
> > FAULT_FLAG_VMA_LOCK = 1 << 12,
> > + FAULT_FLAG_LOCK_DROPPED = 1 << 13,
> > };
> >
> > typedef unsigned int __bitwise zap_flags_t;
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 87b335a93530..8ad06d69895b 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -1723,6 +1723,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> > return VM_FAULT_RETRY;
> >
> > mmap_read_unlock(mm);
> > + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> > if (vmf->flags & FAULT_FLAG_KILLABLE)
> > folio_wait_locked_killable(folio);
> > else
> > @@ -1735,6 +1736,7 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> > ret = __folio_lock_killable(folio);
> > if (ret) {
> > mmap_read_unlock(mm);
> > + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> > return VM_FAULT_RETRY;
> > }
> > } else {
>
> IIRC we've discussed about this bits in previous version, and the consensus
> was that we don't need yet another flag? Just to recap: I think relying on
> RETRY|COMPLETE would be enough for vma lock, as NOWAIT is only used by gup
> while not affecting vma lockings, no?
Sorry for missing that point. I focused on making VMA locks being
dropped for RETRY|COMPLETE and forgot to check after that change if
RETRY|COMPLETE is enough indication to conclude that VMA lock is
dropped. Looking at that now, I'm not sure that would be always true
for file-backed page faults (including shmem_fault()), but we do not
handle them under VMA locks for now anyway, so this indeed seems like
a safe assumption. When Matthew implements file-backed support he
needs to be careful to ensure this rule still holds. With your
suggestions to drop the VMA lock at the place where we return RETRY
this seems to indeed eliminate the need for FAULT_FLAG_LOCK_DROPPED
and simplifies things. I'll try that approach and see if anything
blows up.
>
> As mentioned in the other reply, even COMPLETE won't appear for vma lock
> path yet afaict, so mostly only RETRY matters here and it can 100% imply a
> lock release happened. It's just that it's very easy to still cover
> COMPLETE altogether in this case, being prepared for any possible shared
> support on vma locks, IMHO.
Yes and I do introduce one place where we use COMPLETE with VMA locks,
so will cover it the same way as for RETRY.
Thanks,
Suren.
>
> Thanks,
>
> --
> Peter Xu
>
On Mon, Jun 26, 2023 at 09:23:21PM -0700, Suren Baghdasaryan wrote:
> Enable handle_userfault to operate under VMA lock by releasing VMA lock
> instead of mmap_lock and retrying.
This mostly good to me (besides the new DROP flag.. of course), thanks.
Still some nitpicks below.
>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> ---
> fs/userfaultfd.c | 42 ++++++++++++++++++++++--------------------
> mm/memory.c | 9 ---------
> 2 files changed, 22 insertions(+), 29 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 4e800bb7d2ab..b88632c404b6 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -277,17 +277,17 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
> * hugepmd ranges.
> */
> static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> - struct vm_area_struct *vma,
> - unsigned long address,
> - unsigned long flags,
> - unsigned long reason)
> + struct vm_fault *vmf,
> + unsigned long reason)
> {
> + struct vm_area_struct *vma = vmf->vma;
> pte_t *ptep, pte;
> bool ret = true;
>
> - mmap_assert_locked(ctx->mm);
> + if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> + mmap_assert_locked(ctx->mm);
Maybe we can have a helper asserting proper vma protector locks (mmap for
!VMA_LOCK and vma read lock for VMA_LOCK)? It basically tells the context
the vma is still safe to access.
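Something like the sketch below, perhaps (the helper name is made up here,
and it assumes a per-VMA read-lock assertion exists or gets added):

	static inline void assert_fault_locked(struct vm_fault *vmf)
	{
		if (vmf->flags & FAULT_FLAG_VMA_LOCK)
			/* hypothetical per-VMA read-lock assertion */
			vma_assert_locked(vmf->vma);
		else
			mmap_assert_locked(vmf->vma->vm_mm);
	}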
>
> - ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
> + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
> if (!ptep)
> goto out;
>
> @@ -308,10 +308,8 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> }
> #else
> static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> - struct vm_area_struct *vma,
> - unsigned long address,
> - unsigned long flags,
> - unsigned long reason)
> + struct vm_fault *vmf,
> + unsigned long reason)
> {
> return false; /* should never get here */
> }
> @@ -325,11 +323,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> * threads.
> */
> static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
> - unsigned long address,
> - unsigned long flags,
> + struct vm_fault *vmf,
> unsigned long reason)
> {
> struct mm_struct *mm = ctx->mm;
> + unsigned long address = vmf->address;
> pgd_t *pgd;
> p4d_t *p4d;
> pud_t *pud;
> @@ -337,7 +335,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
> pte_t *pte;
> bool ret = true;
>
> - mmap_assert_locked(mm);
> + if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> + mmap_assert_locked(mm);
(the assert helper can also be used here)
>
> pgd = pgd_offset(mm, address);
> if (!pgd_present(*pgd))
> @@ -445,7 +444,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
> * Coredumping runs without mmap_lock so we can only check that
> * the mmap_lock is held, if PF_DUMPCORE was not set.
> */
> - mmap_assert_locked(mm);
> + if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> + mmap_assert_locked(mm);
>
> ctx = vma->vm_userfaultfd_ctx.ctx;
> if (!ctx)
> @@ -561,15 +561,17 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
> spin_unlock_irq(&ctx->fault_pending_wqh.lock);
>
> if (!is_vm_hugetlb_page(vma))
> - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
> - reason);
> + must_wait = userfaultfd_must_wait(ctx, vmf, reason);
> else
> - must_wait = userfaultfd_huge_must_wait(ctx, vma,
> - vmf->address,
> - vmf->flags, reason);
> + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
> if (is_vm_hugetlb_page(vma))
> hugetlb_vma_unlock_read(vma);
> - mmap_read_unlock(mm);
> + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> + /* WARNING: VMA can't be used after this */
> + vma_end_read(vma);
> + } else
> + mmap_read_unlock(mm);
I also think maybe we should have a helper mm_release_fault_lock() just
release different locks for with/without VMA_LOCK. It can also be used in
the other patch of folio_lock_or_retry().
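For example (sketch only, using the name suggested above):

	static inline void mm_release_fault_lock(struct vm_fault *vmf)
	{
		if (vmf->flags & FAULT_FLAG_VMA_LOCK)
			vma_end_read(vmf->vma);
		else
			mmap_read_unlock(vmf->vma->vm_mm);
	}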
> + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
>
> if (likely(must_wait && !READ_ONCE(ctx->released))) {
> wake_up_poll(&ctx->fd_wqh, EPOLLIN);
> diff --git a/mm/memory.c b/mm/memory.c
> index bdf46fdc58d6..923c1576bd14 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5316,15 +5316,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> if (!vma_start_read(vma))
> goto inval;
>
> - /*
> - * Due to the possibility of userfault handler dropping mmap_lock, avoid
> - * it for now and fall back to page fault handling under mmap_lock.
> - */
> - if (userfaultfd_armed(vma)) {
> - vma_end_read(vma);
> - goto inval;
> - }
> -
> /* Check since vm_start/vm_end might change before we lock the VMA */
> if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
> vma_end_read(vma);
> --
> 2.41.0.178.g377b9f9a00-goog
>
--
Peter Xu
On Tue, Jun 27, 2023 at 8:54 AM Peter Xu <[email protected]> wrote:
>
> On Mon, Jun 26, 2023 at 09:23:21PM -0700, Suren Baghdasaryan wrote:
> > Enable handle_userfault to operate under VMA lock by releasing VMA lock
> > instead of mmap_lock and retrying.
>
> This mostly good to me (besides the new DROP flag.. of course), thanks.
> Still some nitpicks below.
>
> >
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > ---
> > fs/userfaultfd.c | 42 ++++++++++++++++++++++--------------------
> > mm/memory.c | 9 ---------
> > 2 files changed, 22 insertions(+), 29 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 4e800bb7d2ab..b88632c404b6 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -277,17 +277,17 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
> > * hugepmd ranges.
> > */
> > static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> > - struct vm_area_struct *vma,
> > - unsigned long address,
> > - unsigned long flags,
> > - unsigned long reason)
> > + struct vm_fault *vmf,
> > + unsigned long reason)
> > {
> > + struct vm_area_struct *vma = vmf->vma;
> > pte_t *ptep, pte;
> > bool ret = true;
> >
> > - mmap_assert_locked(ctx->mm);
> > + if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> > + mmap_assert_locked(ctx->mm);
>
> Maybe we can have a helper asserting the proper vma protection lock (mmap
> lock for !VMA_LOCK and vma read lock for VMA_LOCK)? It basically tells the
> context that the vma is still safe to access.
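A rough sketch of what such an assert helper might look like; the name and the
per-VMA assertion primitive used below are assumptions for illustration, not
something taken from the posted series:

static inline void assert_fault_locked(struct vm_fault *vmf)
{
	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
		vma_assert_locked(vmf->vma);	/* assumed per-VMA lock assert */
	else
		mmap_assert_locked(vmf->vma->vm_mm);
}

Each of the three conditional mmap_assert_locked() checks in this patch could
then become a single assert_fault_locked(vmf) call.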
>
> >
> > - ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
> > + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
> > if (!ptep)
> > goto out;
> >
> > @@ -308,10 +308,8 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> > }
> > #else
> > static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> > - struct vm_area_struct *vma,
> > - unsigned long address,
> > - unsigned long flags,
> > - unsigned long reason)
> > + struct vm_fault *vmf,
> > + unsigned long reason)
> > {
> > return false; /* should never get here */
> > }
> > @@ -325,11 +323,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> > * threads.
> > */
> > static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
> > - unsigned long address,
> > - unsigned long flags,
> > + struct vm_fault *vmf,
> > unsigned long reason)
> > {
> > struct mm_struct *mm = ctx->mm;
> > + unsigned long address = vmf->address;
> > pgd_t *pgd;
> > p4d_t *p4d;
> > pud_t *pud;
> > @@ -337,7 +335,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
> > pte_t *pte;
> > bool ret = true;
> >
> > - mmap_assert_locked(mm);
> > + if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> > + mmap_assert_locked(mm);
>
> (the assert helper can also be used here)
>
> >
> > pgd = pgd_offset(mm, address);
> > if (!pgd_present(*pgd))
> > @@ -445,7 +444,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
> > * Coredumping runs without mmap_lock so we can only check that
> > * the mmap_lock is held, if PF_DUMPCORE was not set.
> > */
> > - mmap_assert_locked(mm);
> > + if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> > + mmap_assert_locked(mm);
> >
> > ctx = vma->vm_userfaultfd_ctx.ctx;
> > if (!ctx)
> > @@ -561,15 +561,17 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
> > spin_unlock_irq(&ctx->fault_pending_wqh.lock);
> >
> > if (!is_vm_hugetlb_page(vma))
> > - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
> > - reason);
> > + must_wait = userfaultfd_must_wait(ctx, vmf, reason);
> > else
> > - must_wait = userfaultfd_huge_must_wait(ctx, vma,
> > - vmf->address,
> > - vmf->flags, reason);
> > + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
> > if (is_vm_hugetlb_page(vma))
> > hugetlb_vma_unlock_read(vma);
> > - mmap_read_unlock(mm);
> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > + /* WARNING: VMA can't be used after this */
> > + vma_end_read(vma);
> > + } else
> > + mmap_read_unlock(mm);
>
> I also think maybe we should have a helper mm_release_fault_lock() that just
> releases the appropriate lock for the with/without VMA_LOCK cases. It could
> also be used in the other patch for folio_lock_or_retry().
These all seem like good suggestions. I'll try implementing them in the
next version. Thanks!
>
> > + vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> >
> > if (likely(must_wait && !READ_ONCE(ctx->released))) {
> > wake_up_poll(&ctx->fd_wqh, EPOLLIN);
> > diff --git a/mm/memory.c b/mm/memory.c
> > index bdf46fdc58d6..923c1576bd14 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -5316,15 +5316,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> > if (!vma_start_read(vma))
> > goto inval;
> >
> > - /*
> > - * Due to the possibility of userfault handler dropping mmap_lock, avoid
> > - * it for now and fall back to page fault handling under mmap_lock.
> > - */
> > - if (userfaultfd_armed(vma)) {
> > - vma_end_read(vma);
> > - goto inval;
> > - }
> > -
> > /* Check since vm_start/vm_end might change before we lock the VMA */
> > if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
> > vma_end_read(vma);
> > --
> > 2.41.0.178.g377b9f9a00-goog
> >
>
> --
> Peter Xu
>
>
On Tue, Jun 27, 2023 at 8:41 AM Peter Xu <[email protected]> wrote:
>
> On Mon, Jun 26, 2023 at 09:23:19PM -0700, Suren Baghdasaryan wrote:
> > When a page fault is handled under per-VMA lock protection, all swap page
> > faults are retried with mmap_lock because folio_lock_fault (formerly
> > known as folio_lock_or_retry) had to drop and reacquire mmap_lock
> > if the folio could not be immediately locked.
> > Follow the same pattern as mmap_lock and drop the per-VMA lock when waiting
> > for a folio in folio_lock_fault, retrying once the folio is available.
> > With this obstacle removed, enable do_swap_page to operate under
> > per-VMA lock protection. Drivers implementing ops->migrate_to_ram might
> > still rely on mmap_lock, therefore we have to fall back to mmap_lock in
> > that particular case.
> > Note that the only time do_swap_page calls synchronous swap_readpage
> > is when SWP_SYNCHRONOUS_IO is set, which is only set for
> > QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and
> > pmem). Therefore we don't sleep in this path, and there's no need to
> > drop the mmap or per-VMA lock.
> >
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > ---
> > mm/filemap.c | 24 ++++++++++++++++--------
> > mm/memory.c | 21 ++++++++++++++-------
> > 2 files changed, 30 insertions(+), 15 deletions(-)
> >
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 8ad06d69895b..683f11f244cd 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -1703,12 +1703,14 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
> > * Return values:
> > * 0 - folio is locked.
> > * VM_FAULT_RETRY - folio is not locked.
> > - * mmap_lock has been released (mmap_read_unlock(), unless flags had both
> > - * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
> > - * which case mmap_lock is still held.
> > + * FAULT_FLAG_LOCK_DROPPED bit in vmf flags will be set if mmap_lock or
>
> This "FAULT_FLAG_LOCK_DROPPED" should belong to that patch when introduced.
> But again I still think this flag as a whole with that patch is not needed
> and should be dropped, unless I miss something important..
>
> > + * per-VMA lock got dropped. mmap_lock/per-VMA lock is dropped when
> > + * function fails to lock the folio, unless flags had both
> > + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in which case
> > + * the lock is still held.
> > *
> > * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
> > - * with the folio locked and the mmap_lock unperturbed.
> > + * with the folio locked and the mmap_lock/per-VMA lock unperturbed.
> > */
> > vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> > {
> > @@ -1716,13 +1718,16 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> >
> > if (fault_flag_allow_retry_first(vmf->flags)) {
> > /*
> > - * CAUTION! In this case, mmap_lock is not released
> > - * even though return VM_FAULT_RETRY.
> > + * CAUTION! In this case, mmap_lock/per-VMA lock is not
> > + * released even though returning VM_FAULT_RETRY.
> > */
> > if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
> > return VM_FAULT_RETRY;
> >
> > - mmap_read_unlock(mm);
> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> > + vma_end_read(vmf->vma);
> > + else
> > + mmap_read_unlock(mm);
> > vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> > if (vmf->flags & FAULT_FLAG_KILLABLE)
> > folio_wait_locked_killable(folio);
> > @@ -1735,7 +1740,10 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> >
> > ret = __folio_lock_killable(folio);
> > if (ret) {
> > - mmap_read_unlock(mm);
> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> > + vma_end_read(vmf->vma);
> > + else
> > + mmap_read_unlock(mm);
> > vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> > return VM_FAULT_RETRY;
> > }
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 3c2acafcd7b6..5caaa4c66ea2 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3712,11 +3712,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > if (!pte_unmap_same(vmf))
> > goto out;
> >
> > - if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
>
> So, in the version I imagine, here we'd already have the vma_end_read() and
> this patch would remove it, which makes sense. Then...
>
> > - ret = VM_FAULT_RETRY;
> > - goto out;
> > - }
> > -
> > entry = pte_to_swp_entry(vmf->orig_pte);
> > if (unlikely(non_swap_entry(entry))) {
> > if (is_migration_entry(entry)) {
> > @@ -3726,6 +3721,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > vmf->page = pfn_swap_entry_to_page(entry);
> > ret = remove_device_exclusive_entry(vmf);
> > } else if (is_device_private_entry(entry)) {
> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > + /*
> > + * migrate_to_ram is not yet ready to operate
> > + * under VMA lock.
> > + */
>
> ... here we probably just do vma_end_read(), then...
>
> > + ret |= VM_FAULT_RETRY;
> > + goto out;
> > + }
> > +
> > vmf->page = pfn_swap_entry_to_page(entry);
> > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > vmf->address, &vmf->ptl);
> > @@ -5089,9 +5093,12 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> > /*
> > * In case of VM_FAULT_RETRY or VM_FAULT_COMPLETED we might
> > * be still holding per-VMA lock to keep the vma stable as long
> > - * as possible. Drop it before returning.
> > + * as possible. In this situation vmf.flags has
> > + * FAULT_FLAG_VMA_LOCK set and FAULT_FLAG_LOCK_DROPPED unset.
> > + * Drop the lock before returning when this happens.
> > */
> > - if (vmf.flags & FAULT_FLAG_VMA_LOCK)
> > + if ((vmf.flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_LOCK_DROPPED)) ==
> > + FAULT_FLAG_VMA_LOCK)
> > vma_end_read(vma);
>
> This whole chunk should have been dropped altogether per my comment on the
> previous patch, iiuc, and it should be a no-op anyway for the swap case. For
> the real "waiting for page lock during swapin" phase we should always 100%
> release the vma lock in folio_lock_or_retry() - just like the mmap lock.
Yep, we drop FAULT_FLAG_LOCK_DROPPED and release the vma lock when we return
RETRY, and that makes all this unnecessary. I just need to make sure we do not
access the VMA after we drop its lock, since we will now be releasing it
earlier than before.
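For reference, a rough sketch of what that simplification could look like in
__folio_lock_fault()'s failure path, with no FAULT_FLAG_LOCK_DROPPED
bookkeeping (mm_release_fault_lock() is the hypothetical helper sketched
earlier in the handle_userfault discussion):

	ret = __folio_lock_killable(folio);
	if (ret) {
		/* drop whichever lock the fault path holds, then retry */
		mm_release_fault_lock(vmf);
		return VM_FAULT_RETRY;
	}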
>
> Thanks,
>
> > }
> > return ret;
> > --
> > 2.41.0.178.g377b9f9a00-goog
> >
>
> --
> Peter Xu
>
On Tue, Jun 27, 2023 at 09:05:32AM -0700, Suren Baghdasaryan wrote:
> On Tue, Jun 27, 2023 at 8:41 AM Peter Xu <[email protected]> wrote:
> >
> > On Mon, Jun 26, 2023 at 09:23:19PM -0700, Suren Baghdasaryan wrote:
> > > When a page fault is handled under per-VMA lock protection, all swap page
> > > faults are retried with mmap_lock because folio_lock_fault (formerly
> > > known as folio_lock_or_retry) had to drop and reacquire mmap_lock
> > > if the folio could not be immediately locked.
> > > Follow the same pattern as mmap_lock and drop the per-VMA lock when waiting
> > > for a folio in folio_lock_fault, retrying once the folio is available.
> > > With this obstacle removed, enable do_swap_page to operate under
> > > per-VMA lock protection. Drivers implementing ops->migrate_to_ram might
> > > still rely on mmap_lock, therefore we have to fall back to mmap_lock in
> > > that particular case.
> > > Note that the only time do_swap_page calls synchronous swap_readpage
> > > is when SWP_SYNCHRONOUS_IO is set, which is only set for
> > > QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and
> > > pmem). Therefore we don't sleep in this path, and there's no need to
> > > drop the mmap or per-VMA lock.
> > >
> > > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > > ---
> > > mm/filemap.c | 24 ++++++++++++++++--------
> > > mm/memory.c | 21 ++++++++++++++-------
> > > 2 files changed, 30 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > index 8ad06d69895b..683f11f244cd 100644
> > > --- a/mm/filemap.c
> > > +++ b/mm/filemap.c
> > > @@ -1703,12 +1703,14 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
> > > * Return values:
> > > * 0 - folio is locked.
> > > * VM_FAULT_RETRY - folio is not locked.
> > > - * mmap_lock has been released (mmap_read_unlock(), unless flags had both
> > > - * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
> > > - * which case mmap_lock is still held.
> > > + * FAULT_FLAG_LOCK_DROPPED bit in vmf flags will be set if mmap_lock or
> >
> > This "FAULT_FLAG_LOCK_DROPPED" should belong to that patch when introduced.
> > But again I still think this flag as a whole with that patch is not needed
> > and should be dropped, unless I miss something important..
> >
> > > + * per-VMA lock got dropped. mmap_lock/per-VMA lock is dropped when
> > > + * function fails to lock the folio, unless flags had both
> > > + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in which case
> > > + * the lock is still held.
> > > *
> > > * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
> > > - * with the folio locked and the mmap_lock unperturbed.
> > > + * with the folio locked and the mmap_lock/per-VMA lock unperturbed.
> > > */
> > > vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> > > {
> > > @@ -1716,13 +1718,16 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> > >
> > > if (fault_flag_allow_retry_first(vmf->flags)) {
> > > /*
> > > - * CAUTION! In this case, mmap_lock is not released
> > > - * even though return VM_FAULT_RETRY.
> > > + * CAUTION! In this case, mmap_lock/per-VMA lock is not
> > > + * released even though returning VM_FAULT_RETRY.
> > > */
> > > if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
> > > return VM_FAULT_RETRY;
> > >
> > > - mmap_read_unlock(mm);
> > > + if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> > > + vma_end_read(vmf->vma);
> > > + else
> > > + mmap_read_unlock(mm);
> > > vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> > > if (vmf->flags & FAULT_FLAG_KILLABLE)
> > > folio_wait_locked_killable(folio);
> > > @@ -1735,7 +1740,10 @@ vm_fault_t __folio_lock_fault(struct folio *folio, struct vm_fault *vmf)
> > >
> > > ret = __folio_lock_killable(folio);
> > > if (ret) {
> > > - mmap_read_unlock(mm);
> > > + if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> > > + vma_end_read(vmf->vma);
> > > + else
> > > + mmap_read_unlock(mm);
> > > vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
> > > return VM_FAULT_RETRY;
> > > }
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 3c2acafcd7b6..5caaa4c66ea2 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -3712,11 +3712,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > > if (!pte_unmap_same(vmf))
> > > goto out;
> > >
> > > - if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> >
> > So, in the version I imagine, here we'd already have the vma_end_read() and
> > this patch would remove it, which makes sense. Then...
> >
> > > - ret = VM_FAULT_RETRY;
> > > - goto out;
> > > - }
> > > -
> > > entry = pte_to_swp_entry(vmf->orig_pte);
> > > if (unlikely(non_swap_entry(entry))) {
> > > if (is_migration_entry(entry)) {
> > > @@ -3726,6 +3721,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > > vmf->page = pfn_swap_entry_to_page(entry);
> > > ret = remove_device_exclusive_entry(vmf);
> > > } else if (is_device_private_entry(entry)) {
> > > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > > + /*
> > > + * migrate_to_ram is not yet ready to operate
> > > + * under VMA lock.
> > > + */
> >
> > ... here we probably just do vma_end_read(), then...
> >
> > > + ret |= VM_FAULT_RETRY;
> > > + goto out;
> > > + }
> > > +
> > > vmf->page = pfn_swap_entry_to_page(entry);
> > > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > > vmf->address, &vmf->ptl);
> > > @@ -5089,9 +5093,12 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> > > /*
> > > * In case of VM_FAULT_RETRY or VM_FAULT_COMPLETED we might
> > > * be still holding per-VMA lock to keep the vma stable as long
> > > - * as possible. Drop it before returning.
> > > + * as possible. In this situation vmf.flags has
> > > + * FAULT_FLAG_VMA_LOCK set and FAULT_FLAG_LOCK_DROPPED unset.
> > > + * Drop the lock before returning when this happens.
> > > */
> > > - if (vmf.flags & FAULT_FLAG_VMA_LOCK)
> > > + if ((vmf.flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_LOCK_DROPPED)) ==
> > > + FAULT_FLAG_VMA_LOCK)
> > > vma_end_read(vma);
> >
> > This whole chunk should have been dropped altogether per my comment on the
> > previous patch, iiuc, and it should be a no-op anyway for the swap case. For
> > the real "waiting for page lock during swapin" phase we should always 100%
> > release the vma lock in folio_lock_or_retry() - just like the mmap lock.
>
> Yep, we drop FAULT_FLAG_LOCK_DROPPED and release the vma lock when we return
> RETRY, and that makes all this unnecessary. I just need to make sure we do not
> access the VMA after we drop its lock, since we will now be releasing it
> earlier than before.
Yes. Hopefully that's not a problem, because the mmap lock behaves the same
way. And that's also the benefit of keeping both locks behaving the same
whenever possible: the problem is really shared, imho.
--
Peter Xu
On Tue, Jun 27, 2023 at 8:49 AM Peter Xu <[email protected]> wrote:
>
> On Mon, Jun 26, 2023 at 09:23:20PM -0700, Suren Baghdasaryan wrote:
> > migration_entry_wait does not need the VMA lock, therefore it can be
> > dropped before waiting.
>
> Hmm, I'm not sure..
>
> Note that we're still dereferencing *vmf->pmd when waiting, while *pmd lives
> in the page table, which IIUC is only guaranteed to stay around if the vma is
> still there. Without either the mmap or the vma lock I don't see what makes
> sure the pgtable is always there. E.g. IIUC a race can happen where unmap()
> runs right after vma_end_read() below but before pmdp_get_lockless() (inside
> migration_entry_wait()); pmdp_get_lockless() can then read random contents if
> the pgtable has been freed.
That sounds correct. I thought the ptl would keep the pmd stable, but there is
a window between vma_end_read() and spin_lock(ptl) during which it can be freed
from under us. I think it would work if we did vma_end_read() after
spin_lock(ptl), but that requires code refactoring. I'll probably drop
this optimization from the patchset for now to keep things simple and
will get back to it later.
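Purely to illustrate the ordering described above, a hypothetical sketch (not
part of the posted series; it assumes a migration_entry_wait_on_locked()-style
helper that is entered with the PTE lock held and drops it before sleeping, and
a real version would also re-validate the pte under the lock):

	if (is_migration_entry(entry)) {
		/* pin the page table while the VMA is still stable */
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					       vmf->address, &vmf->ptl);
		if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
			vma_end_read(vma);	/* VMA can't be used after this */
			ret |= VM_FAULT_COMPLETED;
		}
		/* waits for the migration to finish, drops ptl internally */
		migration_entry_wait_on_locked(entry, vmf->pte, vmf->ptl);
	}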
>
> >
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > ---
> > mm/memory.c | 14 ++++++++++++--
> > 1 file changed, 12 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 5caaa4c66ea2..bdf46fdc58d6 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3715,8 +3715,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > entry = pte_to_swp_entry(vmf->orig_pte);
> > if (unlikely(non_swap_entry(entry))) {
> > if (is_migration_entry(entry)) {
> > - migration_entry_wait(vma->vm_mm, vmf->pmd,
> > - vmf->address);
> > + /* Save mm in case VMA lock is dropped */
> > + struct mm_struct *mm = vma->vm_mm;
> > +
> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > + /*
> > + * No need to hold VMA lock for migration.
> > + * WARNING: vma can't be used after this!
> > + */
> > + vma_end_read(vma);
> > + ret |= VM_FAULT_COMPLETED;
> > + }
> > + migration_entry_wait(mm, vmf->pmd, vmf->address);
> > } else if (is_device_exclusive_entry(entry)) {
> > vmf->page = pfn_swap_entry_to_page(entry);
> > ret = remove_device_exclusive_entry(vmf);
> > --
> > 2.41.0.178.g377b9f9a00-goog
> >
>
> --
> Peter Xu
>
On Tue, Jun 27, 2023 at 8:22 AM Peter Xu <[email protected]> wrote:
>
> On Mon, Jun 26, 2023 at 09:23:17PM -0700, Suren Baghdasaryan wrote:
> > Change folio_lock_or_retry to accept a vm_fault struct and return a
> > vm_fault_t directly. This will be used later to return additional
> > information about the state of the mmap_lock upon return from this
> > function.
> >
> > Suggested-by: Matthew Wilcox <[email protected]>
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
>
> The patch looks all fine to me except for the renaming...
>
> *_fault() makes me think of a fault handler, while *_lock_or_retry() has been
> there for years and still sounds better than the new name to me.
>
> Can we still come up with a better renaming, or just keep the name?
I thought about alternatives but could not find anything better. I can
keep the old name if that is preferred.
>
> Thanks,
>
> --
> Peter Xu
>
>
On Tue, Jun 27, 2023 at 8:28 AM Peter Xu <[email protected]> wrote:
>
> On Mon, Jun 26, 2023 at 09:23:16PM -0700, Suren Baghdasaryan wrote:
> > handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means
> > mmap_lock has been released. However, with per-VMA locks the behavior is
> > different and the caller should still release the lock. To make the
> > rules consistent for the caller, drop the per-VMA lock before returning
> > from handle_mm_fault when the page fault should be retried or is completed.
> >
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > ---
> > arch/arm64/mm/fault.c | 3 ++-
> > arch/powerpc/mm/fault.c | 3 ++-
> > arch/s390/mm/fault.c | 3 ++-
> > arch/x86/mm/fault.c | 3 ++-
> > mm/memory.c | 12 +++++++++++-
> > 5 files changed, 19 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> > index 6045a5117ac1..89f84e9ea1ff 100644
> > --- a/arch/arm64/mm/fault.c
> > +++ b/arch/arm64/mm/fault.c
> > @@ -601,7 +601,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
> > goto lock_mmap;
> > }
> > fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
> > - vma_end_read(vma);
> > + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> > + vma_end_read(vma);
> >
> > if (!(fault & VM_FAULT_RETRY)) {
> > count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> > diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> > index 531177a4ee08..4697c5dca31c 100644
> > --- a/arch/powerpc/mm/fault.c
> > +++ b/arch/powerpc/mm/fault.c
> > @@ -494,7 +494,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
> > }
> >
> > fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
> > - vma_end_read(vma);
> > + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> > + vma_end_read(vma);
> >
> > if (!(fault & VM_FAULT_RETRY)) {
> > count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> > diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
> > index b65144c392b0..cccefe41038b 100644
> > --- a/arch/s390/mm/fault.c
> > +++ b/arch/s390/mm/fault.c
> > @@ -418,7 +418,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
> > goto lock_mmap;
> > }
> > fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
> > - vma_end_read(vma);
> > + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> > + vma_end_read(vma);
> > if (!(fault & VM_FAULT_RETRY)) {
> > count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> > goto out;
> > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> > index e4399983c50c..d69c85c1c04e 100644
> > --- a/arch/x86/mm/fault.c
> > +++ b/arch/x86/mm/fault.c
> > @@ -1347,7 +1347,8 @@ void do_user_addr_fault(struct pt_regs *regs,
> > goto lock_mmap;
> > }
> > fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
> > - vma_end_read(vma);
> > + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> > + vma_end_read(vma);
> >
> > if (!(fault & VM_FAULT_RETRY)) {
> > count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> > diff --git a/mm/memory.c b/mm/memory.c
> > index f69fbc251198..9011ad63c41b 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -5086,7 +5086,17 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> > }
> > }
> >
> > - return handle_pte_fault(&vmf);
> > + ret = handle_pte_fault(&vmf);
> > + if (ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) {
> > + /*
> > + * In case of VM_FAULT_RETRY or VM_FAULT_COMPLETED we might
> > + * be still holding per-VMA lock to keep the vma stable as long
> > + * as possible. Drop it before returning.
> > + */
> > + if (vmf.flags & FAULT_FLAG_VMA_LOCK)
> > + vma_end_read(vma);
> > + }
>
> This smells hackish... I think it would be better to just release the lock at
> the place where we return RETRY, and AFAIU swap is the only place where the
> vma lock path returns RETRY with the current code base?
>
> do_swap_page():
> if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> + vma_end_read(vma);
> ret = VM_FAULT_RETRY;
> goto out;
> }
>
> I.e., I don't think VM_FAULT_COMPLETED can even be returned from the vma lock
> paths yet, as they don't yet support VM_SHARED.
Ack.
>
> --
> Peter Xu
>
Suren Baghdasaryan <[email protected]> writes:
> On Tue, Jun 27, 2023 at 8:49 AM Peter Xu <[email protected]> wrote:
>>
>> On Mon, Jun 26, 2023 at 09:23:20PM -0700, Suren Baghdasaryan wrote:
>> > migration_entry_wait does not need the VMA lock, therefore it can be
>> > dropped before waiting.
>>
>> Hmm, I'm not sure..
>>
>> Note that we're still dereferencing *vmf->pmd when waiting, while *pmd lives
>> in the page table, which IIUC is only guaranteed to stay around if the vma is
>> still there. Without either the mmap or the vma lock I don't see what makes
>> sure the pgtable is always there. E.g. IIUC a race can happen where unmap()
>> runs right after vma_end_read() below but before pmdp_get_lockless() (inside
>> migration_entry_wait()); pmdp_get_lockless() can then read random contents if
>> the pgtable has been freed.
>
> That sounds correct. I thought the ptl would keep the pmd stable, but there is
> a window between vma_end_read() and spin_lock(ptl) during which it can be freed
> from under us. I think it would work if we did vma_end_read() after
> spin_lock(ptl), but that requires code refactoring. I'll probably drop
> this optimization from the patchset for now to keep things simple and
> will get back to it later.
Oh, thanks Peter, that's a good point. It could be made to work, but I agree
it's probably not worth the code refactoring at this point, so I'm OK with
dropping the optimisation for now.
>>
>> >
>> > Signed-off-by: Suren Baghdasaryan <[email protected]>
>> > ---
>> > mm/memory.c | 14 ++++++++++++--
>> > 1 file changed, 12 insertions(+), 2 deletions(-)
>> >
>> > diff --git a/mm/memory.c b/mm/memory.c
>> > index 5caaa4c66ea2..bdf46fdc58d6 100644
>> > --- a/mm/memory.c
>> > +++ b/mm/memory.c
>> > @@ -3715,8 +3715,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> > entry = pte_to_swp_entry(vmf->orig_pte);
>> > if (unlikely(non_swap_entry(entry))) {
>> > if (is_migration_entry(entry)) {
>> > - migration_entry_wait(vma->vm_mm, vmf->pmd,
>> > - vmf->address);
>> > + /* Save mm in case VMA lock is dropped */
>> > + struct mm_struct *mm = vma->vm_mm;
>> > +
>> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
>> > + /*
>> > + * No need to hold VMA lock for migration.
>> > + * WARNING: vma can't be used after this!
>> > + */
>> > + vma_end_read(vma);
>> > + ret |= VM_FAULT_COMPLETED;
>> > + }
>> > + migration_entry_wait(mm, vmf->pmd, vmf->address);
>> > } else if (is_device_exclusive_entry(entry)) {
>> > vmf->page = pfn_swap_entry_to_page(entry);
>> > ret = remove_device_exclusive_entry(vmf);
>> > --
>> > 2.41.0.178.g377b9f9a00-goog
>> >
>>
>> --
>> Peter Xu
>>