Unlike the MADV_DONTNEED case, MADV_FREE just marks the physical
page as lazyfree instead of unmapping it immediately, and the physical
page will not be unmapped until system memory becomes tight. So we
convert the percpu_ref of the related user PTE page table page to
atomic mode in madvise_free_pte_range(), and then check in
try_to_unmap_one() whether its refcount has dropped to 0. If it is 0,
we can safely reclaim the PTE page table page at that point.
Signed-off-by: Qi Zheng <[email protected]>
---
include/linux/rmap.h | 2 ++
mm/madvise.c | 7 ++++++-
mm/page_vma_mapped.c | 46 ++++++++++++++++++++++++++++++++++++++++++--
mm/rmap.c | 9 +++++++++
4 files changed, 61 insertions(+), 3 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 17230c458341..a3174d3bf118 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -204,6 +204,8 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
#define PVMW_SYNC (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION (1 << 1)
+/* Used for MADV_FREE page */
+#define PVMW_MADV_FREE (1 << 2)
struct page_vma_mapped_walk {
unsigned long pfn;
diff --git a/mm/madvise.c b/mm/madvise.c
index 8123397f14c8..bd4bcaad5a9f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -598,7 +598,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte, *pte, ptent;
struct page *page;
int nr_swap = 0;
+ bool have_lazyfree = false;
unsigned long next;
+ unsigned long start = addr;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
@@ -709,6 +711,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
tlb_remove_tlb_entry(tlb, pte, addr);
}
mark_page_lazyfree(page);
+ have_lazyfree = true;
}
out:
if (nr_swap) {
@@ -718,8 +721,10 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
}
arch_leave_lazy_mmu_mode();
- if (orig_pte)
+ if (orig_pte) {
pte_unmap_unlock(orig_pte, ptl);
+ try_to_free_user_pte(mm, pmd, start, !have_lazyfree);
+ }
cond_resched();
next:
return 0;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 8ecf8fd7cf5e..00bc09f57f48 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -266,8 +266,30 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
next_pte:
do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >= end)
- return not_found(pvmw);
+ if (pvmw->address >= end) {
+ not_found(pvmw);
+
+ if (pvmw->flags & PVMW_MADV_FREE) {
+ pgtable_t pte;
+ pmd_t pmdval;
+
+ pvmw->flags &= ~PVMW_MADV_FREE;
+ rcu_read_lock();
+ pmdval = READ_ONCE(*pvmw->pmd);
+ if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+ rcu_read_unlock();
+ return false;
+ }
+ pte = pmd_pgtable(pmdval);
+ if (percpu_ref_is_zero(pte->pte_ref)) {
+ rcu_read_unlock();
+ free_user_pte(mm, pvmw->pmd, pvmw->address);
+ } else {
+ rcu_read_unlock();
+ }
+ }
+ return false;
+ }
/* Did we cross page table boundary? */
if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
if (pvmw->ptl) {
@@ -275,6 +297,26 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
pvmw->ptl = NULL;
}
pte_unmap(pvmw->pte);
+ if (pvmw->flags & PVMW_MADV_FREE) {
+ pgtable_t pte;
+ pmd_t pmdval;
+
+ pvmw->flags &= ~PVMW_MADV_FREE;
+ rcu_read_lock();
+ pmdval = READ_ONCE(*pvmw->pmd);
+ if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+ rcu_read_unlock();
+ pvmw->pte = NULL;
+ goto restart;
+ }
+ pte = pmd_pgtable(pmdval);
+ if (percpu_ref_is_zero(pte->pte_ref)) {
+ rcu_read_unlock();
+ free_user_pte(mm, pvmw->pmd, pvmw->address);
+ } else {
+ rcu_read_unlock();
+ }
+ }
pvmw->pte = NULL;
goto restart;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index fedb82371efe..f978d324d4f9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1616,6 +1616,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
dec_mm_counter(mm, MM_ANONPAGES);
+ if (IS_ENABLED(CONFIG_FREE_USER_PTE))
+ pvmw.flags |= PVMW_MADV_FREE;
goto discard;
}
@@ -1627,6 +1629,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
folio_set_swapbacked(folio);
ret = false;
page_vma_mapped_walk_done(&pvmw);
+ if (IS_ENABLED(CONFIG_FREE_USER_PTE) &&
+ pte_tryget(mm, pvmw.pmd, address)) {
+ pgtable_t pte_page = pmd_pgtable(*pvmw.pmd);
+
+ percpu_ref_switch_to_percpu(pte_page->pte_ref);
+ __pte_put(pte_page);
+ }
break;
}
--
2.20.1