2021-11-10 10:57:49

by Qi Zheng

[permalink] [raw]
Subject: [PATCH v3 07/15] mm/pte_ref: add support for user PTE page table page allocation

When a PTE page table page is allocated and installed into the pmd
entry, it must take an initial reference count to prevent the PTE page
table page from being released by other threads. The caller of
pte_alloc() (or one of its friends) is then responsible for dropping
this reference count.

Signed-off-by: Qi Zheng <[email protected]>
---
include/linux/mm.h | 7 +++++--
mm/debug_vm_pgtable.c | 1 +
mm/filemap.c | 8 ++++++--
mm/gup.c | 10 +++++++---
mm/memory.c | 51 +++++++++++++++++++++++++++++++++++++++++----------
mm/migrate.c | 9 ++++++---
mm/mlock.c | 1 +
mm/mremap.c | 1 +
mm/userfaultfd.c | 16 +++++++++++++++-
9 files changed, 83 insertions(+), 21 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 52f36fde2f11..753a9435e0d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -26,6 +26,7 @@
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
+#include <linux/pte_ref.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
@@ -2313,9 +2314,11 @@ enum pmd_installed_type {

static inline int pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
- if (unlikely(pmd_none(*(pmd))))
+ enum pte_tryget_type ret = pte_try_get(pmd);
+
+ if (ret == TRYGET_FAILED_NONE || ret == TRYGET_FAILED_ZERO)
return __pte_alloc(mm, pmd);
- if (unlikely(is_huge_pmd(*pmd)))
+ else if (ret == TRYGET_FAILED_HUGE_PMD)
return INSTALLED_HUGE_PMD;

return INSTALLED_PTE;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index b8322c55e65d..52f006654664 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1048,6 +1048,7 @@ static void __init destroy_args(struct pgtable_debug_args *args)

/* Free page table entries */
if (args->start_ptep) {
+ pte_put(args->mm, args->start_pmdp, args->vaddr);
pte_free(args->mm, args->start_ptep);
mm_dec_nr_ptes(args->mm);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 23363f8ddbbe..1e7e9e4fd759 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3217,6 +3217,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
}
}

+retry:
if (pmd_none(*vmf->pmd)) {
int ret = pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);

@@ -3225,6 +3226,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
/* See comment in handle_pte_fault() */
goto out;
+ } else if (pte_try_get(vmf->pmd) == TRYGET_FAILED_ZERO) {
+ goto retry;
}

return false;
@@ -3301,7 +3304,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
- unsigned long addr;
+ unsigned long addr, start;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
@@ -3317,7 +3320,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
goto out;
}

- addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ start = addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
do {
page = find_subpage(head, xas.xa_index);
@@ -3348,6 +3351,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
put_page(head);
} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ pte_put(vma->vm_mm, vmf->pmd, start);
out:
rcu_read_unlock();
WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
diff --git a/mm/gup.c b/mm/gup.c
index 2def775232a3..e084111103f0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -694,7 +694,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
spin_unlock(ptl);
ret = 0;
split_huge_pmd(vma, pmd, address);
- if (pmd_trans_unstable(pmd))
+ if (pte_try_get(pmd) == TRYGET_FAILED_HUGE_PMD)
ret = -EBUSY;
} else {
spin_unlock(ptl);
@@ -702,8 +702,12 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
ret = pte_alloc(mm, pmd) < 0 ? -ENOMEM : 0;
}

- return ret ? ERR_PTR(ret) :
- follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ if (ret)
+ return ERR_PTR(ret);
+
+ page = follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ pte_put(mm, pmd, address);
+ return page;
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
diff --git a/mm/memory.c b/mm/memory.c
index 8a39c0e58324..0b9af38cfa11 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -441,10 +441,13 @@ enum pmd_installed_type pmd_install(struct mm_struct *mm, pmd_t *pmd,
pgtable_t *pte)
{
int ret = INSTALLED_PTE;
- spinlock_t *ptl = pmd_lock(mm, pmd);
+ spinlock_t *ptl;

+retry:
+ ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm_inc_nr_ptes(mm);
+ pte_ref_init(*pte, pmd, 1);
/*
* Ensure all pte setup (eg. pte page lock and page clearing) are
* visible before the pte is made visible to other CPUs by being
@@ -464,6 +467,9 @@ enum pmd_installed_type pmd_install(struct mm_struct *mm, pmd_t *pmd,
} else if (is_huge_pmd(*pmd)) {
/* See comment in handle_pte_fault() */
ret = INSTALLED_HUGE_PMD;
+ } else if (!pte_get_unless_zero(pmd)) {
+ spin_unlock(ptl);
+ goto retry;
}
spin_unlock(ptl);

@@ -1028,6 +1034,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
struct page *prealloc = NULL;
+ unsigned long start = addr;

again:
progress = 0;
@@ -1108,6 +1115,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_unmap(orig_src_pte);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
+ pte_put(dst_mm, dst_pmd, start);
cond_resched();

if (ret == -EIO) {
@@ -1778,6 +1786,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
goto out;
retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
+ pte_put(mm, pte_to_pmd(pte), addr);
out:
return retval;
}
@@ -1810,6 +1819,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
unsigned long remaining_pages_total = *num;
unsigned long pages_to_write_in_pmd;
int ret;
+ unsigned long start = addr;
more:
ret = -EFAULT;
pmd = walk_to_pmd(mm, addr);
@@ -1836,7 +1846,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
pte_unmap_unlock(start_pte, pte_lock);
ret = err;
remaining_pages_total -= pte_idx;
- goto out;
+ goto put;
}
addr += PAGE_SIZE;
++curr_page_idx;
@@ -1845,9 +1855,13 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
pages_to_write_in_pmd -= batch_size;
remaining_pages_total -= batch_size;
}
- if (remaining_pages_total)
+ if (remaining_pages_total) {
+ pte_put(mm, pmd, start);
goto more;
+ }
ret = 0;
+put:
+ pte_put(mm, pmd, start);
out:
*num = remaining_pages_total;
return ret;
@@ -2075,6 +2089,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,

out_unlock:
pte_unmap_unlock(pte, ptl);
+ pte_put(mm, pte_to_pmd(pte), addr);
return VM_FAULT_NOPAGE;
}

@@ -2275,6 +2290,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
+ unsigned long start = addr;
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
int err = 0;
@@ -2294,6 +2310,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(mapped_pte, ptl);
+ pte_put(mm, pmd, start);
return err;
}

@@ -2503,6 +2520,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
+ unsigned long start = addr;
pte_t *pte, *mapped_pte;
int err = 0;
spinlock_t *ptl;
@@ -2536,8 +2554,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,

arch_leave_lazy_mmu_mode();

- if (mm != &init_mm)
+ if (mm != &init_mm) {
pte_unmap_unlock(mapped_pte, ptl);
+ if (create)
+ pte_put(mm, pmd, start);
+ }
return err;
}

@@ -3761,7 +3782,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- return handle_userfault(vmf, VM_UFFD_MISSING);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
+ goto put;
}
goto setpte;
}
@@ -3804,7 +3826,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
put_page(page);
- return handle_userfault(vmf, VM_UFFD_MISSING);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
+ goto put;
}

inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
@@ -3817,14 +3840,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
- return ret;
+ goto put;
release:
put_page(page);
goto unlock;
oom_free_page:
put_page(page);
oom:
- return VM_FAULT_OOM;
+ ret = VM_FAULT_OOM;
+put:
+ pte_put(vma->vm_mm, vmf->pmd, vmf->address);
+ return ret;
}

/*
@@ -4031,7 +4057,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return ret;
}

- if (pmd_none(*vmf->pmd)) {
+retry:
+ ret = pte_try_get(vmf->pmd);
+ if (ret == TRYGET_FAILED_NONE) {
int alloc_ret;

if (PageTransCompound(page)) {
@@ -4047,9 +4075,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)

if (unlikely(alloc_ret != INSTALLED_PTE))
return alloc_ret < 0 ? VM_FAULT_OOM : 0;
- } else if (pmd_devmap_trans_unstable(vmf->pmd)) {
+ } else if (ret == TRYGET_FAILED_HUGE_PMD) {
/* See comment in handle_pte_fault() */
return 0;
+ } else if (ret == TRYGET_FAILED_ZERO) {
+ goto retry;
}

vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
@@ -4063,6 +4093,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)

update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ pte_put(vma->vm_mm, vmf->pmd, vmf->address);
return ret;
}

diff --git a/mm/migrate.c b/mm/migrate.c
index bdfdfd3b50be..26f16a4836d8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2736,9 +2736,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
goto abort;

if (unlikely(anon_vma_prepare(vma)))
- goto abort;
+ goto put;
if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
- goto abort;
+ goto put;

/*
* The memory barrier inside __SetPageUptodate makes sure that
@@ -2764,7 +2764,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
* device memory.
*/
pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
- goto abort;
+ goto put;
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
@@ -2811,11 +2811,14 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
}

pte_unmap_unlock(ptep, ptl);
+ pte_put(mm, pmdp, addr);
*src = MIGRATE_PFN_MIGRATE;
return;

unlock_abort:
pte_unmap_unlock(ptep, ptl);
+put:
+ pte_put(mm, pmdp, addr);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index e263d62ae2d0..a4ef20ba9627 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -398,6 +398,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
break;
}
pte_unmap_unlock(pte, ptl);
+ pte_put(vma->vm_mm, pte_to_pmd(pte), start);
return start;
}

diff --git a/mm/mremap.c b/mm/mremap.c
index fc5c56858883..f80c628db25d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -555,6 +555,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
break;
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
new_pmd, new_addr, need_rmap_locks);
+ pte_put(new_vma->vm_mm, new_pmd, new_addr);
}

mmu_notifier_invalidate_range_end(&range);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 2cea08e7f076..37df899a1b9d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -574,6 +574,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,

while (src_addr < src_start + len) {
pmd_t dst_pmdval;
+ enum pte_tryget_type tryget_type;

BUG_ON(dst_addr >= dst_start + len);

@@ -583,6 +584,14 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
break;
}

+again:
+ /*
+ * Once PTE pages are managed by reference count, a PTE page may
+ * be released by another thread (in RCU mode), so the RCU read
+ * lock is held here to prevent the PTE page from being
+ * released.
+ */
+ rcu_read_lock();
dst_pmdval = pmd_read_atomic(dst_pmd);
/*
* If the dst_pmd is mapped as THP don't
@@ -593,7 +602,9 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
break;
}

- if (unlikely(pmd_none(dst_pmdval))) {
+ tryget_type = pte_try_get(&dst_pmdval);
+ rcu_read_unlock();
+ if (unlikely(tryget_type == TRYGET_FAILED_NONE)) {
int ret = __pte_alloc(dst_mm, dst_pmd);

/*
@@ -607,6 +618,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
err = -EFAULT;
break;
}
+ } else if (unlikely(tryget_type == TRYGET_FAILED_ZERO)) {
+ goto again;
}

BUG_ON(pmd_none(*dst_pmd));
@@ -614,6 +627,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,

err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
src_addr, &page, mcopy_mode, wp_copy);
+ pte_put(dst_mm, dst_pmd, dst_addr);
cond_resched();

if (unlikely(err == -ENOENT)) {
--
2.11.0


2021-11-11 15:17:50

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v3 07/15] mm/pte_ref: add support for user PTE page table page allocation

Hi Qi,

Thank you for the patch! However, there is something to improve:

[auto build test ERROR on hnaz-mm/master]
[also build test ERROR on linus/master next-20211111]
[cannot apply to tip/perf/core tip/x86/core v5.15]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Qi-Zheng/Free-user-PTE-page-table-pages/20211110-185837
base: https://github.com/hnaz/linux-mm master
config: ia64-defconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/6e3cc5bb722cbd2fc4170d2f5371e52792d17d2e
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Qi-Zheng/Free-user-PTE-page-table-pages/20211110-185837
git checkout 6e3cc5bb722cbd2fc4170d2f5371e52792d17d2e
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=ia64

If you fix the issue, kindly add the following tag as appropriate:
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

In file included from arch/ia64/include/uapi/asm/gcc_intrin.h:11,
from arch/ia64/include/asm/gcc_intrin.h:10,
from arch/ia64/include/uapi/asm/intrinsics.h:20,
from arch/ia64/include/asm/intrinsics.h:11,
from arch/ia64/include/asm/page.h:11,
from arch/ia64/include/asm/pgtable.h:18,
from include/linux/pgtable.h:6,
from include/linux/pte_ref.h:10,
from mm/pte_ref.c:8:
mm/pte_ref.c: In function 'pte_try_get':
>> mm/pte_ref.c:37:22: error: implicit declaration of function 'is_huge_pmd'; did you mean 'zap_huge_pmd'? [-Werror=implicit-function-declaration]
37 | if (unlikely(is_huge_pmd(*pmd)))
| ^~~~~~~~~~~
include/linux/compiler.h:78:45: note: in definition of macro 'unlikely'
78 | # define unlikely(x) __builtin_expect(!!(x), 0)
| ^
cc1: some warnings being treated as errors


vim +37 mm/pte_ref.c

e03404013f81d7 Qi Zheng 2021-11-10 20
e03404013f81d7 Qi Zheng 2021-11-10 21 /*
e03404013f81d7 Qi Zheng 2021-11-10 22 * pte_try_get - Try to increment refcount for the PTE page table.
e03404013f81d7 Qi Zheng 2021-11-10 23 * @pmd: a pointer to the pmd entry corresponding to the PTE page table.
e03404013f81d7 Qi Zheng 2021-11-10 24 *
e03404013f81d7 Qi Zheng 2021-11-10 25 * Return TRYGET_SUCCESSED if the increment succeeded. Otherwise return the TRYGET_FAILED_* value that describes why it failed.
e03404013f81d7 Qi Zheng 2021-11-10 26 *
e03404013f81d7 Qi Zheng 2021-11-10 27 * Before operating on the PTE page table, we need to hold a refcount
e03404013f81d7 Qi Zheng 2021-11-10 28 * to protect against the concurrent release of the PTE page table.
e03404013f81d7 Qi Zheng 2021-11-10 29 * But we will fail in the following cases:
e03404013f81d7 Qi Zheng 2021-11-10 30 * - The content mapped in @pmd is not a PTE page
e03404013f81d7 Qi Zheng 2021-11-10 31 * - The refcount of the PTE page table is zero, it will be freed
e03404013f81d7 Qi Zheng 2021-11-10 32 */
e03404013f81d7 Qi Zheng 2021-11-10 33 enum pte_tryget_type pte_try_get(pmd_t *pmd)
e03404013f81d7 Qi Zheng 2021-11-10 34 {
e03404013f81d7 Qi Zheng 2021-11-10 35 if (unlikely(pmd_none(*pmd)))
e03404013f81d7 Qi Zheng 2021-11-10 36 return TRYGET_FAILED_NONE;
e03404013f81d7 Qi Zheng 2021-11-10 @37 if (unlikely(is_huge_pmd(*pmd)))
e03404013f81d7 Qi Zheng 2021-11-10 38 return TRYGET_FAILED_HUGE_PMD;
e03404013f81d7 Qi Zheng 2021-11-10 39
e03404013f81d7 Qi Zheng 2021-11-10 40 return TRYGET_SUCCESSED;
e03404013f81d7 Qi Zheng 2021-11-10 41 }
e03404013f81d7 Qi Zheng 2021-11-10 42

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]


Attachments:
(No filename) (4.15 kB)
.config.gz (19.59 kB)
Download all attachments