2019-08-28 14:22:16

by Christoph Hellwig

Subject: cleanup the walk_page_range interface v2

Hi all,

this series is based on a patch from Linus to split the callbacks
passed to walk_page_range and walk_page_vma into a separate structure
that can be marked const, with various cleanups from me on top.
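As a rough sketch of what the conversion looks like for a typical caller (the
my_* names below are made up for illustration, not taken from any file):

	/* before: callbacks, mm and private data mixed in one structure */
	struct mm_walk walk = {
		.pmd_entry = my_pmd_entry,	/* hypothetical callback */
		.mm = mm,
		.private = &my_data,
	};
	err = walk_page_range(start, end, &walk);

	/* after: the function pointers live in a const, file-scope table,
	 * and the mm and private pointer are passed to the walker directly */
	static const struct mm_walk_ops my_walk_ops = {
		.pmd_entry = my_pmd_entry,
	};

	err = walk_page_range(mm, start, end, &my_walk_ops, &my_data);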

This series is also available as a git tre here:

git://git.infradead.org/users/hch/misc.git pagewalk-cleanup

Gitweb:

http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/pagewalk-cleanup


Diffstat:

14 files changed, 291 insertions(+), 273 deletions(-)

Changes since v1:
- minor comment typo and checkpatch fixes
- fix a compile failure for !CONFIG_SHMEM
- rebased to the wip/jgg-hmm branch


2019-08-28 14:23:03

by Christoph Hellwig

Subject: [PATCH 2/3] pagewalk: separate function pointers from iterator data

The mm_walk structure currently mixes data and code. Split out the
operations vectors into a new mm_walk_ops structure, and while we
are changing the API also declare the mm_walk structure inside the
walk_page_range and walk_page_vma functions.

Based on a patch from Linus Torvalds.

Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Thomas Hellstrom <[email protected]>
Reviewed-by: Steven Price <[email protected]>
---
arch/openrisc/kernel/dma.c | 22 +++--
arch/powerpc/mm/book3s64/subpage_prot.c | 10 +-
arch/s390/mm/gmap.c | 33 +++----
fs/proc/task_mmu.c | 78 ++++++++-------
include/linux/pagewalk.h | 64 +++++++-----
mm/hmm.c | 23 +++--
mm/madvise.c | 41 +++-----
mm/memcontrol.c | 23 +++--
mm/mempolicy.c | 15 ++-
mm/migrate.c | 23 +++--
mm/mincore.c | 15 ++-
mm/mprotect.c | 24 ++---
mm/pagewalk.c | 124 ++++++++++++++----------
13 files changed, 251 insertions(+), 244 deletions(-)

diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index c7812e6effa2..4d5b8bd1d795 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -44,6 +44,10 @@ page_set_nocache(pte_t *pte, unsigned long addr,
return 0;
}

+static const struct mm_walk_ops set_nocache_walk_ops = {
+ .pte_entry = page_set_nocache,
+};
+
static int
page_clear_nocache(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
@@ -59,6 +63,10 @@ page_clear_nocache(pte_t *pte, unsigned long addr,
return 0;
}

+static const struct mm_walk_ops clear_nocache_walk_ops = {
+ .pte_entry = page_clear_nocache,
+};
+
/*
* Alloc "coherent" memory, which for OpenRISC means simply uncached.
*
@@ -81,10 +89,6 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
{
unsigned long va;
void *page;
- struct mm_walk walk = {
- .pte_entry = page_set_nocache,
- .mm = &init_mm
- };

page = alloc_pages_exact(size, gfp | __GFP_ZERO);
if (!page)
@@ -99,7 +103,8 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
* We need to iterate through the pages, clearing the dcache for
* them and setting the cache-inhibit bit.
*/
- if (walk_page_range(va, va + size, &walk)) {
+ if (walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops,
+ NULL)) {
free_pages_exact(page, size);
return NULL;
}
@@ -112,13 +117,10 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs)
{
unsigned long va = (unsigned long)vaddr;
- struct mm_walk walk = {
- .pte_entry = page_clear_nocache,
- .mm = &init_mm
- };

/* walk_page_range shouldn't be able to fail here */
- WARN_ON(walk_page_range(va, va + size, &walk));
+ WARN_ON(walk_page_range(&init_mm, va, va + size,
+ &clear_nocache_walk_ops, NULL));

free_pages_exact(vaddr, size);
}
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 236f0a861ecc..2ef24a53f4c9 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
return 0;
}

+static const struct mm_walk_ops subpage_walk_ops = {
+ .pmd_entry = subpage_walk_pmd_entry,
+};
+
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
- struct mm_walk subpage_proto_walk = {
- .mm = mm,
- .pmd_entry = subpage_walk_pmd_entry,
- };

/*
* We don't try too hard, we just mark all the vma in that range
@@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
if (vma->vm_start >= (addr + len))
break;
vma->vm_flags |= VM_NOHUGEPAGE;
- walk_page_vma(vma, &subpage_proto_walk);
+ walk_page_vma(vma, &subpage_walk_ops, NULL);
vma = vma->vm_next;
}
}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index cf80feae970d..bd78d504fdad 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2521,13 +2521,9 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
return 0;
}

-static inline void zap_zero_pages(struct mm_struct *mm)
-{
- struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
-
- walk.mm = mm;
- walk_page_range(0, TASK_SIZE, &walk);
-}
+static const struct mm_walk_ops zap_zero_walk_ops = {
+ .pmd_entry = __zap_zero_pages,
+};

/*
* switch on pgstes for its userspace process (for kvm)
@@ -2546,7 +2542,7 @@ int s390_enable_sie(void)
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
- zap_zero_pages(mm);
+ walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
up_write(&mm->mmap_sem);
return 0;
}
@@ -2589,12 +2585,13 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
return 0;
}

+static const struct mm_walk_ops enable_skey_walk_ops = {
+ .hugetlb_entry = __s390_enable_skey_hugetlb,
+ .pte_entry = __s390_enable_skey_pte,
+};
+
int s390_enable_skey(void)
{
- struct mm_walk walk = {
- .hugetlb_entry = __s390_enable_skey_hugetlb,
- .pte_entry = __s390_enable_skey_pte,
- };
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int rc = 0;
@@ -2614,8 +2611,7 @@ int s390_enable_skey(void)
}
mm->def_flags &= ~VM_MERGEABLE;

- walk.mm = mm;
- walk_page_range(0, TASK_SIZE, &walk);
+ walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);

out_up:
up_write(&mm->mmap_sem);
@@ -2633,13 +2629,14 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
return 0;
}

+static const struct mm_walk_ops reset_cmma_walk_ops = {
+ .pte_entry = __s390_reset_cmma,
+};
+
void s390_reset_cmma(struct mm_struct *mm)
{
- struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
-
down_write(&mm->mmap_sem);
- walk.mm = mm;
- walk_page_range(0, TASK_SIZE, &walk);
+ walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8857da830b86..bf43d1d60059 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -513,7 +513,9 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,

return 0;
}
-#endif
+#else
+#define smaps_pte_hole NULL
+#endif /* CONFIG_SHMEM */

static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
@@ -729,21 +731,24 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
}
return 0;
}
+#else
+#define smaps_hugetlb_range NULL
#endif /* HUGETLB_PAGE */

+static const struct mm_walk_ops smaps_walk_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+};
+
+static const struct mm_walk_ops smaps_shmem_walk_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .pte_hole = smaps_pte_hole,
+};
+
static void smap_gather_stats(struct vm_area_struct *vma,
struct mem_size_stats *mss)
{
- struct mm_walk smaps_walk = {
- .pmd_entry = smaps_pte_range,
-#ifdef CONFIG_HUGETLB_PAGE
- .hugetlb_entry = smaps_hugetlb_range,
-#endif
- .mm = vma->vm_mm,
- };
-
- smaps_walk.private = mss;
-
#ifdef CONFIG_SHMEM
/* In case of smaps_rollup, reset the value from previous vma */
mss->check_shmem_swap = false;
@@ -765,12 +770,13 @@ static void smap_gather_stats(struct vm_area_struct *vma,
mss->swap += shmem_swapped;
} else {
mss->check_shmem_swap = true;
- smaps_walk.pte_hole = smaps_pte_hole;
+ walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
+ return;
}
}
#endif
/* mmap_sem is held in m_start */
- walk_page_vma(vma, &smaps_walk);
+ walk_page_vma(vma, &smaps_walk_ops, mss);
}

#define SEQ_PUT_DEC(str, val) \
@@ -1118,6 +1124,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
return 0;
}

+static const struct mm_walk_ops clear_refs_walk_ops = {
+ .pmd_entry = clear_refs_pte_range,
+ .test_walk = clear_refs_test_walk,
+};
+
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -1151,12 +1162,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
struct clear_refs_private cp = {
.type = type,
};
- struct mm_walk clear_refs_walk = {
- .pmd_entry = clear_refs_pte_range,
- .test_walk = clear_refs_test_walk,
- .mm = mm,
- .private = &cp,
- };

if (type == CLEAR_REFS_MM_HIWATER_RSS) {
if (down_write_killable(&mm->mmap_sem)) {
@@ -1217,7 +1222,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
- walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
+ walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
+ &cp);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, 0, -1);
@@ -1489,8 +1495,16 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,

return err;
}
+#else
+#define pagemap_hugetlb_range NULL
#endif /* HUGETLB_PAGE */

+static const struct mm_walk_ops pagemap_ops = {
+ .pmd_entry = pagemap_pmd_range,
+ .pte_hole = pagemap_pte_hole,
+ .hugetlb_entry = pagemap_hugetlb_range,
+};
+
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
*
@@ -1522,7 +1536,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
{
struct mm_struct *mm = file->private_data;
struct pagemapread pm;
- struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
@@ -1550,14 +1563,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!pm.buffer)
goto out_mm;

- pagemap_walk.pmd_entry = pagemap_pmd_range;
- pagemap_walk.pte_hole = pagemap_pte_hole;
-#ifdef CONFIG_HUGETLB_PAGE
- pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
-#endif
- pagemap_walk.mm = mm;
- pagemap_walk.private = &pm;
-
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
start_vaddr = svpfn << PAGE_SHIFT;
@@ -1586,7 +1591,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
ret = down_read_killable(&mm->mmap_sem);
if (ret)
goto out_free;
- ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+ ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
up_read(&mm->mmap_sem);
start_vaddr = end;

@@ -1798,6 +1803,11 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
}
#endif

+static const struct mm_walk_ops show_numa_ops = {
+ .hugetlb_entry = gather_hugetlb_stats,
+ .pmd_entry = gather_pte_stats,
+};
+
/*
* Display pages allocated per node and memory policy via /proc.
*/
@@ -1809,12 +1819,6 @@ static int show_numa_map(struct seq_file *m, void *v)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mm_walk walk = {
- .hugetlb_entry = gather_hugetlb_stats,
- .pmd_entry = gather_pte_stats,
- .private = md,
- .mm = mm,
- };
struct mempolicy *pol;
char buffer[64];
int nid;
@@ -1848,7 +1852,7 @@ static int show_numa_map(struct seq_file *m, void *v)
seq_puts(m, " huge");

/* mmap_sem is held by m_start */
- walk_page_vma(vma, &walk);
+ walk_page_vma(vma, &show_numa_ops, md);

if (!md->pages)
goto out;
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index df278a94086d..bddd9759bab9 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -4,31 +4,28 @@

#include <linux/mm.h>

+struct mm_walk;
+
/**
- * mm_walk - callbacks for walk_page_range
- * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
- * this handler should only handle pud_trans_huge() puds.
- * the pmd_entry or pte_entry callbacks will be used for
- * regular PUDs.
- * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
- * this handler is required to be able to handle
- * pmd_trans_huge() pmds. They may simply choose to
- * split_huge_page() instead of handling it explicitly.
- * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
- * @pte_hole: if set, called for each hole at all levels
- * @hugetlb_entry: if set, called for each hugetlb entry
- * @test_walk: caller specific callback function to determine whether
- * we walk over the current vma or not. Returning 0
- * value means "do page table walk over the current vma,"
- * and a negative one means "abort current page table walk
- * right now." 1 means "skip the current vma."
- * @mm: mm_struct representing the target process of page table walk
- * @vma: vma currently walked (NULL if walking outside vmas)
- * @private: private data for callbacks' usage
- *
- * (see the comment on walk_page_range() for more details)
+ * mm_walk_ops - callbacks for walk_page_range
+ * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
+ * this handler should only handle pud_trans_huge() puds.
+ * the pmd_entry or pte_entry callbacks will be used for
+ * regular PUDs.
+ * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
+ * this handler is required to be able to handle
+ * pmd_trans_huge() pmds. They may simply choose to
+ * split_huge_page() instead of handling it explicitly.
+ * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
+ * @pte_hole: if set, called for each hole at all levels
+ * @hugetlb_entry: if set, called for each hugetlb entry
+ * @test_walk: caller specific callback function to determine whether
+ * we walk over the current vma or not. Returning 0 means
+ * "do page table walk over the current vma", returning
+ * a negative value means "abort current page table walk
+ * right now" and returning 1 means "skip the current vma"
*/
-struct mm_walk {
+struct mm_walk_ops {
int (*pud_entry)(pud_t *pud, unsigned long addr,
unsigned long next, struct mm_walk *walk);
int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
@@ -42,13 +39,28 @@ struct mm_walk {
struct mm_walk *walk);
int (*test_walk)(unsigned long addr, unsigned long next,
struct mm_walk *walk);
+};
+
+/**
+ * mm_walk - walk_page_range data
+ * @ops: operation to call during the walk
+ * @mm: mm_struct representing the target process of page table walk
+ * @vma: vma currently walked (NULL if walking outside vmas)
+ * @private: private data for callbacks' usage
+ *
+ * (see the comment on walk_page_range() for more details)
+ */
+struct mm_walk {
+ const struct mm_walk_ops *ops;
struct mm_struct *mm;
struct vm_area_struct *vma;
void *private;
};

-int walk_page_range(unsigned long addr, unsigned long end,
- struct mm_walk *walk);
-int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+ void *private);

#endif /* _LINUX_PAGEWALK_H */
diff --git a/mm/hmm.c b/mm/hmm.c
index 26916ff6c8df..902f5fa6bf93 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -852,6 +852,13 @@ void hmm_range_unregister(struct hmm_range *range)
}
EXPORT_SYMBOL(hmm_range_unregister);

+static const struct mm_walk_ops hmm_walk_ops = {
+ .pud_entry = hmm_vma_walk_pud,
+ .pmd_entry = hmm_vma_walk_pmd,
+ .pte_hole = hmm_vma_walk_hole,
+ .hugetlb_entry = hmm_vma_walk_hugetlb_entry,
+};
+
/**
* hmm_range_fault - try to fault some address in a virtual address range
* @range: range being faulted
@@ -887,7 +894,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
struct hmm_vma_walk hmm_vma_walk;
struct hmm *hmm = range->hmm;
struct vm_area_struct *vma;
- struct mm_walk mm_walk;
int ret;

lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
@@ -916,21 +922,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
hmm_vma_walk.last = start;
hmm_vma_walk.flags = flags;
hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
end = min(range->end, vma->vm_end);

- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pud_entry = hmm_vma_walk_pud;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
- mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+ walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
+ &hmm_vma_walk);

do {
- ret = walk_page_range(start, end, &mm_walk);
+ ret = walk_page_range(vma->vm_mm, start, end,
+ &hmm_walk_ops, &hmm_vma_walk);
start = hmm_vma_walk.last;

/* Keep trying while the range is valid. */
diff --git a/mm/madvise.c b/mm/madvise.c
index 80a78bb16782..afe2b015ea58 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -226,19 +226,9 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
return 0;
}

-static void force_swapin_readahead(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- struct mm_walk walk = {
- .mm = vma->vm_mm,
- .pmd_entry = swapin_walk_pmd_entry,
- .private = vma,
- };
-
- walk_page_range(start, end, &walk);
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
-}
+static const struct mm_walk_ops swapin_walk_ops = {
+ .pmd_entry = swapin_walk_pmd_entry,
+};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
@@ -280,7 +270,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
*prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- force_swapin_readahead(vma, start, end);
+ walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+ lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}

@@ -441,20 +432,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}

-static void madvise_free_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- struct mm_walk free_walk = {
- .pmd_entry = madvise_free_pte_range,
- .mm = vma->vm_mm,
- .private = tlb,
- };
-
- tlb_start_vma(tlb, vma);
- walk_page_range(addr, end, &free_walk);
- tlb_end_vma(tlb, vma);
-}
+static const struct mm_walk_ops madvise_free_walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+};

static int madvise_free_single_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
@@ -481,7 +461,10 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
update_hiwater_rss(mm);

mmu_notifier_invalidate_range_start(&range);
- madvise_free_page_range(&tlb, vma, range.start, range.end);
+ tlb_start_vma(&tlb, vma);
+ walk_page_range(vma->vm_mm, range.start, range.end,
+ &madvise_free_walk_ops, &tlb);
+ tlb_end_vma(&tlb, vma);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, range.start, range.end);

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4c3af5d71ab1..9b2516a76be2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5283,17 +5283,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
return 0;
}

+static const struct mm_walk_ops precharge_walk_ops = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+};
+
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;

- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- };
down_read(&mm->mmap_sem);
- walk_page_range(0, mm->highest_vm_end,
- &mem_cgroup_count_precharge_walk);
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
up_read(&mm->mmap_sem);

precharge = mc.precharge;
@@ -5562,13 +5561,12 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
return ret;
}

+static const struct mm_walk_ops charge_walk_ops = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+};
+
static void mem_cgroup_move_charge(void)
{
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mc.mm,
- };
-
lru_add_drain_all();
/*
* Signal lock_page_memcg() to take the memcg's move_lock
@@ -5594,7 +5592,8 @@ static void mem_cgroup_move_charge(void)
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+ NULL);

up_read(&mc.mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3a96def1e796..f000771558d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -655,6 +655,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
return 1;
}

+static const struct mm_walk_ops queue_pages_walk_ops = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pmd_entry = queue_pages_pte_range,
+ .test_walk = queue_pages_test_walk,
+};
+
/*
* Walk through page tables and collect pages to be migrated.
*
@@ -679,15 +685,8 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
.nmask = nodes,
.prev = NULL,
};
- struct mm_walk queue_pages_walk = {
- .hugetlb_entry = queue_pages_hugetlb,
- .pmd_entry = queue_pages_pte_range,
- .test_walk = queue_pages_test_walk,
- .mm = mm,
- .private = &qp,
- };

- return walk_page_range(start, end, &queue_pages_walk);
+ return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
}

/*
diff --git a/mm/migrate.c b/mm/migrate.c
index c9c73a35aca7..9f4ed4e985c1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2320,6 +2320,11 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
return 0;
}

+static const struct mm_walk_ops migrate_vma_walk_ops = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+};
+
/*
* migrate_vma_collect() - collect pages over a range of virtual addresses
* @migrate: migrate struct containing all migration information
@@ -2331,21 +2336,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
static void migrate_vma_collect(struct migrate_vma *migrate)
{
struct mmu_notifier_range range;
- struct mm_walk mm_walk = {
- .pmd_entry = migrate_vma_collect_pmd,
- .pte_hole = migrate_vma_collect_hole,
- .vma = migrate->vma,
- .mm = migrate->vma->vm_mm,
- .private = migrate,
- };

- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm,
- migrate->start,
- migrate->end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
+ migrate->vma->vm_mm, migrate->start, migrate->end);
mmu_notifier_invalidate_range_start(&range);
- walk_page_range(migrate->start, migrate->end, &mm_walk);
- mmu_notifier_invalidate_range_end(&range);

+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+ &migrate_vma_walk_ops, migrate);
+
+ mmu_notifier_invalidate_range_end(&range);
migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
}

diff --git a/mm/mincore.c b/mm/mincore.c
index 3b051b6ab3fe..f9a9dbe8cd33 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -193,6 +193,12 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}

+static const struct mm_walk_ops mincore_walk_ops = {
+ .pmd_entry = mincore_pte_range,
+ .pte_hole = mincore_unmapped_range,
+ .hugetlb_entry = mincore_hugetlb,
+};
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -203,12 +209,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
struct vm_area_struct *vma;
unsigned long end;
int err;
- struct mm_walk mincore_walk = {
- .pmd_entry = mincore_pte_range,
- .pte_hole = mincore_unmapped_range,
- .hugetlb_entry = mincore_hugetlb,
- .private = vec,
- };

vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
@@ -219,8 +219,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
memset(vec, 1, pages);
return pages;
}
- mincore_walk.mm = vma->vm_mm;
- err = walk_page_range(addr, end, &mincore_walk);
+ err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
if (err < 0)
return err;
return (end - addr) >> PAGE_SHIFT;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index cc73318dbc25..675e5d34a507 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -329,20 +329,11 @@ static int prot_none_test(unsigned long addr, unsigned long next,
return 0;
}

-static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, unsigned long newflags)
-{
- pgprot_t new_pgprot = vm_get_page_prot(newflags);
- struct mm_walk prot_none_walk = {
- .pte_entry = prot_none_pte_entry,
- .hugetlb_entry = prot_none_hugetlb_entry,
- .test_walk = prot_none_test,
- .mm = current->mm,
- .private = &new_pgprot,
- };
-
- return walk_page_range(start, end, &prot_none_walk);
-}
+static const struct mm_walk_ops prot_none_walk_ops = {
+ .pte_entry = prot_none_pte_entry,
+ .hugetlb_entry = prot_none_hugetlb_entry,
+ .test_walk = prot_none_test,
+};

int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -369,7 +360,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
if (arch_has_pfn_modify_check() &&
(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
(newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
- error = prot_none_walk(vma, start, end, newflags);
+ pgprot_t new_pgprot = vm_get_page_prot(newflags);
+
+ error = walk_page_range(current->mm, start, end,
+ &prot_none_walk_ops, &new_pgprot);
if (error)
return error;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8a92a961a2ee..b8762b673a3d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -9,10 +9,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
{
pte_t *pte;
int err = 0;
+ const struct mm_walk_ops *ops = walk->ops;

pte = pte_offset_map(pmd, addr);
for (;;) {
- err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+ err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
addr += PAGE_SIZE;
@@ -30,6 +31,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
{
pmd_t *pmd;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;

pmd = pmd_offset(pud, addr);
@@ -37,8 +39,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
again:
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || !walk->vma) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
@@ -47,8 +49,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
*/
- if (walk->pmd_entry)
- err = walk->pmd_entry(pmd, addr, next, walk);
+ if (ops->pmd_entry)
+ err = ops->pmd_entry(pmd, addr, next, walk);
if (err)
break;

@@ -56,7 +58,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
- if (!walk->pte_entry)
+ if (!ops->pte_entry)
continue;

split_huge_pmd(walk->vma, pmd, addr);
@@ -75,6 +77,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
{
pud_t *pud;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;

pud = pud_offset(p4d, addr);
@@ -82,18 +85,18 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
again:
next = pud_addr_end(addr, end);
if (pud_none(*pud) || !walk->vma) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}

- if (walk->pud_entry) {
+ if (ops->pud_entry) {
spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

if (ptl) {
- err = walk->pud_entry(pud, addr, next, walk);
+ err = ops->pud_entry(pud, addr, next, walk);
spin_unlock(ptl);
if (err)
break;
@@ -105,7 +108,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (pud_none(*pud))
goto again;

- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
@@ -119,19 +122,20 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
{
p4d_t *p4d;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;

p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -145,19 +149,20 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
{
pgd_t *pgd;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;

pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
@@ -183,6 +188,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
unsigned long hmask = huge_page_mask(h);
unsigned long sz = huge_page_size(h);
pte_t *pte;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;

do {
@@ -190,9 +196,9 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
pte = huge_pte_offset(walk->mm, addr & hmask, sz);

if (pte)
- err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
- else if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
+ else if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);

if (err)
break;
@@ -220,9 +226,10 @@ static int walk_page_test(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
+ const struct mm_walk_ops *ops = walk->ops;

- if (walk->test_walk)
- return walk->test_walk(start, end, walk);
+ if (ops->test_walk)
+ return ops->test_walk(start, end, walk);

/*
* vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
@@ -234,8 +241,8 @@ static int walk_page_test(unsigned long start, unsigned long end,
*/
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
- if (walk->pte_hole)
- err = walk->pte_hole(start, end, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, end, walk);
return err ? err : 1;
}
return 0;
@@ -248,7 +255,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
struct vm_area_struct *vma = walk->vma;

if (vma && is_vm_hugetlb_page(vma)) {
- if (walk->hugetlb_entry)
+ if (walk->ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
@@ -258,11 +265,13 @@ static int __walk_page_range(unsigned long start, unsigned long end,

/**
* walk_page_range - walk page table with caller specific callbacks
- * @start: start address of the virtual address range
- * @end: end address of the virtual address range
- * @walk: mm_walk structure defining the callbacks and the target address space
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
*
- * Recursively walk the page table tree of the process represented by @walk->mm
+ * Recursively walk the page table tree of the process represented by @mm
* within the virtual address range [@start, @end). During walking, we can do
* some caller-specific works for each entry, by setting up pmd_entry(),
* pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
@@ -278,47 +287,52 @@ static int __walk_page_range(unsigned long start, unsigned long end,
*
* Before starting to walk page table, some callers want to check whether
* they really want to walk over the current vma, typically by checking
- * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
* purpose.
*
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
- * caller-specific data to callbacks, @walk->private should be helpful.
+ * caller-specific data to callbacks, @private should be helpful.
*
* Locking:
- * Callers of walk_page_range() and walk_page_vma() should hold
- * @walk->mm->mmap_sem, because these function traverse vma list and/or
- * access to vma's data.
+ * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
+ * because these function traverse vma list and/or access to vma's data.
*/
-int walk_page_range(unsigned long start, unsigned long end,
- struct mm_walk *walk)
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
{
int err = 0;
unsigned long next;
struct vm_area_struct *vma;
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .private = private,
+ };

if (start >= end)
return -EINVAL;

- if (!walk->mm)
+ if (!walk.mm)
return -EINVAL;

- VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
+ VM_BUG_ON_MM(!rwsem_is_locked(&walk.mm->mmap_sem), walk.mm);

- vma = find_vma(walk->mm, start);
+ vma = find_vma(walk.mm, start);
do {
if (!vma) { /* after the last vma */
- walk->vma = NULL;
+ walk.vma = NULL;
next = end;
} else if (start < vma->vm_start) { /* outside vma */
- walk->vma = NULL;
+ walk.vma = NULL;
next = min(end, vma->vm_start);
} else { /* inside vma */
- walk->vma = vma;
+ walk.vma = vma;
next = min(end, vma->vm_end);
vma = vma->vm_next;

- err = walk_page_test(start, next, walk);
+ err = walk_page_test(start, next, &walk);
if (err > 0) {
/*
* positive return values are purely for
@@ -331,28 +345,34 @@ int walk_page_range(unsigned long start, unsigned long end,
if (err < 0)
break;
}
- if (walk->vma || walk->pte_hole)
- err = __walk_page_range(start, next, walk);
+ if (walk.vma || walk.ops->pte_hole)
+ err = __walk_page_range(start, next, &walk);
if (err)
break;
} while (start = next, start < end);
return err;
}

-int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+ void *private)
{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
int err;

- if (!walk->mm)
+ if (!walk.mm)
return -EINVAL;

- VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
- VM_BUG_ON(!vma);
- walk->vma = vma;
- err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+ VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+
+ err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
if (err > 0)
return 0;
if (err < 0)
return err;
- return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+ return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
--
2.20.1

2019-08-28 14:23:05

by Christoph Hellwig

Subject: [PATCH 3/3] pagewalk: use lockdep_assert_held for locking validation

Use lockdep to check for held locks instead of using home-grown
asserts.
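
As a sketch of what changes in practice for walk_page_range() (the comments
are explanatory, not taken from the patch):

	/* old: only checks that mmap_sem is held by someone, and only
	 * when CONFIG_DEBUG_VM is enabled */
	VM_BUG_ON_MM(!rwsem_is_locked(&walk.mm->mmap_sem), walk.mm);

	/* new: lockdep ties the assertion to the locks the current task
	 * actually holds, and compiles away without CONFIG_LOCKDEP */
	lockdep_assert_held(&walk.mm->mmap_sem);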

Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Thomas Hellstrom <[email protected]>
Reviewed-by: Steven Price <[email protected]>
---
mm/pagewalk.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index b8762b673a3d..d48c2a986ea3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -317,7 +317,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (!walk.mm)
return -EINVAL;

- VM_BUG_ON_MM(!rwsem_is_locked(&walk.mm->mmap_sem), walk.mm);
+ lockdep_assert_held(&walk.mm->mmap_sem);

vma = find_vma(walk.mm, start);
do {
@@ -367,7 +367,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
if (!walk.mm)
return -EINVAL;

- VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+ lockdep_assert_held(&walk.mm->mmap_sem);

err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
if (err > 0)
--
2.20.1

2019-08-28 15:07:02

by Jason Gunthorpe

Subject: Re: [PATCH 2/3] pagewalk: separate function pointers from iterator data

On Wed, Aug 28, 2019 at 04:19:54PM +0200, Christoph Hellwig wrote:
> @@ -2546,7 +2542,7 @@ int s390_enable_sie(void)
> mm->context.has_pgste = 1;
> /* split thp mappings and disable thp for future mappings */
> thp_split_mm(mm);
> - zap_zero_pages(mm);
> + walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
> up_write(&mm->mmap_sem);
> return 0;
> }

[..]

> @@ -1217,7 +1222,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> 0, NULL, mm, 0, -1UL);
> mmu_notifier_invalidate_range_start(&range);
> }
> - walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
> + walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
> + &cp);

Is the difference between TASK_SIZE and 'highest_vm_end' deliberate,
or should we add a 'walk_all_pages()' mini helper for this? I see most
of the users are using one or the other variant.

Otherwise the mechanical transformation looked OK to me

Reviewed-by: Jason Gunthorpe <[email protected]>

Jason

2019-08-28 15:09:04

by Jason Gunthorpe

Subject: Re: cleanup the walk_page_range interface v2

On Wed, Aug 28, 2019 at 04:19:52PM +0200, Christoph Hellwig wrote:
> Hi all,
>
> this series is based on a patch from Linus to split the callbacks
> passed to walk_page_range and walk_page_vma into a separate structure
> that can be marked const, with various cleanups from me on top.
>
> This series is also available as a git tree here:
>
> git://git.infradead.org/users/hch/misc.git pagewalk-cleanup
>
> Gitweb:
>
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/pagewalk-cleanup
>
>
> Diffstat:
>
> 14 files changed, 291 insertions(+), 273 deletions(-)
>
> Changes since v1:
> - minor comment typo and checkpatch fixes
> - fix a compile failure for !CONFIG_SHMEM
> - rebased to the wip/jgg-hmm branch

Applied to hmm.git, thanks

I will push it toward linux-next after 0-day completes

Jason

2019-08-29 07:01:18

by Christoph Hellwig

Subject: Re: [PATCH 2/3] pagewalk: separate function pointers from iterator data

On Wed, Aug 28, 2019 at 03:05:19PM +0000, Jason Gunthorpe wrote:
> > @@ -1217,7 +1222,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> > 0, NULL, mm, 0, -1UL);
> > mmu_notifier_invalidate_range_start(&range);
> > }
> > - walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
> > + walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
> > + &cp);
>
> Is the difference between TASK_SIZE and 'highest_vm_end' deliberate,
> or should we add a 'walk_all_pages()' mini helper for this? I see most
> of the users are using one or the other variant.

I have no idea to be honest. A walk_all_pages-like helper doesn't
seem like a bad idea, but the priority seems lower than cleaning up
all the callers using walk_page_range on a vma..
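
If it ever materializes, such a helper could be as small as this (purely
hypothetical sketch, not part of this series):

	static inline int walk_all_pages(struct mm_struct *mm,
			const struct mm_walk_ops *ops, void *private)
	{
		/* picks highest_vm_end; callers that really want TASK_SIZE
		 * would keep open-coding the range */
		return walk_page_range(mm, 0, mm->highest_vm_end, ops, private);
	}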

2019-09-01 18:50:09

by Guenter Roeck

Subject: Re: [PATCH 2/3] pagewalk: separate function pointers from iterator data

On Wed, Aug 28, 2019 at 04:19:54PM +0200, Christoph Hellwig wrote:
> The mm_walk structure currently mixes data and code. Split out the
> operations vectors into a new mm_walk_ops structure, and while we
> are changing the API also declare the mm_walk structure inside the
> walk_page_range and walk_page_vma functions.
>
> Based on a patch from Linus Torvalds.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Thomas Hellstrom <[email protected]>
> Reviewed-by: Steven Price <[email protected]>
> Reviewed-by: Jason Gunthorpe <[email protected]>

When building csky:defconfig:

In file included from mm/madvise.c:30:
mm/madvise.c: In function 'madvise_free_single_vma':
arch/csky/include/asm/tlb.h:11:11: error:
invalid type argument of '->' (have 'struct mmu_gather')

Guenter

---
# bad: [6d028043b55e54f48fbdf62ea8ce11a4ad830cac] Add linux-next specific files for 20190830
# good: [a55aa89aab90fae7c815b0551b07be37db359d76] Linux 5.3-rc6
git bisect start 'HEAD' 'v5.3-rc6'
# good: [199d454c0775386a645dd9e80b486c346816762f] Merge remote-tracking branch 'crypto/master'
git bisect good 199d454c0775386a645dd9e80b486c346816762f
# good: [450fd5809930dfee10dbbf351cdb2148cd022b1c] Merge remote-tracking branch 'regulator/for-next'
git bisect good 450fd5809930dfee10dbbf351cdb2148cd022b1c
# good: [12b85c8517393b5466dff225a338fc7416150df0] Merge remote-tracking branch 'tty/tty-next'
git bisect good 12b85c8517393b5466dff225a338fc7416150df0
# good: [ecda3e90e6357b15f3189ce00938ea5a20850b76] Merge remote-tracking branch 'scsi/for-next'
git bisect good ecda3e90e6357b15f3189ce00938ea5a20850b76
# good: [4cb65f973115d07578f5b8f4492da7d8295effe2] Merge remote-tracking branch 'rtc/rtc-next'
git bisect good 4cb65f973115d07578f5b8f4492da7d8295effe2
# good: [f3bf5fa4097e06b9cabb193599a012680380e52e] kernel/elfcore.c: include proper prototypes
git bisect good f3bf5fa4097e06b9cabb193599a012680380e52e
# bad: [e58b341134ca751d9c12bacded12a8b4dd51368d] Merge remote-tracking branch 'hmm/hmm'
git bisect bad e58b341134ca751d9c12bacded12a8b4dd51368d
# good: [47f725ee7b5f5cae1f83512961bcf8b41a7a5794] RDMA/odp: remove ib_ucontext from ib_umem
git bisect good 47f725ee7b5f5cae1f83512961bcf8b41a7a5794
# good: [f9016e8058fdcd4aadc9932e045891a7b3bc8c8f] Merge remote-tracking branch 'nvmem/for-next'
git bisect good f9016e8058fdcd4aadc9932e045891a7b3bc8c8f
# good: [2f6e2a06b51e0dd9767bf37c3542ee3b5e4611d4] Merge remote-tracking branch 'pidfd/for-next'
git bisect good 2f6e2a06b51e0dd9767bf37c3542ee3b5e4611d4
# good: [d2b219ed03d45a9799e4ba780c209edf9c510d3b] mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end
git bisect good d2b219ed03d45a9799e4ba780c209edf9c510d3b
# good: [4e10e8c36663a011f77d39c937aaa473fad90de3] mm: split out a new pagewalk.h header from mm.h
git bisect good 4e10e8c36663a011f77d39c937aaa473fad90de3
# bad: [5b8f3df6239c3a9b625ab4bdc69c54d4768a4f06] pagewalk: use lockdep_assert_held for locking validation
git bisect bad 5b8f3df6239c3a9b625ab4bdc69c54d4768a4f06
# bad: [923bfc561e7535f7dc2be136da75690582268cf2] pagewalk: separate function pointers from iterator data
git bisect bad 923bfc561e7535f7dc2be136da75690582268cf2
# first bad commit: [923bfc561e7535f7dc2be136da75690582268cf2] pagewalk: separate function pointers from iterator data

2019-09-01 19:39:03

by Jason Gunthorpe

Subject: Re: [PATCH 2/3] pagewalk: separate function pointers from iterator data

On Sun, Sep 01, 2019 at 11:45:30AM -0700, Guenter Roeck wrote:
> On Wed, Aug 28, 2019 at 04:19:54PM +0200, Christoph Hellwig wrote:
> > The mm_walk structure currently mixes data and code. Split out the
> > operations vectors into a new mm_walk_ops structure, and while we
> > are changing the API also declare the mm_walk structure inside the
> > walk_page_range and walk_page_vma functions.
> >
> > Based on a patch from Linus Torvalds.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
> > Reviewed-by: Thomas Hellstrom <[email protected]>
> > Reviewed-by: Steven Price <[email protected]>
> > Reviewed-by: Jason Gunthorpe <[email protected]>
>
> When building csky:defconfig:
>
> In file included from mm/madvise.c:30:
> mm/madvise.c: In function 'madvise_free_single_vma':
> arch/csky/include/asm/tlb.h:11:11: error:
> invalid type argument of '->' (have 'struct mmu_gather')

I believe the macros above are missing brackets.. Can you confirm the
below takes care of things? I'll add a patch if so

diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h
index 8c7cc097666f04..fdff9b8d70c811 100644
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -8,14 +8,14 @@

#define tlb_start_vma(tlb, vma) \
do { \
- if (!tlb->fullmm) \
- flush_cache_range(vma, vma->vm_start, vma->vm_end); \
+ if (!(tlb)->fullmm) \
+ flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
} while (0)

#define tlb_end_vma(tlb, vma) \
do { \
- if (!tlb->fullmm) \
- flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
+ if (!(tlb)->fullmm) \
+ flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
} while (0)

#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)

Thanks,
Jason

2019-09-01 20:38:40

by Guenter Roeck

Subject: Re: [PATCH 2/3] pagewalk: separate function pointers from iterator data

On 9/1/19 12:36 PM, Jason Gunthorpe wrote:
> On Sun, Sep 01, 2019 at 11:45:30AM -0700, Guenter Roeck wrote:
>> On Wed, Aug 28, 2019 at 04:19:54PM +0200, Christoph Hellwig wrote:
>>> The mm_walk structure currently mixes data and code. Split out the
>>> operations vectors into a new mm_walk_ops structure, and while we
>>> are changing the API also declare the mm_walk structure inside the
>>> walk_page_range and walk_page_vma functions.
>>>
>>> Based on a patch from Linus Torvalds.
>>>
>>> Signed-off-by: Christoph Hellwig <[email protected]>
>>> Reviewed-by: Thomas Hellstrom <[email protected]>
>>> Reviewed-by: Steven Price <[email protected]>
>>> Reviewed-by: Jason Gunthorpe <[email protected]>
>>
>> When building csky:defconfig:
>>
>> In file included from mm/madvise.c:30:
>> mm/madvise.c: In function 'madvise_free_single_vma':
>> arch/csky/include/asm/tlb.h:11:11: error:
>> invalid type argument of '->' (have 'struct mmu_gather')
>
> I believe the macros above are missing brackets.. Can you confirm the
> below takes care of things? I'll add a patch if so
>

Good catch. Yes, that fixes the build problem.

Guenter

> diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h
> index 8c7cc097666f04..fdff9b8d70c811 100644
> --- a/arch/csky/include/asm/tlb.h
> +++ b/arch/csky/include/asm/tlb.h
> @@ -8,14 +8,14 @@
>
> #define tlb_start_vma(tlb, vma) \
> do { \
> - if (!tlb->fullmm) \
> - flush_cache_range(vma, vma->vm_start, vma->vm_end); \
> + if (!(tlb)->fullmm) \
> + flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
> } while (0)
>
> #define tlb_end_vma(tlb, vma) \
> do { \
> - if (!tlb->fullmm) \
> - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
> + if (!(tlb)->fullmm) \
> + flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
> } while (0)
>
> #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
>
> Thanks,
> Jason
>

2019-09-02 05:59:10

by Jason Gunthorpe

Subject: Re: [PATCH 2/3] pagewalk: separate function pointers from iterator data

On Sun, Sep 01, 2019 at 01:35:16PM -0700, Guenter Roeck wrote:
> > I believe the macros above are missing brackets.. Can you confirm the
> > below takes care of things? I'll add a patch if so
> >
>
> Good catch. Yes, that fixes the build problem.

I added this to the hmm tree to fix it:

From 6a7e550e0f1c1eeab75e0e2c7ffe5e9e9ae649ba Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <[email protected]>
Date: Mon, 2 Sep 2019 02:47:05 -0300
Subject: [PATCH] csky: add missing brackets in a macro for tlb.h

As an earlier patch made the macro argument more complicated, compilation
now fails with:

In file included from mm/madvise.c:30:
mm/madvise.c: In function 'madvise_free_single_vma':
arch/csky/include/asm/tlb.h:11:11: error:
invalid type argument of '->' (have 'struct mmu_gather')

Link: https://lore.kernel.org/r/[email protected]
Fixes: 923bfc561e75 ("pagewalk: separate function pointers from iterator data")
Reported-by: Guenter Roeck <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
---
arch/csky/include/asm/tlb.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h
index 8c7cc097666f04..fdff9b8d70c811 100644
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -8,14 +8,14 @@

#define tlb_start_vma(tlb, vma) \
do { \
- if (!tlb->fullmm) \
- flush_cache_range(vma, vma->vm_start, vma->vm_end); \
+ if (!(tlb)->fullmm) \
+ flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
} while (0)

#define tlb_end_vma(tlb, vma) \
do { \
- if (!tlb->fullmm) \
- flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
+ if (!(tlb)->fullmm) \
+ flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
} while (0)

#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
--
2.23.0

2019-09-02 08:00:14

by Christoph Hellwig

Subject: Re: [PATCH 2/3] pagewalk: separate function pointers from iterator data

On Mon, Sep 02, 2019 at 05:51:58AM +0000, Jason Gunthorpe wrote:
> On Sun, Sep 01, 2019 at 01:35:16PM -0700, Guenter Roeck wrote:
> > > I believe the macros above are missing brackets.. Can you confirm the
> > > below takes care of things? I'll add a patch if so
> > >
> >
> > Good catch. Yes, that fixes the build problem.
>
> I added this to the hmm tree to fix it:

This looks good. Although I still haven't figured out how this is
related to the pagewalk changes to start with..

2019-09-02 10:52:46

by Jason Gunthorpe

Subject: Re: [PATCH 2/3] pagewalk: separate function pointers from iterator data

On Mon, Sep 02, 2019 at 09:58:59AM +0200, Christoph Hellwig wrote:
> On Mon, Sep 02, 2019 at 05:51:58AM +0000, Jason Gunthorpe wrote:
> > On Sun, Sep 01, 2019 at 01:35:16PM -0700, Guenter Roeck wrote:
> > > > I believe the macros above are missing brackets.. Can you confirm the
> > > > below takes care of things? I'll add a patch if so
> > > >
> > >
> > > Good catch. Yes, that fixes the build problem.
> >
> > I added this to the hmm tree to fix it:
>
> This looks good. Although I still haven't figured out how this is
> related to the pagewalk changes to start with..

It is this hunk:

@@ -481,7 +461,10 @@ static int madvise_free_single_vma(struct
vm_area_struct *vma,
update_hiwater_rss(mm);

mmu_notifier_invalidate_range_start(&range);
- madvise_free_page_range(&tlb, vma, range.start, range.end);
+ tlb_start_vma(&tlb, vma);
+ walk_page_range(vma->vm_mm, range.start, range.end,
+ &madvise_free_walk_ops, &tlb);
+ tlb_end_vma(&tlb, vma);

&tlb does not expand properly in the csky tlb_start_vma macro; previously
the macro argument was just the plain pointer tlb.
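
Spelled out, the unparenthesized macro body breaks as soon as the argument
is an expression like &tlb (illustrative expansion, not from any patch):

	/* tlb_start_vma(&tlb, vma) with the old "if (!tlb->fullmm)" body:
	 * '->' binds tighter than unary '&', so this applies ->fullmm to a
	 * plain struct mmu_gather -- exactly the reported error */
	if (!&tlb->fullmm)
		flush_cache_range(vma, vma->vm_start, vma->vm_end);

	/* with the added parentheses the expansion is well-formed */
	if (!(&tlb)->fullmm)
		flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end);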

Jason