This series is version 3 of the page table walker patchset.
In the previous discussion there was an objection to moving the pte handling
code into the ->pte_entry() callback, so in this version I've dropped all such
code. The patchset mainly fixes vma handling and applies the page walker to
two new users. Here is a brief overview:
patch 1: clean up
patch 2: fix bug-prone vma handling code
patch 3: add another page walker interface
patches 4-10: clean up each existing user
patch 11: apply the page walker to a new user, queue_pages_range()
patch 12: allow clear_refs_pte_range() to handle thp (from Kirill)
patch 13: apply the page walker to a new user, do_mincore()
Thanks,
Naoya Horiguchi
Tree: [email protected]:Naoya-Horiguchi/linux.git
Branch: v3.16-rc1/page_table_walker.ver3
---
Summary:
Kirill A. Shutemov (1):
mm: /proc/pid/clear_refs: avoid split_huge_page()
Naoya Horiguchi (12):
mm/pagewalk: remove pgd_entry() and pud_entry()
pagewalk: improve vma handling
pagewalk: add walk_page_vma()
smaps: remove mem_size_stats->vma and use walk_page_vma()
clear_refs: remove clear_refs_private->vma and introduce clear_refs_test_walk()
pagemap: use walk->vma instead of calling find_vma()
numa_maps: remove numa_maps->vma
numa_maps: fix typo in gather_hugetbl_stats
memcg: apply walk_page_vma()
arch/powerpc/mm/subpage-prot.c: use walk->vma and walk_page_vma()
mempolicy: apply page table walker on queue_pages_range()
mincore: apply page table walker on do_mincore()
arch/powerpc/mm/subpage-prot.c | 6 +-
fs/proc/task_mmu.c | 143 ++++++++++++++++----------
include/linux/mm.h | 22 ++--
mm/huge_memory.c | 20 ----
mm/memcontrol.c | 36 +++----
mm/mempolicy.c | 228 +++++++++++++++++------------------------
mm/mincore.c | 174 ++++++++++++-------------------
mm/pagewalk.c | 223 ++++++++++++++++++++++++----------------
8 files changed, 406 insertions(+), 446 deletions(-)
pagewalk.c can handle the vma by itself, so we don't have to pass the vma via
walk->private. And show_smap() walks pages on a per-vma basis, so using
walk_page_vma() is preferable.
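To illustrate the resulting pattern (an illustrative sketch with a hypothetical
callback name, not part of the patch itself), a callback now takes the vma
straight from mm_walk, so the private struct only carries the statistics:

static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;	/* statistics only */
	struct vm_area_struct *vma = walk->vma;		/* set by pagewalk.c */

	/* ... accumulate per-page statistics for vma into mss ... */
	return 0;
}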
Signed-off-by: Naoya Horiguchi <[email protected]>
---
fs/proc/task_mmu.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
index cfa63ee92c96..9b6c7d4fd3f4 100644
--- v3.16-rc1.orig/fs/proc/task_mmu.c
+++ v3.16-rc1/fs/proc/task_mmu.c
@@ -430,7 +430,6 @@ const struct file_operations proc_tid_maps_operations = {
#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
- struct vm_area_struct *vma;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -449,7 +448,7 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
unsigned long ptent_size, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
+ struct vm_area_struct *vma = walk->vma;
pgoff_t pgoff = linear_page_index(vma, addr);
struct page *page = NULL;
int mapcount;
@@ -501,7 +500,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
@@ -590,14 +589,13 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
.mm = vma->vm_mm,
+ .vma = vma,
.private = &mss,
};
memset(&mss, 0, sizeof mss);
- mss.vma = vma;
/* mmap_sem is held in m_start */
- if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
+ walk_page_vma(vma, &smaps_walk);
show_map_vma(m, vma, is_pid);
--
1.9.3
From: "Kirill A. Shutemov" <[email protected]>
Currently the pagewalker splits all THP pages on any clear_refs request. That
is not necessary: we can handle this at the PMD level.
One side effect is that soft dirty will potentially see more dirty memory,
since we will mark the whole THP page dirty at once.
Sanity checked with the CRIU test suite. More testing is required.
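Roughly, the pmd-level path added below has the following shape (a sketch only;
the actual hunk, which uses a goto, is in the diff):

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty_pmd(vma, addr, pmd);
		} else {
			/* clear accessed/referenced on the whole huge page */
			pmdp_test_and_clear_young(vma, addr, pmd);
			ClearPageReferenced(pmd_page(*pmd));
		}
		spin_unlock(ptl);
		return 0;
	}
	if (pmd_trans_unstable(pmd))
		return 0;
	/* otherwise fall through to the existing pte loop */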
ChangeLog:
- move code for thp to clear_refs_pte_range()
Signed-off-by: Kirill A. Shutemov <[email protected]>
Cc: Pavel Emelyanov <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Dave Hansen <[email protected]>
Signed-off-by: Naoya Horiguchi <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
fs/proc/task_mmu.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 44 insertions(+), 3 deletions(-)
diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
index ac50a829320a..2acbe144152c 100644
--- v3.16-rc1.orig/fs/proc/task_mmu.c
+++ v3.16-rc1/fs/proc/task_mmu.c
@@ -719,10 +719,10 @@ struct clear_refs_private {
enum clear_refs_types type;
};
+#ifdef CONFIG_MEM_SOFT_DIRTY
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
-#ifdef CONFIG_MEM_SOFT_DIRTY
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
@@ -741,9 +741,35 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
}
set_pte_at(vma->vm_mm, addr, pte, ptent);
-#endif
}
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd = *pmdp;
+
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+}
+
+#else
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+}
+
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+}
+#endif
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
@@ -753,7 +779,22 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
- split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty_pmd(vma, addr, pmd);
+ goto out;
+ }
+
+ page = pmd_page(*pmd);
+
+ /* Clear accessed and referenced bits. */
+ pmdp_test_and_clear_young(vma, addr, pmd);
+ ClearPageReferenced(page);
+out:
+ spin_unlock(ptl);
+ return 0;
+ }
+
if (pmd_trans_unstable(pmd))
return 0;
--
1.9.3
pagewalk.c can handle the vma by itself, so we don't have to pass the vma via
walk->private. And show_numa_map() walks pages on a per-vma basis, so using
walk_page_vma() is preferable.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
fs/proc/task_mmu.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
index 74f87794afab..b4459c006d50 100644
--- v3.16-rc1.orig/fs/proc/task_mmu.c
+++ v3.16-rc1/fs/proc/task_mmu.c
@@ -1247,7 +1247,6 @@ const struct file_operations proc_pagemap_operations = {
#ifdef CONFIG_NUMA
struct numa_maps {
- struct vm_area_struct *vma;
unsigned long pages;
unsigned long anon;
unsigned long active;
@@ -1316,18 +1315,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct numa_maps *md;
+ struct numa_maps *md = walk->private;
+ struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *orig_pte;
pte_t *pte;
- md = walk->private;
-
- if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
- page = can_gather_numa_stats(huge_pte, md->vma, addr);
+ page = can_gather_numa_stats(huge_pte, vma, addr);
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1339,7 +1337,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
return 0;
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
- struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
+ struct page *page = can_gather_numa_stats(*pte, vma, addr);
if (!page)
continue;
gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1398,12 +1396,11 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- md->vma = vma;
-
walk.hugetlb_entry = gather_hugetbl_stats;
walk.pmd_entry = gather_pte_stats;
walk.private = md;
walk.mm = mm;
+ walk.vma = vma;
pol = get_vma_policy(task, vma, vma->vm_start);
mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1434,7 +1431,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- walk_page_range(vma->vm_start, vma->vm_end, &walk);
+ /* mmap_sem is held by m_start */
+ walk_page_vma(vma, &walk);
if (!md->pages)
goto out;
--
1.9.3
The page table walker keeps the information about the current vma in mm_walk,
so we don't have to call find_vma() in each pagemap_hugetlb_range() call.
The NULL-vma check is omitted because we assume that we never run the
hugetlb_entry() callback on an address without a vma. And even if that
assumption were broken, the resulting null pointer dereference would be
detected, so we would still get enough information for debugging.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
fs/proc/task_mmu.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
index 3c42cd40ad36..74f87794afab 100644
--- v3.16-rc1.orig/fs/proc/task_mmu.c
+++ v3.16-rc1/fs/proc/task_mmu.c
@@ -1082,15 +1082,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = walk->vma;
int err = 0;
int flags2;
pagemap_entry_t pme;
- vma = find_vma(walk->mm, addr);
- WARN_ON_ONCE(!vma);
-
- if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+ if (vma->vm_flags & VM_SOFTDIRTY)
flags2 = __PM_SOFT_DIRTY;
else
flags2 = 0;
--
1.9.3
We don't have to use mm_walk->private to pass the vma to the callback function
because of mm_walk->vma. And walk_page_vma() is useful when we walk over a
single vma.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
arch/powerpc/mm/subpage-prot.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git v3.16-rc1.orig/arch/powerpc/mm/subpage-prot.c v3.16-rc1/arch/powerpc/mm/subpage-prot.c
index 6c0b1f5f8d2c..fa9fb5b4c66c 100644
--- v3.16-rc1.orig/arch/powerpc/mm/subpage-prot.c
+++ v3.16-rc1/arch/powerpc/mm/subpage-prot.c
@@ -134,7 +134,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
split_huge_page_pmd(vma, addr, pmd);
return 0;
}
@@ -163,9 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
if (vma->vm_start >= (addr + len))
break;
vma->vm_flags |= VM_NOHUGEPAGE;
- subpage_proto_walk.private = vma;
- walk_page_range(vma->vm_start, vma->vm_end,
- &subpage_proto_walk);
+ walk_page_vma(vma, &subpage_proto_walk);
vma = vma->vm_next;
}
}
--
1.9.3
Currently no user of the page table walker sets ->pgd_entry() or ->pud_entry(),
so checking for their existence in each loop iteration just wastes CPU cycles.
Let's remove them to reduce the overhead.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
include/linux/mm.h | 6 ------
mm/pagewalk.c | 9 ++-------
2 files changed, 2 insertions(+), 13 deletions(-)
diff --git v3.16-rc1.orig/include/linux/mm.h v3.16-rc1/include/linux/mm.h
index e03dd29145a0..c5cb6394e6cb 100644
--- v3.16-rc1.orig/include/linux/mm.h
+++ v3.16-rc1/include/linux/mm.h
@@ -1100,8 +1100,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
/**
* mm_walk - callbacks for walk_page_range
- * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
- * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
* @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
* this handler is required to be able to handle
* pmd_trans_huge() pmds. They may simply choose to
@@ -1115,10 +1113,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
* (see walk_page_range for more details)
*/
struct mm_walk {
- int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
- unsigned long next, struct mm_walk *walk);
- int (*pud_entry)(pud_t *pud, unsigned long addr,
- unsigned long next, struct mm_walk *walk);
int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk);
int (*pte_entry)(pte_t *pte, unsigned long addr,
diff --git v3.16-rc1.orig/mm/pagewalk.c v3.16-rc1/mm/pagewalk.c
index 2beeabf502c5..335690650b12 100644
--- v3.16-rc1.orig/mm/pagewalk.c
+++ v3.16-rc1/mm/pagewalk.c
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
break;
continue;
}
- if (walk->pud_entry)
- err = walk->pud_entry(pud, addr, next, walk);
- if (!err && (walk->pmd_entry || walk->pte_entry))
+ if (walk->pmd_entry || walk->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
@@ -234,10 +232,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd++;
continue;
}
- if (walk->pgd_entry)
- err = walk->pgd_entry(pgd, addr, next, walk);
- if (!err &&
- (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
+ if (walk->pmd_entry || walk->pte_entry)
err = walk_pud_range(pgd, addr, next, walk);
if (err)
break;
--
1.9.3
Just doing s/gather_hugetbl_stats/gather_hugetlb_stats/g; this makes the code
grep-friendly.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
fs/proc/task_mmu.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
index b4459c006d50..ac50a829320a 100644
--- v3.16-rc1.orig/fs/proc/task_mmu.c
+++ v3.16-rc1/fs/proc/task_mmu.c
@@ -1347,7 +1347,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
struct numa_maps *md;
@@ -1366,7 +1366,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
}
#else
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
return 0;
@@ -1396,7 +1396,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- walk.hugetlb_entry = gather_hugetbl_stats;
+ walk.hugetlb_entry = gather_hugetlb_stats;
walk.pmd_entry = gather_pte_stats;
walk.private = md;
walk.mm = mm;
--
1.9.3
Introduce walk_page_vma(), which is useful for callers that want to walk over
a given vma. It is used by later patches.
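As a usage illustration (hypothetical caller; my_pmd_entry and my_data are
made-up names), a typical user holds mmap_sem and walks one vma at a time:

	struct mm_walk walk = {
		.pmd_entry = my_pmd_entry,	/* caller-defined callback */
		.mm = mm,
		.private = &my_data,
	};
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		walk_page_vma(vma, &walk);
	up_read(&mm->mmap_sem);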
ChangeLog:
- check walk_page_test's return value instead of walk->skip
Signed-off-by: Naoya Horiguchi <[email protected]>
---
include/linux/mm.h | 1 +
mm/pagewalk.c | 18 ++++++++++++++++++
2 files changed, 19 insertions(+)
diff --git v3.16-rc1.orig/include/linux/mm.h v3.16-rc1/include/linux/mm.h
index 489a63a06a4a..7e9287750866 100644
--- v3.16-rc1.orig/include/linux/mm.h
+++ v3.16-rc1/include/linux/mm.h
@@ -1137,6 +1137,7 @@ struct mm_walk {
int walk_page_range(unsigned long addr, unsigned long end,
struct mm_walk *walk);
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
diff --git v3.16-rc1.orig/mm/pagewalk.c v3.16-rc1/mm/pagewalk.c
index 86d811202374..16865e5029f6 100644
--- v3.16-rc1.orig/mm/pagewalk.c
+++ v3.16-rc1/mm/pagewalk.c
@@ -271,3 +271,21 @@ int walk_page_range(unsigned long start, unsigned long end,
} while (start = next, start < end);
return err;
}
+
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+{
+ int err;
+
+ if (!walk->mm)
+ return -EINVAL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ VM_BUG_ON(!vma);
+ walk->vma = vma;
+ err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+ if (err > 0)
+ return 0;
+ if (err < 0)
+ return err;
+ return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+}
--
1.9.3
This patch makes do_mincore() use walk_page_vma(), which removes many lines
of code by sharing the common page table walk code.
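Condensed from the diff below, the new do_mincore() just composes three
callbacks over one mm_walk and lets walk_page_vma() drive them:

	struct mm_walk mincore_walk = {
		.pmd_entry = mincore_pte_range,		/* mapped ptes and thp */
		.pte_hole = mincore_unmapped_range,	/* unmapped ranges */
		.hugetlb_entry = mincore_hugetlb,	/* hugetlb vmas */
		.private = vec,				/* output vector cursor */
	};

	mincore_walk.mm = vma->vm_mm;
	err = walk_page_vma(vma, &mincore_walk);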
ChangeLog v3:
- add NULL vma check in mincore_unmapped_range()
- don't use pte_entry()
ChangeLog v2:
- change type of args of callbacks to void *
- move definition of mincore_walk to the start of the function to fix compiler
warning
Signed-off-by: Naoya Horiguchi <[email protected]>
---
mm/huge_memory.c | 20 -------
mm/mincore.c | 174 ++++++++++++++++++++-----------------------------------
2 files changed, 63 insertions(+), 131 deletions(-)
diff --git v3.16-rc1.orig/mm/huge_memory.c v3.16-rc1/mm/huge_memory.c
index c5ff461e0253..53f451969931 100644
--- v3.16-rc1.orig/mm/huge_memory.c
+++ v3.16-rc1/mm/huge_memory.c
@@ -1379,26 +1379,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
}
-int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- spinlock_t *ptl;
- int ret = 0;
-
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- /*
- * All logical pages in the range are present
- * if backed by a huge page.
- */
- spin_unlock(ptl);
- memset(vec, 1, (end - addr) >> PAGE_SHIFT);
- ret = 1;
- }
-
- return ret;
-}
-
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
diff --git v3.16-rc1.orig/mm/mincore.c v3.16-rc1/mm/mincore.c
index 725c80961048..964bafed6320 100644
--- v3.16-rc1.orig/mm/mincore.c
+++ v3.16-rc1/mm/mincore.c
@@ -19,38 +19,26 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
+ int err = 0;
#ifdef CONFIG_HUGETLB_PAGE
- struct hstate *h;
+ unsigned char present;
+ unsigned char *vec = walk->private;
- h = hstate_vma(vma);
- while (1) {
- unsigned char present;
- pte_t *ptep;
- /*
- * Huge pages are always in RAM for now, but
- * theoretically it needs to be checked.
- */
- ptep = huge_pte_offset(current->mm,
- addr & huge_page_mask(h));
- present = ptep && !huge_pte_none(huge_ptep_get(ptep));
- while (1) {
- *vec = present;
- vec++;
- addr += PAGE_SIZE;
- if (addr == end)
- return;
- /* check hugepage border */
- if (!(addr & ~huge_page_mask(h)))
- break;
- }
- }
+ /*
+ * Hugepages under user process are always in RAM and never
+ * swapped out, but theoretically it needs to be checked.
+ */
+ present = pte && !huge_pte_none(huge_ptep_get(pte));
+ for (; addr != end; vec++, addr += PAGE_SIZE)
+ *vec = present;
+ walk->private = vec;
#else
BUG();
#endif
+ return err;
}
/*
@@ -94,14 +82,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
return present;
}
-static void mincore_unmapped_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
+ struct vm_area_struct *vma = walk->vma;
+ unsigned char *vec = walk->private;
unsigned long nr = (end - addr) >> PAGE_SHIFT;
int i;
- if (vma->vm_file) {
+ if (vma && vma->vm_file) {
pgoff_t pgoff;
pgoff = linear_page_index(vma, addr);
@@ -111,25 +100,38 @@ static void mincore_unmapped_range(struct vm_area_struct *vma,
for (i = 0; i < nr; i++)
vec[i] = 0;
}
+ walk->private += nr;
+ return 0;
}
-static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
- unsigned long next;
spinlock_t *ptl;
+ struct vm_area_struct *vma = walk->vma;
pte_t *ptep;
- ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- do {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ memset(walk->private, 1, (end - addr) >> PAGE_SHIFT);
+ walk->private += (end - addr) >> PAGE_SHIFT;
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; ptep++, addr += PAGE_SIZE) {
pte_t pte = *ptep;
pgoff_t pgoff;
+ unsigned char *vec = walk->private;
- next = addr + PAGE_SIZE;
- if (pte_none(pte))
- mincore_unmapped_range(vma, addr, next, vec);
- else if (pte_present(pte))
+ if (pte_none(pte)) {
+ mincore_unmapped_range(addr, addr + PAGE_SIZE, walk);
+ continue;
+ }
+ if (pte_present(pte))
*vec = 1;
else if (pte_file(pte)) {
pgoff = pte_to_pgoff(pte);
@@ -151,70 +153,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
#endif
}
}
- vec++;
- } while (ptep++, addr = next, addr != end);
+ walk->private++;
+ }
pte_unmap_unlock(ptep - 1, ptl);
-}
-
-static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pmd_t *pmd;
-
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd)) {
- if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
- vec += (next - addr) >> PAGE_SHIFT;
- continue;
- }
- /* fall through */
- }
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pte_range(vma, pmd, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pmd++, addr = next, addr != end);
-}
-
-static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pud_t *pud;
-
- pud = pud_offset(pgd, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pmd_range(vma, pud, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pud++, addr = next, addr != end);
-}
-
-static void mincore_page_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pgd_t *pgd;
-
- pgd = pgd_offset(vma->vm_mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pud_range(vma, pgd, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pgd++, addr = next, addr != end);
+ cond_resched();
+ return 0;
}
/*
@@ -225,20 +168,29 @@ static void mincore_page_range(struct vm_area_struct *vma,
static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
{
struct vm_area_struct *vma;
- unsigned long end;
+ int err;
+ struct mm_walk mincore_walk = {
+ .pmd_entry = mincore_pte_range,
+ .pte_hole = mincore_unmapped_range,
+ .hugetlb_entry = mincore_hugetlb,
+ .private = vec,
+ };
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
+ mincore_walk.mm = vma->vm_mm;
+ mincore_walk.vma = vma;
- end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
-
- if (is_vm_hugetlb_page(vma))
- mincore_hugetlb_page_range(vma, addr, end, vec);
- else
- mincore_page_range(vma, addr, end, vec);
+ err = walk_page_vma(vma, &mincore_walk);
+ if (err < 0)
+ return err;
+ else {
+ unsigned long end;
- return (end - addr) >> PAGE_SHIFT;
+ end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+ return (end - addr) >> PAGE_SHIFT;
+ }
}
/*
--
1.9.3
clear_refs_write() has some prechecks to determine whether we really walk over
a given vma. Now we have a test_walk() callback to filter vmas, so let's
use it.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
fs/proc/task_mmu.c | 55 +++++++++++++++++++++++++++---------------------------
1 file changed, 27 insertions(+), 28 deletions(-)
diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
index 9b6c7d4fd3f4..3c42cd40ad36 100644
--- v3.16-rc1.orig/fs/proc/task_mmu.c
+++ v3.16-rc1/fs/proc/task_mmu.c
@@ -716,7 +716,6 @@ enum clear_refs_types {
};
struct clear_refs_private {
- struct vm_area_struct *vma;
enum clear_refs_types type;
};
@@ -749,7 +748,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct clear_refs_private *cp = walk->private;
- struct vm_area_struct *vma = cp->vma;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -783,6 +782,29 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
+static int clear_refs_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ /*
+ * Writing 1 to /proc/pid/clear_refs affects all pages.
+ * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
+ * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
+ * Writing 4 to /proc/pid/clear_refs affects all pages.
+ */
+ if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
+ return 1;
+ if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
+ return 1;
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ }
+ return 0;
+}
+
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -823,38 +845,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
};
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
+ .test_walk = clear_refs_test_walk,
.mm = mm,
.private = &cp,
};
down_read(&mm->mmap_sem);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_start(mm, 0, -1);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- cp.vma = vma;
- if (is_vm_hugetlb_page(vma))
- continue;
- /*
- * Writing 1 to /proc/pid/clear_refs affects all pages.
- *
- * Writing 2 to /proc/pid/clear_refs only affects
- * Anonymous pages.
- *
- * Writing 3 to /proc/pid/clear_refs only affects file
- * mapped pages.
- *
- * Writing 4 to /proc/pid/clear_refs affects all pages.
- */
- if (type == CLEAR_REFS_ANON && vma->vm_file)
- continue;
- if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
- continue;
- if (type == CLEAR_REFS_SOFT_DIRTY) {
- if (vma->vm_flags & VM_SOFTDIRTY)
- vma->vm_flags &= ~VM_SOFTDIRTY;
- }
- walk_page_range(vma->vm_start, vma->vm_end,
- &clear_refs_walk);
- }
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ walk_page_vma(vma, &clear_refs_walk);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
--
1.9.3
pagewalk.c can handle the vma by itself, so we don't have to pass the vma via
walk->private. And both mem_cgroup_count_precharge() and
mem_cgroup_move_charge() walk over all vmas (they are not interested in
addresses outside any vma), so using walk_page_vma() is preferable.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
mm/memcontrol.c | 36 ++++++++++++++----------------------
1 file changed, 14 insertions(+), 22 deletions(-)
diff --git v3.16-rc1.orig/mm/memcontrol.c v3.16-rc1/mm/memcontrol.c
index a2c7bcb0e6eb..24c2cc8ff451 100644
--- v3.16-rc1.orig/mm/memcontrol.c
+++ v3.16-rc1/mm/memcontrol.c
@@ -6654,7 +6654,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
@@ -6682,18 +6682,13 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_count_precharge_walk = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_count_precharge_walk);
- }
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ walk_page_vma(vma, &mem_cgroup_count_precharge_walk);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
@@ -6832,7 +6827,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
struct mm_walk *walk)
{
int ret = 0;
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
enum mc_target_type target_type;
@@ -6933,6 +6928,10 @@ put: /* get_mctgt_type() gets the page */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_move_charge_walk = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+ .mm = mm,
+ };
lru_add_drain_all();
retry:
@@ -6950,15 +6949,8 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- ret = walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_move_charge_walk);
+
+ ret = walk_page_vma(vma, &mem_cgroup_move_charge_walk);
if (ret)
/*
* means we have consumed all precharges and failed in
--
1.9.3
queue_pages_range() currently does its page table walking in its own way, and
there is some code duplication. This patch applies the page table walker to
reduce the lines of code.
queue_pages_range() has to do some prechecks to determine whether we really
walk over a vma or just skip it. Now we have the test_walk() callback in
mm_walk for this purpose, so we can do this replacement cleanly.
queue_pages_test_walk() depends not only on the current vma but also on the
previous one, so queue_pages->prev is introduced to remember it.
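Inside queue_pages_test_walk(), that bookkeeping boils down to the following
(a simplified sketch; the complete callback is in the diff below):

	struct queue_pages *qp = walk->private;
	struct vm_area_struct *vma = walk->vma;

	/* a hole between the previous vma and this one is an error
	 * unless MPOL_MF_DISCONTIG_OK was requested */
	if (!(qp->flags & MPOL_MF_DISCONTIG_OK) &&
	    qp->prev && qp->prev->vm_end < vma->vm_start)
		return -EFAULT;
	qp->prev = vma;		/* remember for the next invocation */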
Signed-off-by: Naoya Horiguchi <[email protected]>
---
mm/mempolicy.c | 228 +++++++++++++++++++++++----------------------------------
1 file changed, 93 insertions(+), 135 deletions(-)
diff --git v3.16-rc1.orig/mm/mempolicy.c v3.16-rc1/mm/mempolicy.c
index 284974230459..17467d476c63 100644
--- v3.16-rc1.orig/mm/mempolicy.c
+++ v3.16-rc1/mm/mempolicy.c
@@ -479,24 +479,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
+struct queue_pages {
+ struct list_head *pagelist;
+ unsigned long flags;
+ nodemask_t *nmask;
+ struct vm_area_struct *prev;
+};
+
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*/
-static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
- pte_t *orig_pte;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
+ int nid;
pte_t *pte;
spinlock_t *ptl;
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- do {
- struct page *page;
- int nid;
+ split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_trans_unstable(pmd))
+ return 0;
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
@@ -509,114 +519,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (PageReserved(page))
continue;
nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, private, flags);
- else
- break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(orig_pte, ptl);
- return addr != end;
+ migrate_page_add(page, qp->pagelist, flags);
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ return 0;
}
-static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
- pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
int nid;
struct page *page;
spinlock_t *ptl;
pte_t entry;
- ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
- entry = huge_ptep_get((pte_t *)pmd);
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
goto unlock;
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
- isolate_huge_page(page, private);
+ isolate_huge_page(page, qp->pagelist);
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (!pmd_present(*pmd))
- continue;
- if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
- queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
- flags, private);
- continue;
- }
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- continue;
- if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pmd++, addr = next, addr != end);
- return 0;
-}
-
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_offset(pgd, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
- continue;
- if (pud_none_or_clear_bad(pud))
- continue;
- if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pud++, addr = next, addr != end);
- return 0;
-}
-
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pgd_t *pgd;
- unsigned long next;
-
- pgd = pgd_offset(vma->vm_mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
- if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pgd++, addr = next, addr != end);
return 0;
}
@@ -649,6 +591,44 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
}
#endif /* CONFIG_NUMA_BALANCING */
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct queue_pages *qp = walk->private;
+ unsigned long endvma = vma->vm_end;
+ unsigned long flags = qp->flags;
+
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
+ if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+ if (!vma->vm_next && vma->vm_end < end)
+ return -EFAULT;
+ if (qp->prev && qp->prev->vm_end < vma->vm_start)
+ return -EFAULT;
+ }
+
+ qp->prev = vma;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (flags & MPOL_MF_LAZY) {
+ change_prot_numa(vma, start, endvma);
+ return 1;
+ }
+
+ if ((flags & MPOL_MF_STRICT) ||
+ ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+ vma_migratable(vma)))
+ /* queue pages from current vma */
+ return 0;
+ return 1;
+}
+
/*
* Walk through page tables and collect pages to be migrated.
*
@@ -658,51 +638,29 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
*/
static struct vm_area_struct *
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- const nodemask_t *nodes, unsigned long flags, void *private)
+ nodemask_t *nodes, unsigned long flags,
+ struct list_head *pagelist)
{
int err;
- struct vm_area_struct *first, *vma, *prev;
-
-
- first = find_vma(mm, start);
- if (!first)
- return ERR_PTR(-EFAULT);
- prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
- unsigned long endvma = vma->vm_end;
-
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
-
- if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
- if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
- }
-
- if (flags & MPOL_MF_LAZY) {
- change_prot_numa(vma, start, endvma);
- goto next;
- }
-
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma))) {
-
- err = queue_pages_pgd_range(vma, start, endvma, nodes,
- flags, private);
- if (err) {
- first = ERR_PTR(err);
- break;
- }
- }
-next:
- prev = vma;
- }
- return first;
+ struct queue_pages qp = {
+ .pagelist = pagelist,
+ .flags = flags,
+ .nmask = nodes,
+ .prev = NULL,
+ };
+ struct mm_walk queue_pages_walk = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pmd_entry = queue_pages_pte_range,
+ .test_walk = queue_pages_test_walk,
+ .mm = mm,
+ .private = &qp,
+ };
+
+ err = walk_page_range(start, end, &queue_pages_walk);
+ if (err < 0)
+ return ERR_PTR(err);
+ else
+ return find_vma(mm, start);
}
/*
--
1.9.3
The current implementation of the page table walker has a fundamental problem
in vma handling, which started when we tried to handle vma(VM_HUGETLB).
Because it is done in the pgd loop, taking vma boundaries into account makes
the code complicated and bug-prone.
From the user's viewpoint, a user checks some vma-related conditions to
determine whether it really does the page walk over the vma.
In order to solve these problems, this patch moves the vma check outside the
pgd loop and introduces a new callback, ->test_walk().
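For example, a hypothetical caller that wants to skip hugetlb vmas (as several
existing users do by hand today) could supply a callback like this:

static int example_test_walk(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	/* Return 1 to skip this vma and continue with the next one,
	 * 0 to walk over this vma, or a negative value to abort the
	 * whole walk and propagate that error to the caller. */
	if (is_vm_hugetlb_page(walk->vma))
		return 1;
	return 0;
}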
ChangeLog v3:
- drop walk->skip control
Signed-off-by: Naoya Horiguchi <[email protected]>
---
include/linux/mm.h | 15 +++-
mm/pagewalk.c | 198 ++++++++++++++++++++++++++++++-----------------------
2 files changed, 126 insertions(+), 87 deletions(-)
diff --git v3.16-rc1.orig/include/linux/mm.h v3.16-rc1/include/linux/mm.h
index c5cb6394e6cb..489a63a06a4a 100644
--- v3.16-rc1.orig/include/linux/mm.h
+++ v3.16-rc1/include/linux/mm.h
@@ -1107,10 +1107,16 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
* @pte_entry: if set, called for each non-empty PTE (4th-level) entry
* @pte_hole: if set, called for each hole at all levels
* @hugetlb_entry: if set, called for each hugetlb entry
- * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry
- * is used.
+ * @test_walk: caller specific callback function to determine whether
+ * we walk over the current vma or not. Returning 0 means
+ * "do page table walk over the current vma," returning 1 means
+ * "skip the current vma," and a negative value means "abort
+ * the current page table walk right now."
+ * @mm: mm_struct representing the target process of page table walk
+ * @vma: vma currently walked (NULL if walking outside vmas)
+ * @private: private data for callbacks' usage
*
- * (see walk_page_range for more details)
+ * (see the comment on walk_page_range() for more details)
*/
struct mm_walk {
int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
@@ -1122,7 +1128,10 @@ struct mm_walk {
int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk);
+ int (*test_walk)(unsigned long addr, unsigned long next,
+ struct mm_walk *walk);
struct mm_struct *mm;
+ struct vm_area_struct *vma;
void *private;
};
diff --git v3.16-rc1.orig/mm/pagewalk.c v3.16-rc1/mm/pagewalk.c
index 335690650b12..86d811202374 100644
--- v3.16-rc1.orig/mm/pagewalk.c
+++ v3.16-rc1/mm/pagewalk.c
@@ -59,7 +59,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
continue;
split_huge_page_pmd_mm(walk->mm, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
@@ -95,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
return err;
}
+static int walk_pgd_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ int err = 0;
+
+ pgd = pgd_offset(walk->mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+ if (err)
+ break;
+ continue;
+ }
+ if (walk->pmd_entry || walk->pte_entry)
+ err = walk_pud_range(pgd, addr, next, walk);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ return err;
+}
+
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
unsigned long end)
@@ -103,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
return boundary < end ? boundary : end;
}
-static int walk_hugetlb_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+ struct vm_area_struct *vma = walk->vma;
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
@@ -119,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
if (pte && walk->hugetlb_entry)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
if (err)
- return err;
+ break;
} while (addr = next, addr != end);
return 0;
}
#else /* CONFIG_HUGETLB_PAGE */
-static int walk_hugetlb_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
return 0;
@@ -135,109 +160,114 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
#endif /* CONFIG_HUGETLB_PAGE */
+/*
+ * Decide whether we really walk over the current vma on [@start, @end)
+ * or skip it via the returned value. Return 0 if we do walk over the
+ * current vma, and return 1 if we skip the vma. Negative values mean
+ * error, where we abort the current walk.
+ *
+ * Default check (only VM_PFNMAP check for now) is used when the caller
+ * doesn't define test_walk() callback.
+ */
+static int walk_page_test(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ if (walk->test_walk)
+ return walk->test_walk(start, end, walk);
+ /*
+ * Do not walk over vma(VM_PFNMAP), because we have no valid struct
+ * page backing a VM_PFNMAP range. See also commit a9ff785e4437.
+ */
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+ return 0;
+}
+
+static int __walk_page_range(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ int err = 0;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (vma && is_vm_hugetlb_page(vma)) {
+ if (walk->hugetlb_entry)
+ err = walk_hugetlb_range(start, end, walk);
+ } else
+ err = walk_pgd_range(start, end, walk);
+
+ return err;
+}
/**
- * walk_page_range - walk a memory map's page tables with a callback
- * @addr: starting address
- * @end: ending address
- * @walk: set of callbacks to invoke for each level of the tree
- *
- * Recursively walk the page table for the memory area in a VMA,
- * calling supplied callbacks. Callbacks are called in-order (first
- * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
- * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ * walk_page_range - walk page table with caller specific callbacks
*
- * Each callback receives an entry pointer and the start and end of the
- * associated range, and a copy of the original mm_walk for access to
- * the ->private or ->mm fields.
+ * Recursively walk the page table tree of the process represented by @walk->mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific works for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined like below:
+ * - 0 : succeeded to handle the current entry, and if you don't reach the
+ * end address yet, continue to walk.
+ * - >0 : succeeded to handle the current entry, and return to the caller
+ * with caller specific value.
+ * - <0 : failed to handle the current entry, and return to the caller
+ * with error code.
*
- * Usually no locks are taken, but splitting transparent huge page may
- * take page table lock. And the bottom level iterator will map PTE
- * directories from highmem if necessary.
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * purpose.
*
- * If any callback returns a non-zero value, the walk is aborted and
- * the return value is propagated back to the caller. Otherwise 0 is returned.
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for the access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @walk->private should be helpful.
*
- * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
- * is !NULL.
+ * Locking:
+ * Callers of walk_page_range() and walk_page_vma() should hold
+ * @walk->mm->mmap_sem, because these function traverse vma list and/or
+ * access to vma's data.
*/
-int walk_page_range(unsigned long addr, unsigned long end,
+int walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- pgd_t *pgd;
- unsigned long next;
int err = 0;
+ unsigned long next;
- if (addr >= end)
- return err;
+ if (start >= end)
+ return -EINVAL;
if (!walk->mm)
return -EINVAL;
VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
- pgd = pgd_offset(walk->mm, addr);
do {
- struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *vma;
- next = pgd_addr_end(addr, end);
+ vma = find_vma(walk->mm, start);
+ if (!vma) { /* after the last vma */
+ walk->vma = NULL;
+ next = end;
+ } else if (start < vma->vm_start) { /* outside the found vma */
+ walk->vma = NULL;
+ next = vma->vm_start;
+ } else { /* inside the found vma */
+ walk->vma = vma;
+ next = min(end, vma->vm_end);
- /*
- * This function was not intended to be vma based.
- * But there are vma special cases to be handled:
- * - hugetlb vma's
- * - VM_PFNMAP vma's
- */
- vma = find_vma(walk->mm, addr);
- if (vma) {
- /*
- * There are no page structures backing a VM_PFNMAP
- * range, so do not allow split_huge_page_pmd().
- */
- if ((vma->vm_start <= addr) &&
- (vma->vm_flags & VM_PFNMAP)) {
- next = vma->vm_end;
- pgd = pgd_offset(walk->mm, next);
- continue;
- }
- /*
- * Handle hugetlb vma individually because pagetable
- * walk for the hugetlb page is dependent on the
- * architecture and we can't handled it in the same
- * manner as non-huge pages.
- */
- if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
- is_vm_hugetlb_page(vma)) {
- if (vma->vm_end < next)
- next = vma->vm_end;
- /*
- * Hugepage is very tightly coupled with vma,
- * so walk through hugetlb entries within a
- * given vma.
- */
- err = walk_hugetlb_range(vma, addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
+ err = walk_page_test(start, next, walk);
+ if (err > 0)
continue;
- }
- }
-
- if (pgd_none_or_clear_bad(pgd)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
- if (err)
+ if (err < 0)
break;
- pgd++;
- continue;
}
- if (walk->pmd_entry || walk->pte_entry)
- err = walk_pud_range(pgd, addr, next, walk);
+ err = __walk_page_range(start, next, walk);
if (err)
break;
- pgd++;
- } while (addr = next, addr < end);
-
+ } while (start = next, start < end);
return err;
}
--
1.9.3
On 06/20/2014 10:11 PM, Naoya Horiguchi wrote:
> pagewalk.c can handle the vma by itself, so we don't have to pass the vma via
> walk->private. And show_smap() walks pages on a per-vma basis, so using
> walk_page_vma() is preferable.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
> ---
> fs/proc/task_mmu.c | 10 ++++------
> 1 file changed, 4 insertions(+), 6 deletions(-)
>
> diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
> index cfa63ee92c96..9b6c7d4fd3f4 100644
> --- v3.16-rc1.orig/fs/proc/task_mmu.c
> +++ v3.16-rc1/fs/proc/task_mmu.c
> @@ -430,7 +430,6 @@ const struct file_operations proc_tid_maps_operations = {
>
> #ifdef CONFIG_PROC_PAGE_MONITOR
> struct mem_size_stats {
> - struct vm_area_struct *vma;
> unsigned long resident;
> unsigned long shared_clean;
> unsigned long shared_dirty;
> @@ -449,7 +448,7 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
> unsigned long ptent_size, struct mm_walk *walk)
> {
> struct mem_size_stats *mss = walk->private;
> - struct vm_area_struct *vma = mss->vma;
> + struct vm_area_struct *vma = walk->vma;
> pgoff_t pgoff = linear_page_index(vma, addr);
> struct page *page = NULL;
> int mapcount;
> @@ -501,7 +500,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> struct mm_walk *walk)
> {
> struct mem_size_stats *mss = walk->private;
> - struct vm_area_struct *vma = mss->vma;
> + struct vm_area_struct *vma = walk->vma;
> pte_t *pte;
> spinlock_t *ptl;
>
> @@ -590,14 +589,13 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
> struct mm_walk smaps_walk = {
> .pmd_entry = smaps_pte_range,
> .mm = vma->vm_mm,
> + .vma = vma,
Seems redundant: walk_page_vma() sets walk.vma anyway and so does
walk_page_range(). Is there any case when the caller should set .vma itself?
Jerome
> .private = &mss,
> };
>
> memset(&mss, 0, sizeof mss);
> - mss.vma = vma;
> /* mmap_sem is held in m_start */
> - if (vma->vm_mm && !is_vm_hugetlb_page(vma))
> - walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
> + walk_page_vma(vma, &smaps_walk);
>
> show_map_vma(m, vma, is_pid);
>
>
Hi Jerome,
On Thu, Jun 26, 2014 at 03:35:36PM +0200, Jerome Marchand wrote:
> On 06/20/2014 10:11 PM, Naoya Horiguchi wrote:
> > pagewalk.c can handle the vma by itself, so we don't have to pass the vma via
> > walk->private. And show_smap() walks pages on a per-vma basis, so using
> > walk_page_vma() is preferable.
> >
> > Signed-off-by: Naoya Horiguchi <[email protected]>
> > ---
> > fs/proc/task_mmu.c | 10 ++++------
> > 1 file changed, 4 insertions(+), 6 deletions(-)
> >
> > diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
> > index cfa63ee92c96..9b6c7d4fd3f4 100644
> > --- v3.16-rc1.orig/fs/proc/task_mmu.c
> > +++ v3.16-rc1/fs/proc/task_mmu.c
> > @@ -430,7 +430,6 @@ const struct file_operations proc_tid_maps_operations = {
> >
> > #ifdef CONFIG_PROC_PAGE_MONITOR
> > struct mem_size_stats {
> > - struct vm_area_struct *vma;
> > unsigned long resident;
> > unsigned long shared_clean;
> > unsigned long shared_dirty;
> > @@ -449,7 +448,7 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
> > unsigned long ptent_size, struct mm_walk *walk)
> > {
> > struct mem_size_stats *mss = walk->private;
> > - struct vm_area_struct *vma = mss->vma;
> > + struct vm_area_struct *vma = walk->vma;
> > pgoff_t pgoff = linear_page_index(vma, addr);
> > struct page *page = NULL;
> > int mapcount;
> > @@ -501,7 +500,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> > struct mm_walk *walk)
> > {
> > struct mem_size_stats *mss = walk->private;
> > - struct vm_area_struct *vma = mss->vma;
> > + struct vm_area_struct *vma = walk->vma;
> > pte_t *pte;
> > spinlock_t *ptl;
> >
> > @@ -590,14 +589,13 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
> > struct mm_walk smaps_walk = {
> > .pmd_entry = smaps_pte_range,
> > .mm = vma->vm_mm,
> > + .vma = vma,
>
> Seems redundant: walk_page_vma() sets walk.vma anyway and so does
> walk_page_range(). Is there any case when the caller should set .vma itself?
Correct, there is no need to set it on the caller side, thank you.
Naoya Horiguchi
On Fri, Jun 20, 2014 at 04:11:27PM -0400, Naoya Horiguchi wrote:
> Currently no user of the page table walker sets ->pgd_entry() or ->pud_entry(),
> so checking for their existence in each loop iteration just wastes CPU cycles.
> Let's remove them to reduce the overhead.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Fri, Jun 20, 2014 at 04:11:28PM -0400, Naoya Horiguchi wrote:
> The current implementation of the page table walker has a fundamental problem
> in vma handling, which started when we tried to handle vma(VM_HUGETLB).
> Because it is done in the pgd loop, taking vma boundaries into account makes
> the code complicated and bug-prone.
>
> From the user's viewpoint, a user checks some vma-related conditions to
> determine whether it really does the page walk over the vma.
>
> In order to solve these problems, this patch moves the vma check outside the
> pgd loop and introduces a new callback, ->test_walk().
>
> ChangeLog v3:
> - drop walk->skip control
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
> ---
> include/linux/mm.h | 15 +++-
> mm/pagewalk.c | 198 ++++++++++++++++++++++++++++++-----------------------
> 2 files changed, 126 insertions(+), 87 deletions(-)
>
> diff --git v3.16-rc1.orig/include/linux/mm.h v3.16-rc1/include/linux/mm.h
> index c5cb6394e6cb..489a63a06a4a 100644
> --- v3.16-rc1.orig/include/linux/mm.h
> +++ v3.16-rc1/include/linux/mm.h
> @@ -1107,10 +1107,16 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
> * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
> * @pte_hole: if set, called for each hole at all levels
> * @hugetlb_entry: if set, called for each hugetlb entry
> - * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry
> - * is used.
> + * @test_walk: caller specific callback function to determine whether
> + * we walk over the current vma or not. Returning 0 means
> + * "do page table walk over the current vma," returning 1 means
> + * "skip the current vma," and a negative value means "abort
> + * the current page table walk right now."
> + * @mm: mm_struct representing the target process of page table walk
> + * @vma: vma currently walked (NULL if walking outside vmas)
> + * @private: private data for callbacks' usage
> *
> - * (see walk_page_range for more details)
> + * (see the comment on walk_page_range() for more details)
> */
> struct mm_walk {
> int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
> @@ -1122,7 +1128,10 @@ struct mm_walk {
> int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
> unsigned long addr, unsigned long next,
> struct mm_walk *walk);
> + int (*test_walk)(unsigned long addr, unsigned long next,
> + struct mm_walk *walk);
> struct mm_struct *mm;
> + struct vm_area_struct *vma;
> void *private;
> };
>
> diff --git v3.16-rc1.orig/mm/pagewalk.c v3.16-rc1/mm/pagewalk.c
> index 335690650b12..86d811202374 100644
> --- v3.16-rc1.orig/mm/pagewalk.c
> +++ v3.16-rc1/mm/pagewalk.c
> @@ -59,7 +59,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
> continue;
>
> split_huge_page_pmd_mm(walk->mm, addr, pmd);
> - if (pmd_none_or_trans_huge_or_clear_bad(pmd))
> + if (pmd_trans_unstable(pmd))
> goto again;
> err = walk_pte_range(pmd, addr, next, walk);
> if (err)
> @@ -95,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
> return err;
> }
>
> +static int walk_pgd_range(unsigned long addr, unsigned long end,
> + struct mm_walk *walk)
> +{
> + pgd_t *pgd;
> + unsigned long next;
> + int err = 0;
> +
> + pgd = pgd_offset(walk->mm, addr);
> + do {
> + next = pgd_addr_end(addr, end);
> + if (pgd_none_or_clear_bad(pgd)) {
> + if (walk->pte_hole)
> + err = walk->pte_hole(addr, next, walk);
> + if (err)
> + break;
> + continue;
> + }
> + if (walk->pmd_entry || walk->pte_entry)
> + err = walk_pud_range(pgd, addr, next, walk);
> + if (err)
> + break;
> + } while (pgd++, addr = next, addr != end);
> +
> + return err;
> +}
> +
> #ifdef CONFIG_HUGETLB_PAGE
> static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
> unsigned long end)
> @@ -103,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
> return boundary < end ? boundary : end;
> }
>
> -static int walk_hugetlb_range(struct vm_area_struct *vma,
> - unsigned long addr, unsigned long end,
> +static int walk_hugetlb_range(unsigned long addr, unsigned long end,
> struct mm_walk *walk)
> {
> + struct vm_area_struct *vma = walk->vma;
> struct hstate *h = hstate_vma(vma);
> unsigned long next;
> unsigned long hmask = huge_page_mask(h);
> @@ -119,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
> if (pte && walk->hugetlb_entry)
> err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
> if (err)
> - return err;
> + break;
> } while (addr = next, addr != end);
>
> return 0;
I guess it should be 'return err;', right?
> }
>
> #else /* CONFIG_HUGETLB_PAGE */
> -static int walk_hugetlb_range(struct vm_area_struct *vma,
> - unsigned long addr, unsigned long end,
> +static int walk_hugetlb_range(unsigned long addr, unsigned long end,
> struct mm_walk *walk)
> {
> return 0;
> @@ -135,109 +160,114 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
>
> #endif /* CONFIG_HUGETLB_PAGE */
>
> +/*
> + * Decide whether we really walk over the current vma on [@start, @end)
> + * or skip it via the returned value. Return 0 if we do walk over the
> + * current vma, and return 1 if we skip the vma. A negative value means an
> + * error, in which case we abort the current walk.
> + *
> + * Default check (only VM_PFNMAP check for now) is used when the caller
> + * doesn't define test_walk() callback.
> + */
> +static int walk_page_test(unsigned long start, unsigned long end,
> + struct mm_walk *walk)
> +{
> + struct vm_area_struct *vma = walk->vma;
> +
> + if (walk->test_walk)
> + return walk->test_walk(start, end, walk);
>
> + /*
> + * Do not walk over vma(VM_PFNMAP), because we have no valid struct
> + * page backing a VM_PFNMAP range. See also commit a9ff785e4437.
> + */
> + if (vma->vm_flags & VM_PFNMAP)
> + return 1;
> + return 0;
> +}
> +
> +static int __walk_page_range(unsigned long start, unsigned long end,
> + struct mm_walk *walk)
> +{
> + int err = 0;
> + struct vm_area_struct *vma = walk->vma;
> +
> + if (vma && is_vm_hugetlb_page(vma)) {
> + if (walk->hugetlb_entry)
> + err = walk_hugetlb_range(start, end, walk);
> + } else
> + err = walk_pgd_range(start, end, walk);
> +
> + return err;
> +}
>
> /**
> - * walk_page_range - walk a memory map's page tables with a callback
> - * @addr: starting address
> - * @end: ending address
> - * @walk: set of callbacks to invoke for each level of the tree
> - *
> - * Recursively walk the page table for the memory area in a VMA,
> - * calling supplied callbacks. Callbacks are called in-order (first
> - * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
> - * etc.). If lower-level callbacks are omitted, walking depth is reduced.
> + * walk_page_range - walk page table with caller specific callbacks
> *
> - * Each callback receives an entry pointer and the start and end of the
> - * associated range, and a copy of the original mm_walk for access to
> - * the ->private or ->mm fields.
> + * Recursively walk the page table tree of the process represented by @walk->mm
> + * within the virtual address range [@start, @end). During walking, we can do
> + * some caller-specific work for each entry by setting up pmd_entry(),
> + * pte_entry(), and/or hugetlb_entry(). If some of these callbacks are not set,
> + * the associated entries/pages are just ignored.
> + * The return values of these callbacks are commonly defined like below:
> + * - 0  : succeeded to handle the current entry; if the end address has not
> + *        been reached yet, continue the walk.
> + * - >0 : succeeded to handle the current entry; return to the caller
> + *        with a caller-specific value.
> + * - <0 : failed to handle the current entry; return to the caller
> + *        with an error code.
> *
> - * Usually no locks are taken, but splitting transparent huge page may
> - * take page table lock. And the bottom level iterator will map PTE
> - * directories from highmem if necessary.
> + * Before starting to walk the page tables, some callers want to check whether
> + * they really want to walk over the current vma, typically by checking
> + * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
> + * purpose.
> *
> - * If any callback returns a non-zero value, the walk is aborted and
> - * the return value is propagated back to the caller. Otherwise 0 is returned.
> + * struct mm_walk keeps current values of some common data like vma and pmd,
> + * which are useful for callbacks to access. If you want to pass some
> + * caller-specific data to callbacks, @walk->private should be helpful.
> *
> - * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
> - * is !NULL.
> + * Locking:
> + * Callers of walk_page_range() and walk_page_vma() should hold
> + * @walk->mm->mmap_sem, because these functions traverse the vma list
> + * and/or access vma data.
> */
> -int walk_page_range(unsigned long addr, unsigned long end,
> +int walk_page_range(unsigned long start, unsigned long end,
> struct mm_walk *walk)
> {
> - pgd_t *pgd;
> - unsigned long next;
> int err = 0;
> + unsigned long next;
>
> - if (addr >= end)
> - return err;
> + if (start >= end)
> + return -EINVAL;
>
> if (!walk->mm)
> return -EINVAL;
>
> VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
>
> - pgd = pgd_offset(walk->mm, addr);
> do {
> - struct vm_area_struct *vma = NULL;
> + struct vm_area_struct *vma;
>
> - next = pgd_addr_end(addr, end);
> + vma = find_vma(walk->mm, start);
> + if (!vma) { /* after the last vma */
> + walk->vma = NULL;
> + next = end;
> + } else if (start < vma->vm_start) { /* outside the found vma */
> + walk->vma = NULL;
> + next = vma->vm_start;
Is there a reason why we should go for __walk_page_range() for these two
cases if walk->pte_hole() is not defined?
Otherwise, looks okay.
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
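To illustrate the ->test_walk() contract documented above (0 means walk the
current vma, 1 means skip it, and a negative value aborts the walk), a
caller's filter callback could look roughly like this; example_test_walk()
is a made-up name, not one of the callbacks added later in the series:

	static int example_test_walk(unsigned long start, unsigned long end,
				     struct mm_walk *walk)
	{
		struct vm_area_struct *vma = walk->vma;

		if (vma->vm_flags & VM_PFNMAP)
			return 1;	/* skip this vma, continue with the next one */
		if (!walk->private)
			return -EINVAL;	/* abort the whole walk with an error */
		return 0;		/* walk the page tables of this vma */
	}

The callback is hooked up via walk->test_walk and is consulted for each vma
before the page tables of that vma are walked.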
On Fri, Jun 20, 2014 at 04:11:29PM -0400, Naoya Horiguchi wrote:
> Introduces walk_page_vma(), which is useful for the callers which want to
> walk over a given vma. It's used by later patches.
>
> ChangeLog:
> - check walk_page_test's return value instead of walk->skip
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
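For reference, a typical user of the new interface would look something like
this (my_pmd_entry() and walk_one_vma() are hypothetical names, just to show
the calling convention):

	static int my_pmd_entry(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
	{
		/* walk->vma is set up by the core walker; no private plumbing needed */
		return 0;
	}

	static int walk_one_vma(struct vm_area_struct *vma, void *data)
	{
		struct mm_walk walk = {
			.pmd_entry = my_pmd_entry,
			.mm = vma->vm_mm,
			.private = data,
		};

		/* mmap_sem of vma->vm_mm must be held by the caller */
		return walk_page_vma(vma, &walk);
	}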
On Fri, Jun 20, 2014 at 04:11:30PM -0400, Naoya Horiguchi wrote:
> pagewalk.c can handle vma in itself, so we don't have to pass vma via
> walk->private. And show_smap() walks pages on vma basis, so using
> walk_page_vma() is preferable.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Fri, Jun 20, 2014 at 04:11:31PM -0400, Naoya Horiguchi wrote:
> clear_refs_write() has some prechecks to determine if we really walk over
> a given vma. Now we have a test_walk() callback to filter vmas, so let's
> utilize it.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Fri, Jun 20, 2014 at 04:11:32PM -0400, Naoya Horiguchi wrote:
> The page table walker now keeps the current vma in mm_walk, so we don't
> have to call find_vma() in each pagemap_hugetlb_range() call.
>
> The NULL-vma check is omitted because we assume that we never run the
> hugetlb_entry() callback on an address that has no vma. And even if that
> assumption were broken, the resulting NULL pointer dereference would be
> detected, so we could still get enough information for debugging.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Fri, Jun 20, 2014 at 04:11:33PM -0400, Naoya Horiguchi wrote:
> pagewalk.c can handle vma in itself, so we don't have to pass vma via
> walk->private. And show_numa_map() walks pages on vma basis, so using
> walk_page_vma() is preferable.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
> ---
> fs/proc/task_mmu.c | 18 ++++++++----------
> 1 file changed, 8 insertions(+), 10 deletions(-)
>
> diff --git v3.16-rc1.orig/fs/proc/task_mmu.c v3.16-rc1/fs/proc/task_mmu.c
> index 74f87794afab..b4459c006d50 100644
> --- v3.16-rc1.orig/fs/proc/task_mmu.c
> +++ v3.16-rc1/fs/proc/task_mmu.c
> @@ -1247,7 +1247,6 @@ const struct file_operations proc_pagemap_operations = {
> #ifdef CONFIG_NUMA
>
> struct numa_maps {
> - struct vm_area_struct *vma;
> unsigned long pages;
> unsigned long anon;
> unsigned long active;
> @@ -1316,18 +1315,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
> static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
> unsigned long end, struct mm_walk *walk)
> {
> - struct numa_maps *md;
> + struct numa_maps *md = walk->private;
> + struct vm_area_struct *vma = walk->vma;
> spinlock_t *ptl;
> pte_t *orig_pte;
> pte_t *pte;
>
> - md = walk->private;
> -
> - if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
> + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
> pte_t huge_pte = *(pte_t *)pmd;
> struct page *page;
>
> - page = can_gather_numa_stats(huge_pte, md->vma, addr);
> + page = can_gather_numa_stats(huge_pte, vma, addr);
> if (page)
> gather_stats(page, md, pte_dirty(huge_pte),
> HPAGE_PMD_SIZE/PAGE_SIZE);
> @@ -1339,7 +1337,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
> return 0;
> orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> do {
> - struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
> + struct page *page = can_gather_numa_stats(*pte, vma, addr);
> if (!page)
> continue;
> gather_stats(page, md, pte_dirty(*pte), 1);
> @@ -1398,12 +1396,11 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
> /* Ensure we start with an empty set of numa_maps statistics. */
> memset(md, 0, sizeof(*md));
>
> - md->vma = vma;
> -
> walk.hugetlb_entry = gather_hugetbl_stats;
> walk.pmd_entry = gather_pte_stats;
> walk.private = md;
> walk.mm = mm;
> + walk.vma = vma;
Redundant: walk_page_vma() sets walk->vma itself.
And it's probably a good idea to move the walk initialization to its declaration.
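Something like this, perhaps (untested, and without the redundant .vma
assignment, since walk_page_vma() fills that in):

	struct mm_walk walk = {
		.hugetlb_entry = gather_hugetbl_stats,
		.pmd_entry = gather_pte_stats,
		.private = md,
		.mm = mm,
	};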
Otherwise:
Acked-by: Kirill A. Shutemov <[email protected]>
>
> pol = get_vma_policy(task, vma, vma->vm_start);
> mpol_to_str(buffer, sizeof(buffer), pol);
> @@ -1434,7 +1431,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
> if (is_vm_hugetlb_page(vma))
> seq_puts(m, " huge");
>
> - walk_page_range(vma->vm_start, vma->vm_end, &walk);
> + /* mmap_sem is held by m_start */
> + walk_page_vma(vma, &walk);
>
> if (!md->pages)
> goto out;
> --
> 1.9.3
>
--
Kirill A. Shutemov
On Fri, Jun 20, 2014 at 04:11:34PM -0400, Naoya Horiguchi wrote:
> Just doing s/gather_hugetbl_stats/gather_hugetlb_stats/g; this makes the
> code grep-friendly.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Fri, Jun 20, 2014 at 04:11:35PM -0400, Naoya Horiguchi wrote:
> pagewalk.c can handle vma in itself, so we don't have to pass vma via
> walk->private. And both mem_cgroup_count_precharge() and
> mem_cgroup_move_charge() walk over all vmas (they are not interested in
> ranges outside any vma), so using walk_page_vma() is preferable.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
My first thought was to suggest walk_page_range(0, -1, &walk) instead
since we walk over all vmas. But walk_page_range() uses find_vma() on each
iteration, which is expensive.
Is there a reason why we cannot use vma->vm_next in walk_page_range()?
--
Kirill A. Shutemov
On Fri, Jun 20, 2014 at 04:11:36PM -0400, Naoya Horiguchi wrote:
> We no longer have to use mm_walk->private to pass a vma to the callback
> functions now that we have mm_walk->vma. And walk_page_vma() is useful when
> we walk over a single vma.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Mon, Jun 30, 2014 at 02:53:11PM +0300, Kirill A. Shutemov wrote:
> On Fri, Jun 20, 2014 at 04:11:28PM -0400, Naoya Horiguchi wrote:
> [...]
> > + } else if (start < vma->vm_start) { /* outside the found vma */
> > + walk->vma = NULL;
> > + next = vma->vm_start;
>
> Is there a reason why we should go for __walk_page_range() for these two
> cases if walk->pte_hole() is not defined?
Oh, I see, we can omit it.
I'll do it.
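Something along these lines, I suppose (untested), so that a hole is not
descended into when no ->pte_hole() callback is registered:

		if (walk->vma || walk->pte_hole)
			err = __walk_page_range(start, next, walk);
		if (err)
			break;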
>
> Otherwise, looks okay.
>
> Acked-by: Kirill A. Shutemov <[email protected]>
Thank you for your review.
Naoya
On Mon, Jun 30, 2014 at 03:20:16PM +0300, Kirill A. Shutemov wrote:
> On Fri, Jun 20, 2014 at 04:11:35PM -0400, Naoya Horiguchi wrote:
> > pagewalk.c can handle vma in itself, so we don't have to pass vma via
> > walk->private. And both mem_cgroup_count_precharge() and
> > mem_cgroup_move_charge() walk over all vmas (they are not interested in
> > ranges outside any vma), so using walk_page_vma() is preferable.
> >
> > Signed-off-by: Naoya Horiguchi <[email protected]>
>
> My first thought was to suggest walk_page_range(0, -1, &walk) instead
> since we walk over all vmas. But walk_page_range() uses find_vma() on each
> iteration, which is expensive.
> Is there a reason why we cannot use vma->vm_next in walk_page_range()?
Right, we can use vma->vm_next. The old code used find_vma() because
addr could jump to the next pgd boundary, but that doesn't happen with
this patch, so using vma->vm_next is fine.
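Roughly like this, I think (an untested sketch that keeps a vma cursor;
the walk_page_test() and pte_hole handling is omitted for brevity):

	vma = find_vma(walk->mm, start);
	do {
		if (!vma) {			/* after the last vma */
			walk->vma = NULL;
			next = end;
		} else if (start < vma->vm_start) {	/* hole before the vma */
			walk->vma = NULL;
			next = min(end, vma->vm_start);
		} else {			/* range inside the vma */
			walk->vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;	/* advance without find_vma() */
		}
		err = __walk_page_range(start, next, walk);
		if (err)
			break;
	} while (start = next, start < end);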
Thanks,
Naoya Horiguchi