2022-05-04 01:03:30

by Liam R. Howlett

Subject: [PATCH 0/1] Prepare for maple tree

mips uses the name mt_init internally. Rename it out of the way of the
maple tree, which introduces its own mt_init().

Liam R. Howlett (1):
mips: rename mt_init to mips_mt_init

arch/mips/kernel/mips-mt.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

--
2.35.1


2022-05-04 15:57:40

by Liam R. Howlett

Subject: [PATCH v9 15/69] damon: Convert __damon_va_three_regions to use the VMA iterator

From: "Liam R. Howlett" <[email protected]>

This rather specialised walk can use the VMA iterator. If this proves to
be too slow, we can write a custom routine to find the two largest gaps,
but it will be somewhat complicated, so let's see if we need it first.

Update the kunit test case to use the maple tree. This also fixes an
issue with the kunit test case not adding the last VMA to the list.

Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: Liam R. Howlett <[email protected]>
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Reviewed-by: SeongJae Park <[email protected]>
---
 mm/damon/vaddr-test.h | 37 +++++++++++-------------------
 mm/damon/vaddr.c      | 53 ++++++++++++++++++++++---------------------
2 files changed, 40 insertions(+), 50 deletions(-)

diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index 5431da4fe9d4..dbf2b8759607 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -13,34 +13,21 @@
#define _DAMON_VADDR_TEST_H

#include <kunit/test.h>
+#include "../../mm/internal.h"

-static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
+static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas,
+ ssize_t nr_vmas)
{
- int i, j;
- unsigned long largest_gap, gap;
+ int i;
+ MA_STATE(mas, mt, 0, 0);

if (!nr_vmas)
return;

- for (i = 0; i < nr_vmas - 1; i++) {
- vmas[i].vm_next = &vmas[i + 1];
-
- vmas[i].vm_rb.rb_left = NULL;
- vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb;
-
- largest_gap = 0;
- for (j = i; j < nr_vmas; j++) {
- if (j == 0)
- continue;
- gap = vmas[j].vm_start - vmas[j - 1].vm_end;
- if (gap > largest_gap)
- largest_gap = gap;
- }
- vmas[i].rb_subtree_gap = largest_gap;
- }
- vmas[i].vm_next = NULL;
- vmas[i].vm_rb.rb_right = NULL;
- vmas[i].rb_subtree_gap = 0;
+ mas_lock(&mas);
+ for (i = 0; i < nr_vmas; i++)
+ vma_mas_store(&vmas[i], &mas);
+ mas_unlock(&mas);
}

/*
@@ -72,6 +59,7 @@ static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
*/
static void damon_test_three_regions_in_vmas(struct kunit *test)
{
+ static struct mm_struct mm;
struct damon_addr_range regions[3] = {0,};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
struct vm_area_struct vmas[] = {
@@ -83,9 +71,10 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
(struct vm_area_struct) {.vm_start = 307, .vm_end = 330},
};

- __link_vmas(vmas, 6);
+ mt_init_flags(&mm.mm_mt, MM_MT_FLAGS);
+ __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas));

- __damon_va_three_regions(&vmas[0], regions);
+ __damon_va_three_regions(&mm, regions);

KUNIT_EXPECT_EQ(test, 10ul, regions[0].start);
KUNIT_EXPECT_EQ(test, 25ul, regions[0].end);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b2ec0aa1ff45..9a7c52982c35 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -113,37 +113,38 @@ static unsigned long sz_range(struct damon_addr_range *r)
*
* Returns 0 if success, or negative error code otherwise.
*/
-static int __damon_va_three_regions(struct vm_area_struct *vma,
+static int __damon_va_three_regions(struct mm_struct *mm,
struct damon_addr_range regions[3])
{
- struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0};
- struct vm_area_struct *last_vma = NULL;
- unsigned long start = 0;
- struct rb_root rbroot;
-
- /* Find two biggest gaps so that first_gap > second_gap > others */
- for (; vma; vma = vma->vm_next) {
- if (!last_vma) {
- start = vma->vm_start;
- goto next;
- }
+ struct damon_addr_range first_gap = {0}, second_gap = {0};
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma, *prev = NULL;
+ unsigned long start;

- if (vma->rb_subtree_gap <= sz_range(&second_gap)) {
- rbroot.rb_node = &vma->vm_rb;
- vma = rb_entry(rb_last(&rbroot),
- struct vm_area_struct, vm_rb);
+ /*
+ * Find the two biggest gaps so that first_gap > second_gap > others.
+ * If this is too slow, it can be optimised to examine the maple
+ * tree gaps.
+ */
+ for_each_vma(vmi, vma) {
+ unsigned long gap;
+
+ if (!prev) {
+ start = vma->vm_start;
goto next;
}
-
- gap.start = last_vma->vm_end;
- gap.end = vma->vm_start;
- if (sz_range(&gap) > sz_range(&second_gap)) {
- swap(gap, second_gap);
- if (sz_range(&second_gap) > sz_range(&first_gap))
- swap(second_gap, first_gap);
+ gap = vma->vm_start - prev->vm_end;
+
+ if (gap > sz_range(&first_gap)) {
+ second_gap = first_gap;
+ first_gap.start = prev->vm_end;
+ first_gap.end = vma->vm_start;
+ } else if (gap > sz_range(&second_gap)) {
+ second_gap.start = prev->vm_end;
+ second_gap.end = vma->vm_start;
}
next:
- last_vma = vma;
+ prev = vma;
}

if (!sz_range(&second_gap) || !sz_range(&first_gap))
@@ -159,7 +160,7 @@ static int __damon_va_three_regions(struct vm_area_struct *vma,
regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
- regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION);
+ regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION);

return 0;
}
@@ -180,7 +181,7 @@ static int damon_va_three_regions(struct damon_target *t,
return -EINVAL;

mmap_read_lock(mm);
- rc = __damon_va_three_regions(mm->mmap, regions);
+ rc = __damon_va_three_regions(mm, regions);
mmap_read_unlock(mm);

mmput(mm);
--
2.35.1
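
Sketch (not part of the patch): the conversion above relies on the VMA
iterator this series introduces. A minimal usage example (mm from the
caller); the mmap lock must be held across the walk:

        VMA_ITERATOR(vmi, mm, 0);       /* begin the walk at address 0 */
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
        for_each_vma(vmi, vma) {
                /* VMAs are visited in ascending address order */
        }
        mmap_read_unlock(mm);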

2022-05-04 17:17:45

by Liam R. Howlett

Subject: [PATCH v9 16/69] proc: remove VMA rbtree use from nommu

From: "Matthew Wilcox (Oracle)" <[email protected]>

These users of the rbtree should probably have been walking the linked
list instead, but convert them to walks of the maple tree.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
fs/proc/task_nommu.c | 45 +++++++++++++++++++++-----------------------
1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a6d21fc0033c..2fd06f52b6a4 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -20,15 +20,13 @@
*/
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
-
- mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);

+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
bytes += kobjsize(vma);

region = vma->vm_region;
@@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)

unsigned long task_vsize(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- struct rb_node *p;
unsigned long vsize = 0;

mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- }
mmap_read_unlock(mm);
return vsize;
}
@@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long size = kobjsize(mm);

mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma) {
size += kobjsize(vma);
region = vma->vm_region;
if (region) {
@@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
*/
static int show_map(struct seq_file *m, void *_p)
{
- struct rb_node *p = _p;
-
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+ return nommu_vma_show(m, _p);
}

static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
struct mm_struct *mm;
- struct rb_node *p;
- loff_t n = *pos;
+ struct vm_area_struct *vma;
+ unsigned long addr = *pos;
+
+ /* See m_next(). Zero at the start or after lseek. */
+ if (addr == -1UL)
+ return NULL;

/* pin the task and mm whilst we play with them */
priv->task = get_proc_task(priv->inode);
@@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EINTR);
}

- /* start from the Nth VMA */
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
- if (n-- == 0)
- return p;
+ /* start the next element from addr */
+ vma = find_vma(mm, addr);
+ if (vma)
+ return vma;

mmap_read_unlock(mm);
mmput(mm);
@@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml)

static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
{
- struct rb_node *p = _p;
+ struct vm_area_struct *vma = _p;

- (*pos)++;
- return p ? rb_next(p) : NULL;
+ *pos = vma->vm_end;
+ return find_vma(vma->vm_mm, vma->vm_end);
}

static const struct seq_operations proc_pid_maps_ops = {
--
2.35.1
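
Sketch (not part of the patch): the seq_file conversion switches the
cursor in *pos from an index ("the Nth VMA") to an address, so
restarting after a partial read becomes a single tree lookup instead of
a walk from the head of the list. The advance step, in essence:

        /* remember where this VMA ends, then look up its successor */
        *pos = vma->vm_end;
        return find_vma(vma->vm_mm, vma->vm_end);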

2022-05-04 17:28:31

by Liam R. Howlett

Subject: [PATCH 1/1] mips: rename mt_init to mips_mt_init

From: "Liam R. Howlett" <[email protected]>

Move mt_init out of the way for the maple tree. Use the mips_mt prefix
to match the rest of the functions in the file.

Signed-off-by: Liam R. Howlett <[email protected]>
---
arch/mips/kernel/mips-mt.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c
index d5f7362e8c24..dc023a979803 100644
--- a/arch/mips/kernel/mips-mt.c
+++ b/arch/mips/kernel/mips-mt.c
@@ -230,7 +230,7 @@ void mips_mt_set_cpuoptions(void)

struct class *mt_class;

-static int __init mt_init(void)
+static int __init mips_mt_init(void)
{
struct class *mtc;

@@ -243,4 +243,4 @@ static int __init mt_init(void)
return 0;
}

-subsys_initcall(mt_init);
+subsys_initcall(mips_mt_init);
--
2.35.1

2022-05-04 18:03:35

by Liam R. Howlett

Subject: [PATCH v9 35/69] x86: remove vma linked list walks

From: "Matthew Wilcox (Oracle)" <[email protected]>

Use the VMA iterator instead.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
arch/x86/entry/vdso/vma.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 235a5794296a..3883da001c62 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -127,17 +127,17 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);

mmap_read_lock(mm);
-
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long size = vma->vm_end - vma->vm_start;

if (vma_is_special_mapping(vma, &vvar_mapping))
zap_page_range(vma, vma->vm_start, size);
}
-
mmap_read_unlock(mm);
+
return 0;
}
#else
@@ -354,6 +354,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);

mmap_write_lock(mm);
/*
@@ -363,7 +364,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
* We could search vma near context.vdso, but it's a slowpath,
* so let's explicitly check all VMAs to be completely sure.
*/
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (vma_is_special_mapping(vma, &vdso_mapping) ||
vma_is_special_mapping(vma, &vvar_mapping)) {
mmap_write_unlock(mm);
--
2.35.1

2022-05-04 18:03:35

by Liam R. Howlett

Subject: [PATCH v9 17/69] mm: remove rb tree.

From: "Liam R. Howlett" <[email protected]>

Remove the RB tree and start using the maple tree for vm_area_struct
tracking.

Drop validate_mm() calls in expand_upwards() and expand_downwards() as the
lock is not held.

Signed-off-by: Liam R. Howlett <[email protected]>
---
 arch/x86/kernel/tboot.c    |   1 -
 drivers/firmware/efi/efi.c |   1 -
 include/linux/mm.h         |   2 -
 include/linux/mm_types.h   |  14 -
 kernel/fork.c              |   8 -
 mm/init-mm.c               |   2 -
 mm/mmap.c                  | 509 ++++++++-----------------------------
 mm/nommu.c                 |  87 ++-----
 mm/util.c                  |  10 +-
9 files changed, 144 insertions(+), 490 deletions(-)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 859e8d2ea070..a8e3130890ea 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -97,7 +97,6 @@ void __init tboot_probe(void)

static pgd_t *tboot_pg_dir;
static struct mm_struct tboot_mm = {
- .mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 92a765d8d3b6..f18c256bbf89 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -54,7 +54,6 @@ static unsigned long __initdata mem_reserve = EFI_INVALID_TABLE_ADDR;
static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR;

struct mm_struct efi_mm = {
- .mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock),
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c259f15c58ac..d11673080c33 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2660,8 +2660,6 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
extern int split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
- struct rb_node **, struct rb_node *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e3c7855fc622..50c53f370bf6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -400,19 +400,6 @@ struct vm_area_struct {

/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next, *vm_prev;
-
- struct rb_node vm_rb;
-
- /*
- * Largest free memory gap in bytes to the left of this VMA.
- * Either between this VMA and vma->vm_prev, or between one of the
- * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
- * get_unmapped_area find a free area of the right size.
- */
- unsigned long rb_subtree_gap;
-
- /* Second cache line starts here. */
-
struct mm_struct *vm_mm; /* The address space we belong to. */

/*
@@ -478,7 +465,6 @@ struct mm_struct {
struct {
struct vm_area_struct *mmap; /* list of VMAs */
struct maple_tree mm_mt;
- struct rb_root mm_rb;
u64 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index 79af6e908539..60783abc21c8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -581,7 +581,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge = 0;
MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
@@ -608,8 +607,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;

- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
@@ -703,10 +700,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_prev = prev;
prev = tmp;

- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
-
/* Link the vma into the MT */
mas.index = tmp->vm_start;
mas.last = tmp->vm_end - 1;
@@ -1124,7 +1117,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
mm->vmacache_seqnum = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index b912b0f2eced..c9327abb771c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm_types.h>
-#include <linux/rbtree.h>
#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
@@ -29,7 +28,6 @@
* and size this cpu_bitmask to NR_CPUS.
*/
struct mm_struct init_mm = {
- .mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
diff --git a/mm/mmap.c b/mm/mmap.c
index ecdedf5191c0..44f9f4b5411e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -39,7 +39,6 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
-#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
@@ -294,93 +293,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
return origbrk;
}

-static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
-{
- unsigned long gap, prev_end;
-
- /*
- * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
- * allow two stack_guard_gaps between them here, and when choosing
- * an unmapped area; whereas when expanding we only require one.
- * That's a little inconsistent, but keeps the code here simpler.
- */
- gap = vm_start_gap(vma);
- if (vma->vm_prev) {
- prev_end = vm_end_gap(vma->vm_prev);
- if (gap > prev_end)
- gap -= prev_end;
- else
- gap = 0;
- }
- return gap;
-}
-
-#ifdef CONFIG_DEBUG_VM_RB
-static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
-{
- unsigned long max = vma_compute_gap(vma), subtree_gap;
- if (vma->vm_rb.rb_left) {
- subtree_gap = rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- if (vma->vm_rb.rb_right) {
- subtree_gap = rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- return max;
-}
-
-static int browse_rb(struct mm_struct *mm)
-{
- struct rb_root *root = &mm->mm_rb;
- int i = 0, j, bug = 0;
- struct rb_node *nd, *pn = NULL;
- unsigned long prev = 0, pend = 0;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev) {
- pr_emerg("vm_start %lx < prev %lx\n",
- vma->vm_start, prev);
- bug = 1;
- }
- if (vma->vm_start < pend) {
- pr_emerg("vm_start %lx < pend %lx\n",
- vma->vm_start, pend);
- bug = 1;
- }
- if (vma->vm_start > vma->vm_end) {
- pr_emerg("vm_start %lx > vm_end %lx\n",
- vma->vm_start, vma->vm_end);
- bug = 1;
- }
- spin_lock(&mm->page_table_lock);
- if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- pr_emerg("free gap %lx, correct %lx\n",
- vma->rb_subtree_gap,
- vma_compute_subtree_gap(vma));
- bug = 1;
- }
- spin_unlock(&mm->page_table_lock);
- i++;
- pn = nd;
- prev = vma->vm_start;
- pend = vma->vm_end;
- }
- j = 0;
- for (nd = pn; nd; nd = rb_prev(nd))
- j++;
- if (i != j) {
- pr_emerg("backwards %d, forwards %d\n", j, i);
- bug = 1;
- }
- return bug ? -1 : i;
-}
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
extern void mt_validate(struct maple_tree *mt);
extern void mt_dump(const struct maple_tree *mt);
@@ -406,19 +318,25 @@ static void validate_mm_mt(struct mm_struct *mm)
(vma->vm_end - 1 != mas.last)) {
pr_emerg("issue in %s\n", current->comm);
dump_stack();
-#ifdef CONFIG_DEBUG_VM
dump_vma(vma_mt);
- pr_emerg("and next in rb\n");
+ pr_emerg("and vm_next\n");
dump_vma(vma->vm_next);
-#endif
pr_emerg("mt piv: %px %lu - %lu\n", vma_mt,
mas.index, mas.last);
pr_emerg("mt vma: %px %lu - %lu\n", vma_mt,
vma_mt->vm_start, vma_mt->vm_end);
- pr_emerg("rb vma: %px %lu - %lu\n", vma,
+ if (vma->vm_prev) {
+ pr_emerg("ll prev: %px %lu - %lu\n",
+ vma->vm_prev, vma->vm_prev->vm_start,
+ vma->vm_prev->vm_end);
+ }
+ pr_emerg("ll vma: %px %lu - %lu\n", vma,
vma->vm_start, vma->vm_end);
- pr_emerg("rb->next = %px %lu - %lu\n", vma->vm_next,
- vma->vm_next->vm_start, vma->vm_next->vm_end);
+ if (vma->vm_next) {
+ pr_emerg("ll next: %px %lu - %lu\n",
+ vma->vm_next, vma->vm_next->vm_start,
+ vma->vm_next->vm_end);
+ }

mt_dump(mas.tree);
if (vma_mt->vm_end != mas.last + 1) {
@@ -442,21 +360,6 @@ static void validate_mm_mt(struct mm_struct *mm)
VM_BUG_ON(vma);
mt_validate(&mm->mm_mt);
}
-#else
-#define validate_mm_mt(root) do { } while (0)
-#endif
-static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
-{
- struct rb_node *nd;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- VM_BUG_ON_VMA(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
- vma);
- }
-}

static void validate_mm(struct mm_struct *mm)
{
@@ -465,7 +368,10 @@ static void validate_mm(struct mm_struct *mm)
unsigned long highest_address = 0;
struct vm_area_struct *vma = mm->mmap;

+ validate_mm_mt(mm);
+
while (vma) {
+#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;

@@ -475,6 +381,7 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
+#endif

highest_address = vm_end_gap(vma);
vma = vma->vm_next;
@@ -489,80 +396,13 @@ static void validate_mm(struct mm_struct *mm)
mm->highest_vm_end, highest_address);
bug = 1;
}
- i = browse_rb(mm);
- if (i != mm->map_count) {
- if (i != -1)
- pr_emerg("map_count %d rb %d\n", mm->map_count, i);
- bug = 1;
- }
VM_BUG_ON_MM(bug, mm);
}
-#else
-#define validate_mm_rb(root, ignore) do { } while (0)
+
+#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
#define validate_mm_mt(root) do { } while (0)
#define validate_mm(mm) do { } while (0)
-#endif
-
-RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
- struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_gap)
-
-/*
- * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
- * vma->vm_prev->vm_end values changed, without modifying the vma's position
- * in the rbtree.
- */
-static void vma_gap_update(struct vm_area_struct *vma)
-{
- /*
- * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
- * a callback function that does exactly what we want.
- */
- vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
-}
-
-static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- /* All rb_subtree_gap values must be consistent prior to insertion */
- validate_mm_rb(root, NULL);
-
- rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
-{
- /*
- * Note rb_erase_augmented is a fairly large inline function,
- * so make sure we instantiate it only once with our desired
- * augmented rbtree callbacks.
- */
- rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- struct rb_root *root,
- struct vm_area_struct *ignore)
-{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of
- *
- * a. the "next" vma being erased if next->vm_start was reduced in
- * __vma_adjust() -> __vma_unlink()
- * b. the vma being erased in detach_vmas_to_be_unmapped() ->
- * vma_rb_erase()
- */
- validate_mm_rb(root, ignore);
-
- __vma_rb_erase(vma, root);
-}
-
-static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- vma_rb_erase_ignore(vma, root, vma);
-}
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */

/*
* vma has some anon_vma assigned, and is already inserted on that
@@ -596,39 +436,26 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

-static int find_vma_links(struct mm_struct *mm, unsigned long addr,
- unsigned long end, struct vm_area_struct **pprev,
- struct rb_node ***rb_link, struct rb_node **rb_parent)
+/*
+ * range_has_overlap() - Check the @start - @end range for overlapping VMAs and
+ * sets up a pointer to the previous VMA
+ * @mm: the mm struct
+ * @start: the start address of the range
+ * @end: the end address of the range
+ * @pprev: the pointer to the pointer of the previous VMA
+ *
+ * Returns: True if there is an overlapping VMA, false otherwise
+ */
+static inline
+bool range_has_overlap(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct vm_area_struct **pprev)
{
- struct rb_node **__rb_link, *__rb_parent, *rb_prev;
-
- mmap_assert_locked(mm);
- __rb_link = &mm->mm_rb.rb_node;
- rb_prev = __rb_parent = NULL;
-
- while (*__rb_link) {
- struct vm_area_struct *vma_tmp;
+ struct vm_area_struct *existing;

- __rb_parent = *__rb_link;
- vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- /* Fail if an existing vma overlaps the area */
- if (vma_tmp->vm_start < end)
- return -ENOMEM;
- __rb_link = &__rb_parent->rb_left;
- } else {
- rb_prev = __rb_parent;
- __rb_link = &__rb_parent->rb_right;
- }
- }
-
- *pprev = NULL;
- if (rb_prev)
- *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- *rb_link = __rb_link;
- *rb_parent = __rb_parent;
- return 0;
+ MA_STATE(mas, &mm->mm_mt, start, start);
+ existing = mas_find(&mas, end - 1);
+ *pprev = mas_prev(&mas, 0);
+ return existing ? true : false;
}

/*
@@ -655,8 +482,6 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm,
* @start: The start of the range.
* @len: The length of the range.
* @pprev: pointer to the pointer that will be set to previous vm_area_struct
- * @rb_link: the rb_node
- * @rb_parent: the parent rb_node
*
* Find all the vm_area_struct that overlap from @start to
* @end and munmap them. Set @pprev to the previous vm_area_struct.
@@ -665,14 +490,11 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm,
*/
static inline int
munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
- struct vm_area_struct **pprev, struct rb_node ***link,
- struct rb_node **parent, struct list_head *uf)
+ struct vm_area_struct **pprev, struct list_head *uf)
{
-
- while (find_vma_links(mm, start, start + len, pprev, link, parent))
+ while (range_has_overlap(mm, start, start + len, pprev))
if (do_munmap(mm, start, len, uf))
return -ENOMEM;
-
return 0;
}

@@ -693,30 +515,6 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
return nr_pages;
}

-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- struct rb_node **rb_link, struct rb_node *rb_parent)
-{
- /* Update tracking information for the gap following the new vma. */
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
-
- /*
- * vma->vm_prev wasn't known when we followed the rbtree to find the
- * correct insertion point for that vma. As a result, we could not
- * update the vma vm_rb parents rb_subtree_gap values on the way down.
- * So, we first insert the vma with a zero rb_subtree_gap value
- * (to be consistent with what we did on the way down), and then
- * immediately update the gap to the correct value. Finally we
- * rebalance the rbtree after all augmented values have been set.
- */
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- vma->rb_subtree_gap = 0;
- vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
-}
-
static void __vma_link_file(struct vm_area_struct *vma)
{
struct file *file;
@@ -784,18 +582,8 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start,
mas_store_prealloc(mas, NULL);
}

-static void
-__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- __vma_link_list(mm, vma, prev);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
-}
-
static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
MA_STATE(mas, &mm->mm_mt, 0, 0);
struct address_space *mapping = NULL;
@@ -809,7 +597,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
}

vma_mas_store(vma, &mas);
- __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link_list(mm, vma, prev);
__vma_link_file(vma);

if (mapping)
@@ -822,34 +610,20 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma,

/*
* Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree. It has already been inserted into the interval tree.
+ * mm's list and the mm tree. It has already been inserted into the interval tree.
*/
static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas,
struct vm_area_struct *vma)
{
struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
-
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- BUG();

+ mas_set(mas, vma->vm_start);
+ prev = mas_prev(mas, 0);
vma_mas_store(vma, mas);
__vma_link_list(mm, vma, prev);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
mm->map_count++;
}

-static __always_inline void __vma_unlink(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *ignore)
-{
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- __vma_unlink_list(mm, vma);
- /* Kill the cache */
- vmacache_invalidate(mm);
-}
-
/*
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
* is already present in an i_mmap tree without adjusting the tree.
@@ -862,20 +636,18 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
+ struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end);
+ struct vm_area_struct *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
- bool start_changed = false, end_changed = false;
+ bool vma_changed = false;
long adjust_next = 0;
int remove_next = 0;
MA_STATE(mas, &mm->mm_mt, 0, 0);
struct vm_area_struct *exporter = NULL, *importer = NULL;

- validate_mm(mm);
- validate_mm_mt(mm);
-
if (next && !insert) {
if (end >= next->vm_end) {
/*
@@ -905,8 +677,9 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* remove_next == 1 is case 1 or 7.
*/
remove_next = 1 + (end > next->vm_end);
+ next_next = find_vma(mm, next->vm_end);
VM_WARN_ON(remove_next == 2 &&
- end != next->vm_next->vm_end);
+ end != next_next->vm_end);
/* trim end to next, for case 6 first pass */
end = next->vm_end;
}
@@ -1005,21 +778,21 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
}

if (start != vma->vm_start) {
- unsigned long old_start = vma->vm_start;
+ if (vma->vm_start < start)
+ vma_mas_szero(&mas, vma->vm_start, start);
+ vma_changed = true;
vma->vm_start = start;
- if (old_start < start)
- vma_mas_szero(&mas, old_start, start);
- start_changed = true;
}
if (end != vma->vm_end) {
- unsigned long old_end = vma->vm_end;
+ if (vma->vm_end > end)
+ vma_mas_szero(&mas, end, vma->vm_end);
+ vma_changed = true;
vma->vm_end = end;
- if (old_end > end)
- vma_mas_szero(&mas, end, old_end);
- end_changed = true;
+ if (!next)
+ mm->highest_vm_end = vm_end_gap(vma);
}

- if (end_changed || start_changed)
+ if (vma_changed)
vma_mas_store(vma, &mas);

vma->vm_pgoff = pgoff;
@@ -1037,25 +810,9 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
}

if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- * Since we have expanded over this vma, the maple tree will
- * have overwritten by storing the value
- */
- if (remove_next != 3)
- __vma_unlink(mm, next, next);
- else
- /*
- * vma is not before next if they've been
- * swapped.
- *
- * pre-swap() next->vm_start was reduced so
- * tell validate_mm_rb to ignore pre-swap()
- * "next" (which is stored in post-swap()
- * "vma").
- */
- __vma_unlink(mm, next, vma);
+ __vma_unlink_list(mm, next);
+ /* Kill the cache */
+ vmacache_invalidate(mm);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
@@ -1065,15 +822,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* (it may either follow vma or precede it).
*/
__insert_vm_struct(mm, &mas, insert);
- } else {
- if (start_changed)
- vma_gap_update(vma);
- if (end_changed) {
- if (!next)
- mm->highest_vm_end = vm_end_gap(vma);
- else if (!adjust_next)
- vma_gap_update(next);
- }
}

if (anon_vma) {
@@ -1100,7 +848,9 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
+ BUG_ON(vma->vm_end < next->vm_end);
vm_area_free(next);
+
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -1113,7 +863,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* "next->vm_prev->vm_end" changed and the
* "vma->vm_next" gap must be updated.
*/
- next = vma->vm_next;
+ next = next_next;
} else {
/*
* For the scope of the comment "next" and
@@ -1128,13 +878,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
next = vma;
}
if (remove_next == 2) {
+ mas_reset(&mas);
remove_next = 1;
end = next->vm_end;
goto again;
- }
- else if (next)
- vma_gap_update(next);
- else {
+ } else if (!next) {
/*
* If remove_next == 2 we obviously can't
* reach this path.
@@ -1161,8 +909,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
uprobe_mmap(insert);

validate_mm(mm);
- validate_mm_mt(mm);
-
return 0;
}

@@ -1315,7 +1061,6 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *area, *next;
int err;

- validate_mm_mt(mm);
/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
@@ -1391,7 +1136,6 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
- validate_mm_mt(mm);

return NULL;
}
@@ -1561,6 +1305,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags_t vm_flags;
int pkey = 0;

+ validate_mm(mm);
*populate = 0;

if (!len)
@@ -1868,10 +1613,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev, *merge;
int error;
- struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;

- validate_mm_mt(mm);
/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
@@ -1887,8 +1630,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return -ENOMEM;
}

- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ /* Clear old maps, set up prev and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, uf))
return -ENOMEM;
/*
* Private writable mapping: check memory availability
@@ -1986,7 +1729,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
goto free_vma;
}

- if (vma_link(mm, vma, prev, rb_link, rb_parent)) {
+ if (vma_link(mm, vma, prev)) {
error = -ENOMEM;
if (file)
goto unmap_and_free_vma;
@@ -2026,7 +1769,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,

vma_set_page_prot(vma);

- validate_mm_mt(mm);
return addr;

unmap_and_free_vma:
@@ -2043,7 +1785,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unacct_error:
if (charged)
vm_unacct_memory(charged);
- validate_mm_mt(mm);
return error;
}

@@ -2379,7 +2120,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
int error = 0;
MA_STATE(mas, &mm->mm_mt, 0, 0);

- validate_mm_mt(mm);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;

@@ -2431,15 +2171,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2450,9 +2188,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Overwrite old entry in mtree. */
vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
+ if (!vma->vm_next)
mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);

@@ -2462,8 +2198,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
- validate_mm(mm);
- validate_mm_mt(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2471,15 +2205,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_downwards(struct vm_area_struct *vma,
- unsigned long address)
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
int error = 0;
MA_STATE(mas, &mm->mm_mt, 0, 0);

- validate_mm(mm);
address &= PAGE_MASK;
if (address < mmap_min_addr)
return -EPERM;
@@ -2521,15 +2253,13 @@ int expand_downwards(struct vm_area_struct *vma,
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2541,7 +2271,6 @@ int expand_downwards(struct vm_area_struct *vma,
/* Overwrite old entry in mtree. */
vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);

perf_event_mmap(vma);
@@ -2550,7 +2279,6 @@ int expand_downwards(struct vm_area_struct *vma,
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
- validate_mm(mm);
return error;
}

@@ -2682,10 +2410,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas,

insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
- mas_set_range(mas, vma->vm_start, end - 1);
- mas_store_prealloc(mas, NULL);
+ vma_mas_szero(mas, vma->vm_start, end);
do {
- vma_rb_erase(vma, &mm->mm_rb);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm -= vma_pages(vma);
mm->map_count--;
@@ -2693,10 +2419,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas,
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
- if (vma) {
+ if (vma)
vma->vm_prev = prev;
- vma_gap_update(vma);
- } else
+ else
mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
tail_vma->vm_next = NULL;

@@ -2815,11 +2540,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (len == 0)
return -EINVAL;

- /*
- * arch_unmap() might do unmaps itself. It must be called
- * and finish any rbtree manipulation before this code
- * runs and also starts to manipulate the rbtree.
- */
+ /* arch_unmap() might do unmaps itself. */
arch_unmap(mm, start, end);

/* Find the first overlapping VMA where start < vma->vm_end */
@@ -2830,6 +2551,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (mas_preallocate(&mas, vma, GFP_KERNEL))
return -ENOMEM;
prev = vma->vm_prev;
+ /* we have start < vma->vm_end */
+
+ /* if it doesn't overlap, we have nothing.. */
+ if (vma->vm_start >= end)
+ return 0;

/*
* If we need to split any vma, do it now to save pain later.
@@ -2890,6 +2616,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
/* Fix up all other VM information */
remove_vma_list(mm, vma);

+
+ validate_mm(mm);
return downgrade ? 1 : 0;

map_count_exceeded:
@@ -3028,11 +2756,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
+static int do_brk_flags(unsigned long addr, unsigned long len,
+ unsigned long flags, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
- struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
unsigned long mapped_addr;
@@ -3051,8 +2779,8 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (error)
return error;

- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ /* Clear old maps, set up prev and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, uf))
return -ENOMEM;

/* Check against address space limits *after* clearing old maps... */
@@ -3086,7 +2814,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
- if(vma_link(mm, vma, prev, rb_link, rb_parent))
+ if(vma_link(mm, vma, prev))
goto no_vma_link;

out:
@@ -3203,26 +2931,10 @@ void exit_mmap(struct mm_struct *mm)
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
- unsigned long start = vma->vm_start;
- struct vm_area_struct *overlap = NULL;

- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
+ if (range_has_overlap(mm, vma->vm_start, vma->vm_end, &prev))
return -ENOMEM;

- overlap = mt_find(&mm->mm_mt, &start, vma->vm_end - 1);
- if (overlap) {
-
- pr_err("Found vma ending at %lu\n", start - 1);
- pr_err("vma : %lu => %lu-%lu\n", (unsigned long)overlap,
- overlap->vm_start, overlap->vm_end - 1);
-#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
- mt_dump(&mm->mm_mt);
-#endif
- BUG();
- }
-
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
@@ -3244,7 +2956,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}

- if (vma_link(mm, vma, prev, rb_link, rb_parent))
+ if (vma_link(mm, vma, prev))
return -ENOMEM;

return 0;
@@ -3262,9 +2974,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
- struct rb_node **rb_link, *rb_parent;
bool faulted_in_anon_vma = true;
- unsigned long index = addr;

validate_mm_mt(mm);
/*
@@ -3276,10 +2986,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}

- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ if (range_has_overlap(mm, addr, addr + len, &prev))
return NULL; /* should never get here */
- if (mt_find(&mm->mm_mt, &index, addr+len - 1))
- BUG();
+
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
@@ -3320,12 +3029,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, new_vma, prev))
+ goto out_vma_link;
*need_rmap_locks = false;
}
validate_mm_mt(mm);
return new_vma;

+out_vma_link:
+ if (new_vma->vm_ops && new_vma->vm_ops->close)
+ new_vma->vm_ops->close(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..81408d20024f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -553,9 +553,9 @@ static void put_nommu_region(struct vm_region *region)
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *pvma, *prev;
struct address_space *mapping;
- struct rb_node **p, *parent, *rb_prev;
+ struct vm_area_struct *prev;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);

BUG_ON(!vma->vm_region);

@@ -573,42 +573,10 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
i_mmap_unlock_write(mapping);
}

+ prev = mas_prev(&mas, 0);
+ mas_reset(&mas);
/* add the VMA to the tree */
- parent = rb_prev = NULL;
- p = &mm->mm_rb.rb_node;
- while (*p) {
- parent = *p;
- pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
-
- /* sort by: start addr, end addr, VMA struct addr in that order
- * (the latter is necessary as we may get identical VMAs) */
- if (vma->vm_start < pvma->vm_start)
- p = &(*p)->rb_left;
- else if (vma->vm_start > pvma->vm_start) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma->vm_end < pvma->vm_end)
- p = &(*p)->rb_left;
- else if (vma->vm_end > pvma->vm_end) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma < pvma)
- p = &(*p)->rb_left;
- else if (vma > pvma) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else
- BUG();
- }
-
- rb_link_node(&vma->vm_rb, parent, p);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
-
- /* add VMA to the VMA list also */
- prev = NULL;
- if (rb_prev)
- prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
-
+ vma_mas_store(vma, &mas);
__vma_link_list(mm, vma, prev);
}

@@ -621,6 +589,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
struct task_struct *curr = current;
+ MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);

mm->map_count--;
for (i = 0; i < VMACACHE_SIZE; i++) {
@@ -643,8 +612,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
}

/* remove from the MM's tree and list */
- rb_erase(&vma->vm_rb, &mm->mm_rb);
-
+ vma_mas_remove(vma, &mas);
__vma_unlink_list(mm, vma);
}

@@ -668,24 +636,19 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);

/* check the cache first */
vma = vmacache_find(mm, addr);
if (likely(vma))
return vma;

- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end > addr) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ vma = mas_walk(&mas);

- return NULL;
+ if (vma)
+ vmacache_update(addr, vma);
+
+ return vma;
}
EXPORT_SYMBOL(find_vma);

@@ -717,26 +680,23 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long end = addr + len;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);

/* check the cache first */
vma = vmacache_find_exact(mm, addr, end);
if (vma)
return vma;

- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start < addr)
- continue;
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end == end) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ vma = mas_walk(&mas);
+ if (!vma)
+ return NULL;
+ if (vma->vm_start != addr)
+ return NULL;
+ if (vma->vm_end != end)
+ return NULL;

- return NULL;
+ vmacache_update(addr, vma);
+ return vma;
}

/*
@@ -1533,6 +1493,7 @@ void exit_mmap(struct mm_struct *mm)
delete_vma(mm, vma);
cond_resched();
}
+ __mt_destroy(&mm->mm_mt);
}

int vm_brk(unsigned long addr, unsigned long len)
diff --git a/mm/util.c b/mm/util.c
index 3492a9e81aa3..3e97807c353b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -287,6 +287,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
vma->vm_next = next;
if (next)
next->vm_prev = vma;
+ else
+ mm->highest_vm_end = vm_end_gap(vma);
}

void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
@@ -299,8 +301,14 @@ void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
prev->vm_next = next;
else
mm->mmap = next;
- if (next)
+ if (next) {
next->vm_prev = prev;
+ } else {
+ if (prev)
+ mm->highest_vm_end = vm_end_gap(prev);
+ else
+ mm->highest_vm_end = 0;
+ }
}

/* Check if the vma is being used as a stack by this task */
--
2.35.1
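
Sketch (not part of the patch): range_has_overlap() above is the
maple-tree translation of find_vma_links(). One MA_STATE answers both
questions the old rbtree descent answered:

        MA_STATE(mas, &mm->mm_mt, start, start);
        struct vm_area_struct *existing, *prev;

        existing = mas_find(&mas, end - 1);  /* entry overlapping [start, end)? */
        prev = mas_prev(&mas, 0);            /* the VMA preceding the range */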

2022-05-04 18:04:43

by Liam R. Howlett

Subject: [PATCH v9 50/69] bpf: remove VMA linked list

From: "Liam R. Howlett" <[email protected]>

Use vma_next() and remove the reference to the start of the linked list.

Signed-off-by: Liam R. Howlett <[email protected]>
---
kernel/bpf/task_iter.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index d94696198ef8..9a0bbc808433 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -300,8 +300,8 @@ struct bpf_iter_seq_task_vma_info {
};

enum bpf_task_vma_iter_find_op {
- task_vma_iter_first_vma, /* use mm->mmap */
- task_vma_iter_next_vma, /* use curr_vma->vm_next */
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
task_vma_iter_find_vma, /* use find_vma() to find next vma */
};

@@ -401,10 +401,10 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)

switch (op) {
case task_vma_iter_first_vma:
- curr_vma = curr_task->mm->mmap;
+ curr_vma = find_vma(curr_task->mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
@@ -418,7 +418,7 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {
--
2.35.1
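
Sketch (not part of the patch): find_vma(mm, addr) returns the lowest
VMA with vm_end > addr, so passing the current VMA's vm_end yields its
successor. A hypothetical helper spelling out the idiom:

        /* hypothetical helper, not in the tree */
        static struct vm_area_struct *next_vma(struct mm_struct *mm,
                                               struct vm_area_struct *vma)
        {
                return find_vma(mm, vma->vm_end);  /* NULL past the last VMA */
        }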

2022-05-04 18:19:03

by Liam R. Howlett

Subject: [PATCH v9 52/69] mm/khugepaged: stop using vma linked list

From: "Matthew Wilcox (Oracle)" <[email protected]>

Use the vma iterator and find_vma() instead of the vma linked list.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
---
 mm/huge_memory.c |  4 ++--
 mm/khugepaged.c  | 11 ++++++++---
2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c468fee595ff..c72827d9cf04 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2221,11 +2221,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
split_huge_pmd_if_needed(vma, end);

/*
- * If we're also updating the vma->vm_next->vm_start,
+ * If we're also updating the next vma vm_start,
* check if we need to split it.
*/
if (adjust_next > 0) {
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
unsigned long nstart = next->vm_start;
nstart += adjust_next;
split_huge_pmd_if_needed(next, nstart);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 03fda93ade3e..208fc0e19eb1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2089,10 +2089,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
+ struct vma_iterator vmi;
struct mm_slot *mm_slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int progress = 0;
+ unsigned long address;

VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
@@ -2116,11 +2118,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
vma = NULL;
if (unlikely(!mmap_read_trylock(mm)))
goto breakouterloop_mmap_lock;
- if (likely(!khugepaged_test_exit(mm)))
- vma = find_vma(mm, khugepaged_scan.address);

progress++;
- for (; vma; vma = vma->vm_next) {
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ address = khugepaged_scan.address;
+ vma_iter_init(&vmi, mm, address);
+ for_each_vma(vmi, vma) {
unsigned long hstart, hend;

cond_resched();
--
2.35.1
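
Sketch (not part of the patch): khugepaged resumes its scan at a saved
address by seeding the iterator rather than chaining vm_next. The
resume pattern, with resume_addr standing in for khugepaged_scan.address
and the mmap lock held:

        struct vma_iterator vmi;
        struct vm_area_struct *vma;

        vma_iter_init(&vmi, mm, resume_addr);
        for_each_vma(vmi, vma) {
                /* first iteration yields the VMA at or after resume_addr */
        }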

2022-05-04 20:34:41

by Liam R. Howlett

Subject: [PATCH v9 60/69] mm/msync: use vma_find() instead of vma linked list

From: "Liam R. Howlett" <[email protected]>

Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
mm/msync.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/msync.c b/mm/msync.c
index 137d1c104f3e..ac4c9bfea2e7 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
error = 0;
goto out_unlock;
}
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
}
}
out_unlock:
--
2.35.1

2022-05-04 20:36:20

by Liam R. Howlett

Subject: [PATCH v9 41/69] exec: use VMA iterator instead of linked list

From: "Matthew Wilcox (Oracle)" <[email protected]>

Remove a use of the vm_next list by doing the initial lookup with the VMA
iterator and then using it to find the next entry.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
---
fs/exec.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 14e7278a1ab8..b5e3bfd52b53 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -686,6 +686,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ struct vm_area_struct *next;
struct mmu_gather tlb;

BUG_ON(new_start > new_end);
@@ -694,7 +696,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* ensure there are no vmas between where we want to go
* and where we are
*/
- if (vma != find_vma(mm, new_start))
+ if (vma != vma_next(&vmi))
return -EFAULT;

/*
@@ -713,12 +715,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)

lru_add_drain();
tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@@ -727,7 +730,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
}
tlb_finish_mmu(&tlb);

--
2.35.1
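
Sketch (not part of the patch): vma_next() advances the iterator and
returns the following VMA (or NULL), so one iterator serves both the
initial sanity check and the later ceiling computation:

        VMA_ITERATOR(vmi, mm, new_start);

        vma = vma_next(&vmi);   /* the VMA at or after new_start */
        ...
        next = vma_next(&vmi);  /* its successor, or NULL at the end */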

2022-05-04 20:40:03

by Liam R. Howlett

Subject: [PATCH v9 23/69] mm: use maple tree operations for find_vma_intersection()

From: "Liam R. Howlett" <[email protected]>

Move find_vma_intersection() to mmap.c and change the implementation to
use the maple tree.

When searching for a vma within a range, it is easier to use the maple
tree interface.

Export find_vma_intersection() for the kvm module.
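
For illustration, a sketch of the mt_find() behaviour this builds on
(range_has_mapping() is a hypothetical caller, not part of the patch;
assumes mmap_lock is held). mt_find() returns the first entry stored at
an index no greater than the max argument, and updates @index past it:

static bool range_has_mapping(struct mm_struct *mm, unsigned long start,
                              unsigned long end)
{
        unsigned long index = start;    /* advanced by mt_find() on success */

        mmap_assert_locked(mm);
        /* First VMA overlapping [start, end), or NULL if none exists */
        return mt_find(&mm->mm_mt, &index, end - 1) != NULL;
}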

Signed-off-by: Liam R. Howlett <[email protected]>
---
include/linux/mm.h | 22 ++++------------------
mm/mmap.c | 29 +++++++++++++++++++++++++++++
mm/nommu.c | 11 +++++++++++
3 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0fdb19d1b48b..6db9face6f84 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2780,26 +2780,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);

-/**
- * find_vma_intersection() - Look up the first VMA which intersects the interval
- * @mm: The process address space.
- * @start_addr: The inclusive start user address.
- * @end_addr: The exclusive end user address.
- *
- * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
- * start_addr < end_addr.
+/*
+ * Look up the first VMA which intersects the interval [start_addr, end_addr)
+ * NULL if none. Assume start_addr < end_addr.
*/
-static inline
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
- unsigned long start_addr,
- unsigned long end_addr)
-{
- struct vm_area_struct *vma = find_vma(mm, start_addr);
-
- if (vma && end_addr <= vma->vm_start)
- vma = NULL;
- return vma;
-}
+ unsigned long start_addr, unsigned long end_addr);

/**
* vma_lookup() - Find a VMA at a specific address
diff --git a/mm/mmap.c b/mm/mmap.c
index ec4ce76f02dc..5f948f353376 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2069,6 +2069,35 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,

EXPORT_SYMBOL(get_unmapped_area);

+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
+ * start_addr < end_addr.
+ */
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
+{
+ struct vm_area_struct *vma;
+ unsigned long index = start_addr;
+
+ mmap_assert_locked(mm);
+ /* Check the cache first. */
+ vma = vmacache_find(mm, start_addr);
+ if (likely(vma))
+ return vma;
+
+ vma = mt_find(&mm->mm_mt, &index, end_addr - 1);
+ if (vma)
+ vmacache_update(start_addr, vma);
+ return vma;
+}
+EXPORT_SYMBOL(find_vma_intersection);
+
/**
* find_vma() - Find the VMA for a given address, or the next vma.
* @mm: The mm_struct to check
diff --git a/mm/nommu.c b/mm/nommu.c
index 81408d20024f..2870edfad8ed 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -629,6 +629,17 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
vm_area_free(vma);
}

+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
+{
+ unsigned long index = start_addr;
+
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
+
/*
* look up the first VMA in which addr resides, NULL if none
* - should be called with mm->mmap_lock at least held readlocked
--
2.35.1

2022-05-04 22:51:38

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 65/69] nommu: remove uses of VMA linked list

From: "Matthew Wilcox (Oracle)" <[email protected]>

Use the maple tree or VMA iterator instead. This is faster and will allow
us to shrink the VMA.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
mm/nommu.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/mm/nommu.c b/mm/nommu.c
index 1c9b4e8c4d5c..d94f6adf9c31 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1383,6 +1383,7 @@ static int shrink_vma(struct mm_struct *mm,
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *vma;
unsigned long end;
int ret;
@@ -1394,7 +1395,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
end = start + len;

/* find the first potentially overlapping VMA */
- vma = find_vma(mm, start);
+ vma = mas_find(&mas, end - 1);
if (!vma) {
static int limit;
if (limit < 5) {
@@ -1413,7 +1414,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
return -EINVAL;
if (end == vma->vm_end)
goto erase_whole_vma;
- vma = vma->vm_next;
+ vma = mas_next(&mas, end - 1);
} while (vma);
return -EINVAL;
} else {
@@ -1462,6 +1463,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
*/
void exit_mmap(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;

if (!mm)
@@ -1469,12 +1471,17 @@ void exit_mmap(struct mm_struct *mm)

mm->total_vm = 0;

- while ((vma = mm->mmap)) {
- mm->mmap = vma->vm_next;
+ /*
+ * Lock the mm to avoid assert complaining even though this is the only
+ * user of the mm
+ */
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
delete_vma_from_mm(vma);
delete_vma(mm, vma);
cond_resched();
}
+ mmap_write_unlock(mm);
__mt_destroy(&mm->mm_mt);
}

--
2.35.1

2022-05-04 23:08:37

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 40/69] coredump: remove vma linked list walk

From: "Matthew Wilcox (Oracle)" <[email protected]>

Use the Maple Tree iterator instead. This is too complicated for the VMA
iterator to handle, so let's open-code it for now. If this turns out to
be a common pattern, we can migrate it to common code.
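
A condensed sketch of the open-coded walk (illustration only; the
gate_vma handling and snapshot bookkeeping are in the diff below):

        MA_STATE(mas, &mm->mm_mt, 0, 0);
        struct vm_area_struct *vma;

        mmap_write_lock(mm);
        /* mas_next() resumes from the maple state, so this is one O(n) pass */
        while ((vma = mas_next(&mas, ULONG_MAX)) != NULL) {
                /* ... record vma into the snapshot ... */
        }
        mmap_write_unlock(mm);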

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
---
fs/coredump.c | 34 ++++++++++++----------------------
1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index ebc43f960b64..3a0022c1ca36 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1072,30 +1072,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
return vma->vm_end - vma->vm_start;
}

-static struct vm_area_struct *first_vma(struct task_struct *tsk,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret = tsk->mm->mmap;
-
- if (ret)
- return ret;
- return gate_vma;
-}
-
/*
* Helper function for iterating across a vma list. It ensures that the caller
* will visit `gate_vma' prior to terminating the search.
*/
-static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
+static struct vm_area_struct *coredump_next_vma(struct ma_state *mas,
+ struct vm_area_struct *vma,
struct vm_area_struct *gate_vma)
{
- struct vm_area_struct *ret;
-
- ret = this_vma->vm_next;
- if (ret)
- return ret;
- if (this_vma == gate_vma)
+ if (gate_vma && (vma == gate_vma))
return NULL;
+
+ vma = mas_next(mas, ULONG_MAX);
+ if (vma)
+ return vma;
return gate_vma;
}

@@ -1119,9 +1109,10 @@ static void free_vma_snapshot(struct coredump_params *cprm)
*/
static bool dump_vma_snapshot(struct coredump_params *cprm)
{
- struct vm_area_struct *vma, *gate_vma;
+ struct vm_area_struct *gate_vma, *vma = NULL;
struct mm_struct *mm = current->mm;
- int i;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ int i = 0;

/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1141,8 +1132,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
return false;
}

- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma), i++) {
+ while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) {
struct core_vma_metadata *m = cprm->vma_meta + i;

m->start = vma->vm_start;
@@ -1150,10 +1140,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
m->pgoff = vma->vm_pgoff;
-
m->file = vma->vm_file;
if (m->file)
get_file(m->file);
+ i++;
}

mmap_write_unlock(mm);
--
2.35.1

2022-05-05 00:28:47

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 26/69] mm: convert vma_lookup() to use mtree_load()

From: "Liam R. Howlett" <[email protected]>

Unlike the rbtree, the Maple Tree returns NULL if there's nothing at
a particular address.

Since the previous commit dropped the vmacache, it is now possible to
consult the tree directly.
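
A short illustration of the semantic difference (hypothetical snippet,
not part of the diff; assumes mmap_lock is held):

        /* find_vma(): first VMA with vm_end > addr - may lie above addr */
        vma = find_vma(mm, addr);

        /* mtree_load(): the entry covering addr exactly, or NULL */
        vma = mtree_load(&mm->mm_mt, addr);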

Signed-off-by: Liam R. Howlett <[email protected]>
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
include/linux/mm.h | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6db9face6f84..f6d633f04a64 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2797,12 +2797,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = find_vma(mm, addr);
-
- if (vma && addr < vma->vm_start)
- vma = NULL;
-
- return vma;
+ return mtree_load(&mm->mm_mt, addr);
}

static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
--
2.35.1

2022-05-05 03:33:45

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 34/69] s390: remove vma linked list walks

From: "Matthew Wilcox (Oracle)" <[email protected]>

Use the VMA iterator instead.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
arch/s390/kernel/vdso.c | 3 ++-
arch/s390/mm/gmap.c | 6 ++++--
2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 99694260cac9..66f7e7c63632 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -68,10 +68,11 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
struct mm_struct *mm = task->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;

mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long size = vma->vm_end - vma->vm_start;

if (!vma_is_special_mapping(vma, &vvar_mapping))
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index af03cacf34ec..8a639487f840 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2508,8 +2508,9 @@ static const struct mm_walk_ops thp_split_walk_ops = {
static inline void thp_split_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);

- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
vma->vm_flags &= ~VM_HUGEPAGE;
vma->vm_flags |= VM_NOHUGEPAGE;
walk_page_vma(vma, &thp_split_walk_ops, NULL);
@@ -2577,8 +2578,9 @@ int gmap_mark_unmergeable(void)
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int ret;
+ VMA_ITERATOR(vmi, mm, 0);

- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
MADV_UNMERGEABLE, &vma->vm_flags);
if (ret)
--
2.35.1

2022-05-05 09:21:06

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 57/69] mm/mlock: use vma iterator and maple state instead of vma linked list

From: "Matthew Wilcox (Oracle)" <[email protected]>

Handle overflow checking in count_mm_mlocked_page_nr() differently.
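
The new code clamps the end of the range up front instead of relying on
per-VMA checks inside the loop; a sketch of the arithmetic (as in the
diff below):

        unsigned long end;

        /* start + len may wrap past ULONG_MAX; clamp instead of wrapping */
        if (unlikely(ULONG_MAX - len < start))
                end = ULONG_MAX;
        else
                end = start + len;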

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
---
mm/mlock.c | 35 +++++++++++++++++++----------------
1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/mm/mlock.c b/mm/mlock.c
index 716caf851043..c41604ba5197 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
unsigned long nstart, end, tmp;
struct vm_area_struct *vma, *prev;
int error;
+ MA_STATE(mas, &current->mm->mm_mt, start, start);

VM_BUG_ON(offset_in_page(start));
VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
return -EINVAL;
if (end == start)
return 0;
- vma = find_vma(current->mm, start);
- if (!vma || vma->vm_start > start)
+ vma = mas_walk(&mas);
+ if (!vma)
return -ENOMEM;

- prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
+ else
+ prev = mas_prev(&mas, 0);

for (nstart = start ; ; ) {
vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
@@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
if (nstart >= end)
break;

- vma = prev->vm_next;
+ vma = find_vma(prev->vm_mm, prev->vm_end);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
@@ -526,24 +528,23 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long count = 0;
+ unsigned long end;
+ VMA_ITERATOR(vmi, mm, start);

if (mm == NULL)
mm = current->mm;

- vma = find_vma(mm, start);
- if (vma == NULL)
- return 0;
-
- for (; vma ; vma = vma->vm_next) {
- if (start >= vma->vm_end)
- continue;
- if (start + len <= vma->vm_start)
- break;
+ /* Don't overflow past ULONG_MAX */
+ if (unlikely(ULONG_MAX - len < start))
+ end = ULONG_MAX;
+ else
+ end = start + len;
+ for_each_vma_range(vmi, vma, end) {
if (vma->vm_flags & VM_LOCKED) {
if (start > vma->vm_start)
count -= (start - vma->vm_start);
- if (start + len < vma->vm_end) {
- count += start + len - vma->vm_start;
+ if (end < vma->vm_end) {
+ count += end - vma->vm_start;
break;
}
count += vma->vm_end - vma->vm_start;
@@ -659,6 +660,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
*/
static int apply_mlockall_flags(int flags)
{
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;

@@ -679,7 +681,7 @@ static int apply_mlockall_flags(int flags)
to_add |= VM_LOCKONFAULT;
}

- for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
vm_flags_t newflags;

newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
@@ -687,6 +689,7 @@ static int apply_mlockall_flags(int flags)

/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ mas_pause(&mas);
cond_resched();
}
out:
--
2.35.1

2022-05-05 09:48:37

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 47/69] perf: use VMA iterator

From: "Matthew Wilcox (Oracle)" <[email protected]>

The VMA iterator is faster than the linked list and removing the linked
list will shrink the vm_area_struct.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
kernel/events/core.c | 3 ++-
kernel/events/uprobes.c | 9 ++++++---
2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7858bafffa9d..e2da3045d274 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10211,8 +10211,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);

- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6418083901d4..84b5a7cdfe81 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -349,9 +349,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;

- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;

@@ -1230,11 +1231,12 @@ int uprobe_apply(struct inode *inode, loff_t offset,

static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;

mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;

@@ -1982,9 +1984,10 @@ bool uprobe_deny_signal(void)

static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;

- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
--
2.35.1

2022-05-05 12:21:25

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 37/69] cxl: remove vma linked list walk

From: "Matthew Wilcox (Oracle)" <[email protected]>

Use the VMA iterator instead. This requires a little restructuring of the
surrounding code to hoist the mm to the caller. That turns
cxl_prefault_one() into a trivial function, so call cxl_fault_segment()
directly.
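
The resulting shape, condensed for illustration (the switch statement and
pr_devel() message are as in the diff below): the caller takes and drops
the mm reference once, and the helpers simply take mm as a parameter.

void cxl_prefault(struct cxl_context *ctx, u64 wed)
{
        struct mm_struct *mm = get_mem_context(ctx);

        if (!mm)
                return;         /* pr_devel() message elided */

        if (ctx->afu->prefault_mode == CXL_PREFAULT_WED)
                cxl_fault_segment(ctx, mm, wed);        /* one segment */
        else if (ctx->afu->prefault_mode == CXL_PREFAULT_ALL)
                cxl_prefault_vma(ctx, mm);              /* walk all VMAs */

        mmput(mm);
}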

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
drivers/misc/cxl/fault.c | 45 ++++++++++++++--------------------------
1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 60c829113299..2c64f55cf01f 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -280,22 +280,6 @@ void cxl_handle_fault(struct work_struct *fault_work)
mmput(mm);
}

-static void cxl_prefault_one(struct cxl_context *ctx, u64 ea)
-{
- struct mm_struct *mm;
-
- mm = get_mem_context(ctx);
- if (mm == NULL) {
- pr_devel("cxl_prefault_one unable to get mm %i\n",
- pid_nr(ctx->pid));
- return;
- }
-
- cxl_fault_segment(ctx, mm, ea);
-
- mmput(mm);
-}
-
static u64 next_segment(u64 ea, u64 vsid)
{
if (vsid & SLB_VSID_B_1T)
@@ -306,23 +290,16 @@ static u64 next_segment(u64 ea, u64 vsid)
return ea + 1;
}

-static void cxl_prefault_vma(struct cxl_context *ctx)
+static void cxl_prefault_vma(struct cxl_context *ctx, struct mm_struct *mm)
{
u64 ea, last_esid = 0;
struct copro_slb slb;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int rc;
- struct mm_struct *mm;
-
- mm = get_mem_context(ctx);
- if (mm == NULL) {
- pr_devel("cxl_prefault_vm unable to get mm %i\n",
- pid_nr(ctx->pid));
- return;
- }

mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
for (ea = vma->vm_start; ea < vma->vm_end;
ea = next_segment(ea, slb.vsid)) {
rc = copro_calculate_slb(mm, ea, &slb);
@@ -337,20 +314,28 @@ static void cxl_prefault_vma(struct cxl_context *ctx)
}
}
mmap_read_unlock(mm);
-
- mmput(mm);
}

void cxl_prefault(struct cxl_context *ctx, u64 wed)
{
+ struct mm_struct *mm = get_mem_context(ctx);
+
+ if (mm == NULL) {
+ pr_devel("cxl_prefault unable to get mm %i\n",
+ pid_nr(ctx->pid));
+ return;
+ }
+
switch (ctx->afu->prefault_mode) {
case CXL_PREFAULT_WED:
- cxl_prefault_one(ctx, wed);
+ cxl_fault_segment(ctx, mm, wed);
break;
case CXL_PREFAULT_ALL:
- cxl_prefault_vma(ctx);
+ cxl_prefault_vma(ctx, mm);
break;
default:
break;
}
+
+ mmput(mm);
}
--
2.35.1

2022-05-05 12:34:16

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 58/69] mm/mprotect: use maple tree navigation instead of vma linked list

From: "Liam R. Howlett" <[email protected]>

Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
mm/mprotect.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index b69ce7a7b2b7..81e5392ab13e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -539,6 +539,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
+ MA_STATE(mas, &current->mm->mm_mt, start, start);

start = untagged_addr(start);

@@ -570,7 +571,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
goto out;

- vma = find_vma(current->mm, start);
+ vma = mas_find(&mas, ULONG_MAX);
error = -ENOMEM;
if (!vma)
goto out;
@@ -596,7 +597,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (start > vma->vm_start)
prev = vma;
else
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);

for (nstart = start ; ; ) {
unsigned long mask_off_old_flags;
@@ -658,7 +659,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (nstart >= end)
goto out;

- vma = prev->vm_next;
+ vma = find_vma(current->mm, prev->vm_end);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
goto out;
--
2.35.1

2022-05-05 12:41:11

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 49/69] fork: use VMA iterator

From: "Matthew Wilcox (Oracle)" <[email protected]>

The VMA iterator is faster than the linked list and removing the linked
list will shrink the vm_area_struct.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
kernel/fork.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 4af22dd65fc6..9fcbd0b5c0be 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1291,13 +1291,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/* Forbid mm->exe_file change if old file still mapped. */
old_exe_file = get_mm_exe_file(mm);
if (old_exe_file) {
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (path_equal(&vma->vm_file->f_path,
- &old_exe_file->f_path))
+ &old_exe_file->f_path)) {
ret = -EBUSY;
+ break;
+ }
}
mmap_read_unlock(mm);
fput(old_exe_file);
--
2.35.1

2022-05-05 13:35:01

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 21/69] mm/khugepaged: optimize collapse_pte_mapped_thp() by using vma_lookup()

From: "Liam R. Howlett" <[email protected]>

vma_lookup() walks the vma tree once and does not continue looking for
the next vma. Since the exact vma is checked below anyway, this is a
more efficient way of searching.
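
The difference, sketched (haddr as in the function below; illustration
only):

        vma = find_vma(mm, haddr);      /* may return the next VMA above haddr */
        vma = vma_lookup(mm, haddr);    /* NULL unless a VMA contains haddr */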

Signed-off-by: Liam R. Howlett <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
---
mm/khugepaged.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ac53ad2c9bb1..03fda93ade3e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1435,7 +1435,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
- struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd;
--
2.35.1

2022-05-05 15:45:21

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 24/69] mm/mmap: use advanced maple tree API for mmap_region()

From: "Liam R. Howlett" <[email protected]>

Changing mmap_region() to use the maple tree state and the advanced maple
tree interface allows for a lot less tree walking.

This change removes the last caller of munmap_vma_range(), so drop this
unused function.

Add vma_expand() to expand a VMA if possible; it performs the necessary
hugepage check, uprobe_munmap of files, dcache flush, the modifications
themselves, and then undoes the detaches, etc.
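
A condensed sketch of the new expansion path (illustration only; the
mergeability checks, merge_start/merge_end bookkeeping, and accounting
are in the diff below):

        next = mas_next(&mas, ULONG_MAX);
        prev = mas_prev(&mas, 0);

        if (next && next->vm_start == end /* && mergeable with next */)
                vma = next;     /* grow next downward over [addr, end) */
        if (prev && prev->vm_end == addr /* && mergeable with prev */)
                vma = prev;     /* grow prev upward over [addr, end) */

        /* One preallocated tree write instead of unmap plus re-link */
        if (vma && !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next))
                goto expanded;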

Signed-off-by: Liam R. Howlett <[email protected]>
---
mm/mmap.c | 245 +++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 198 insertions(+), 47 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 5f948f353376..baf608975f99 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -516,28 +516,6 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm,
return vma->vm_next;
}

-/*
- * munmap_vma_range() - munmap VMAs that overlap a range.
- * @mm: The mm struct
- * @start: The start of the range.
- * @len: The length of the range.
- * @pprev: pointer to the pointer that will be set to previous vm_area_struct
- *
- * Find all the vm_area_struct that overlap from @start to
- * @end and munmap them. Set @pprev to the previous vm_area_struct.
- *
- * Returns: -ENOMEM on munmap failure or 0 on success.
- */
-static inline int
-munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
- struct vm_area_struct **pprev, struct list_head *uf)
-{
- while (range_has_overlap(mm, start, start + len, pprev))
- if (do_munmap(mm, start, len, uf))
- return -ENOMEM;
- return 0;
-}
-
static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
@@ -664,6 +642,127 @@ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas,
mm->map_count++;
}

+/*
+ * vma_expand - Expand an existing VMA
+ *
+ * @mas: The maple state
+ * @vma: The vma to expand
+ * @start: The start of the vma
+ * @end: The exclusive end of the vma
+ * @pgoff: The page offset of vma
+ * @next: The current of next vma.
+ *
+ * Expand @vma to @start and @end. Can expand off the start and end. Will
+ * expand over @next if it's different from @vma and @end == @next->vm_end.
+ * Checking if the @vma can expand and merge with @next needs to be handled by
+ * the caller.
+ *
+ * Returns: 0 on success
+ */
+inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff,
+ struct vm_area_struct *next)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping = NULL;
+ struct rb_root_cached *root = NULL;
+ struct anon_vma *anon_vma = vma->anon_vma;
+ struct file *file = vma->vm_file;
+ bool remove_next = false;
+ bool anon_cloned = false;
+
+ if (next && (vma != next) && (end == next->vm_end)) {
+ remove_next = true;
+ if (next->anon_vma && !vma->anon_vma) {
+ int error;
+
+ vma->anon_vma = next->anon_vma;
+ error = anon_vma_clone(vma, next);
+ if (error)
+ return error;
+ anon_cloned = true;
+ }
+ }
+
+ /* Not merging but overwriting any part of next is not handled. */
+ VM_BUG_ON(!remove_next && next != vma && end > next->vm_start);
+ /* Only handles expanding */
+ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ goto nomem;
+
+ vma_adjust_trans_huge(vma, start, end, 0);
+
+ if (file) {
+ mapping = file->f_mapping;
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ i_mmap_lock_write(mapping);
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_remove(vma, root);
+ } else if (anon_vma) {
+ anon_vma_lock_write(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ /* Note: mas must be pointing to the expanding VMA */
+ vma_mas_store(vma, mas);
+
+ if (file) {
+ vma_interval_tree_insert(vma, root);
+ flush_dcache_mmap_unlock(mapping);
+ }
+
+ /* Expanding over the next vma */
+ if (remove_next) {
+ /* Remove from mm linked list - also updates highest_vm_end */
+ __vma_unlink_list(mm, next);
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
+
+ if (file)
+ __remove_shared_vm_struct(next, file, mapping);
+
+ } else if (!next) {
+ mm->highest_vm_end = vm_end_gap(vma);
+ }
+
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(anon_vma);
+ }
+
+ if (file) {
+ i_mmap_unlock_write(mapping);
+ uprobe_mmap(vma);
+ }
+
+ if (remove_next) {
+ if (file) {
+ uprobe_munmap(next, next->vm_start, next->vm_end);
+ fput(file);
+ }
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
+ mm->map_count--;
+ mpol_put(vma_policy(next));
+ vm_area_free(next);
+ }
+
+ validate_mm(mm);
+ return 0;
+
+nomem:
+ if (anon_cloned)
+ unlink_anon_vmas(vma);
+ return -ENOMEM;
+}
+
/*
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
* is already present in an i_mmap tree without adjusting the tree.
@@ -1665,9 +1764,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev, *merge;
- int error;
+ struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *prev, *next;
+ pgoff_t pglen = len >> PAGE_SHIFT;
unsigned long charged = 0;
+ unsigned long end = addr + len;
+ unsigned long merge_start = addr, merge_end = end;
+ pgoff_t vm_pgoff;
+ int error;
+ MA_STATE(mas, &mm->mm_mt, addr, end - 1);

/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
@@ -1677,16 +1782,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
- nr_pages = count_vma_pages_range(mm, addr, addr + len);
+ nr_pages = count_vma_pages_range(mm, addr, end);

if (!may_expand_vm(mm, vm_flags,
(len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}

- /* Clear old maps, set up prev and uf */
- if (munmap_vma_range(mm, addr, len, &prev, uf))
+ /* Unmap any existing mapping in the area */
+ if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
+
/*
* Private writable mapping: check memory availability
*/
@@ -1697,14 +1803,43 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags |= VM_ACCOUNT;
}

- /*
- * Can we just expand an old mapping?
- */
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
+ next = mas_next(&mas, ULONG_MAX);
+ prev = mas_prev(&mas, 0);
+ if (vm_flags & VM_SPECIAL)
+ goto cannot_expand;
+
+ /* Attempt to expand an old mapping */
+ /* Check next */
+ if (next && next->vm_start == end && !vma_policy(next) &&
+ can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
+ NULL_VM_UFFD_CTX, NULL)) {
+ merge_end = next->vm_end;
+ vma = next;
+ vm_pgoff = next->vm_pgoff - pglen;
+ }
+
+ /* Check prev */
+ if (prev && prev->vm_end == addr && !vma_policy(prev) &&
+ (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
+ pgoff, vma->vm_userfaultfd_ctx, NULL) :
+ can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
+ NULL_VM_UFFD_CTX , NULL))) {
+ merge_start = prev->vm_start;
+ vma = prev;
+ vm_pgoff = prev->vm_pgoff;
+ }
+
+
+ /* Actually expand, if possible */
+ if (vma &&
+ !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) {
+ khugepaged_enter_vma_merge(vma, vm_flags);
+ goto expanded;
+ }

+ mas.index = addr;
+ mas.last = end - 1;
+cannot_expand:
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
@@ -1717,7 +1852,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}

vma->vm_start = addr;
- vma->vm_end = addr + len;
+ vma->vm_end = end;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
@@ -1738,28 +1873,30 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
- * Bug: If addr is changed, prev, rb_link, rb_parent should
- * be updated for vma_link()
*/
WARN_ON_ONCE(addr != vma->vm_start);

addr = vma->vm_start;
+ mas_reset(&mas);

/* If vm_flags changed after call_mmap(), we should try merge vma again
* as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ next = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (merge) {
+ if (next) {
/* ->mmap() can change vma->vm_file and fput the original file. So
* fput the vma->vm_file here or we would add an extra fput for file
* and cause general protection fault ultimately.
*/
fput(vma->vm_file);
vm_area_free(vma);
- vma = merge;
- /* Update vm_flags to pick up the change. */
+ vma = prev;
+ /* Update vm_flags and possible addr to pick up the change. We don't
+ * warn here if addr changed as the vma is not linked by vma_link().
+ */
+ addr = vma->vm_start;
vm_flags = vma->vm_flags;
goto unmap_writable;
}
@@ -1783,7 +1920,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
goto free_vma;
}

- if (vma_link(mm, vma, prev)) {
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
error = -ENOMEM;
if (file)
goto unmap_and_free_vma;
@@ -1791,12 +1928,28 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
goto free_vma;
}

+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ vma_mas_store(vma, &mas);
+ __vma_link_list(mm, vma, prev);
+ mm->map_count++;
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(vma->vm_file->f_mapping);
+
+ flush_dcache_mmap_lock(vma->vm_file->f_mapping);
+ vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
+ flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ }
+
/* Once vma denies write, undo our temporary denial count */
unmap_writable:
if (file && vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
file = vma->vm_file;
-out:
+expanded:
perf_event_mmap(vma);

vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
@@ -1823,6 +1976,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,

vma_set_page_prot(vma);

+ validate_mm(mm);
return addr;

unmap_and_free_vma:
@@ -1839,6 +1993,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unacct_error:
if (charged)
vm_unacct_memory(charged);
+ validate_mm(mm);
return error;
}

@@ -2636,10 +2791,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
prev = vma->vm_prev;
/* we have start < vma->vm_end */

- /* if it doesn't overlap, we have nothing.. */
- if (vma->vm_start >= end)
- return 0;
-
/*
* If we need to split any vma, do it now to save pain later.
*
--
2.35.1

2022-05-05 18:20:12

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 54/69] mm/madvise: use vma_find() instead of vma linked list

From: "Liam R. Howlett" <[email protected]>

madvise_walk_vmas() no longer uses the linked list.
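
The linked-list successor is now computed with a fresh lookup (as in the
diff below):

        /* successor of prev, without vm_next: first VMA above prev's end */
        vma = find_vma(mm, prev->vm_end);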

Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
mm/madvise.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index c965fac7b13a..3d8413a27757 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1227,7 +1227,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
if (start >= end)
break;
if (prev)
- vma = prev->vm_next;
+ vma = find_vma(mm, prev->vm_end);
else /* madvise_remove dropped mmap_lock */
vma = find_vma(mm, start);
}
--
2.35.1

2022-05-05 18:34:52

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 56/69] mm/mempolicy: use vma iterator & maple state instead of vma linked list

From: "Liam R. Howlett" <[email protected]>

Rework the way mbind_range() finds the first VMA so that it reuses the
maple state and limits the number of tree walks needed.

Note, this drops the VM_BUG_ON(!vma) call, which would catch a start
address higher than the last VMA. The old code was written in a way that
allowed no VMA updates to occur while still returning success. There
should be no functional change in this scenario with the new code.
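
A sketch of the first-VMA lookup that replaces find_vma() (condensed
from the diff below; the same maple state is then reused for the walk):

        MA_STATE(mas, &mm->mm_mt, start - 1, start - 1);

        prev = mas_find_rev(&mas, 0);   /* last VMA at or below start - 1 */
        if (prev && start < prev->vm_end)
                vma = prev;                     /* start falls inside prev */
        else
                vma = mas_next(&mas, end - 1);  /* first VMA in [start, end) */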

Signed-off-by: Liam R. Howlett <[email protected]>
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
mm/mempolicy.c | 56 ++++++++++++++++++++++++++++----------------------
1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0288ffaea064..df4487767259 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -380,9 +380,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);

mmap_write_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for_each_vma(vmi, vma)
mpol_rebind_policy(vma->vm_policy, new);
mmap_write_unlock(mm);
}
@@ -649,7 +650,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->vma;
+ struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
@@ -664,9 +665,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
/* hole at head side of range */
return -EFAULT;
}
+ next = find_vma(vma->vm_mm, vma->vm_end);
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
((vma->vm_end < qp->end) &&
- (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ (!next || vma->vm_end < next->vm_start)))
/* hole at middle or tail of range */
return -EFAULT;

@@ -780,26 +782,24 @@ static int vma_replace_policy(struct vm_area_struct *vma,
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
+ MA_STATE(mas, &mm->mm_mt, start - 1, start - 1);
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
- unsigned long vmstart;
- unsigned long vmend;
-
- vma = find_vma(mm, start);
- VM_BUG_ON(!vma);

- prev = vma->vm_prev;
- if (start > vma->vm_start)
- prev = vma;
+ prev = mas_find_rev(&mas, 0);
+ if (prev && (start < prev->vm_end))
+ vma = prev;
+ else
+ vma = mas_next(&mas, end - 1);

- for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
+ for (; vma; vma = mas_next(&mas, end - 1)) {
+ unsigned long vmstart = max(start, vma->vm_start);
+ unsigned long vmend = min(end, vma->vm_end);

if (mpol_equal(vma_policy(vma), new_pol))
- continue;
+ goto next;

pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
@@ -808,6 +808,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
new_pol, vma->vm_userfaultfd_ctx,
anon_vma_name(vma));
if (prev) {
+ /* vma_merge() invalidated the mas */
+ mas_pause(&mas);
vma = prev;
goto replace;
}
@@ -815,19 +817,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
err = split_vma(vma->vm_mm, vma, vmstart, 1);
if (err)
goto out;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
if (vma->vm_end != vmend) {
err = split_vma(vma->vm_mm, vma, vmend, 0);
if (err)
goto out;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
- replace:
+replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
+next:
+ prev = vma;
}

- out:
+out:
return err;
}

@@ -1042,6 +1050,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
+ struct vm_area_struct *vma;
LIST_HEAD(pagelist);
int err = 0;
struct migration_target_control mtc = {
@@ -1057,8 +1066,9 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
* need migration. Between passing in the full user address
* space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
*/
+ vma = find_vma(mm, 0);
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);

if (!list_empty(&pagelist)) {
@@ -1188,14 +1198,13 @@ static struct page *new_page(struct page *page, unsigned long start)
struct folio *dst, *src = page_folio(page);
struct vm_area_struct *vma;
unsigned long address;
+ VMA_ITERATOR(vmi, current->mm, start);
gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;

- vma = find_vma(current->mm, start);
- while (vma) {
+ for_each_vma(vmi, vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
- vma = vma->vm_next;
}

if (folio_test_hugetlb(src))
@@ -1473,6 +1482,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
unsigned long vmend;
unsigned long end;
int err = -ENOENT;
+ VMA_ITERATOR(vmi, mm, start);

start = untagged_addr(start);
if (start & ~PAGE_MASK)
@@ -1498,9 +1508,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
if (end == start)
return 0;
mmap_write_lock(mm);
- vma = find_vma(mm, start);
- for (; vma && vma->vm_start < end; vma = vma->vm_next) {
-
+ for_each_vma_range(vmi, vma, end) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
new = mpol_dup(vma_policy(vma));
--
2.35.1

2022-05-05 21:26:44

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 36/69] xtensa: remove vma linked list walks

From: "Matthew Wilcox (Oracle)" <[email protected]>

Use the VMA iterator instead. Since the VMA can no longer be NULL in the
loop, the out-of-memory check is handled outside the loop. This means a
slightly longer run time in the failure case (-ENOMEM): the loop runs to
the end of the VMAs before erroring out instead of failing mid-loop.
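
The restructured loop, condensed (alignment handling as in the diff
below); the -ENOMEM check can only trigger after the walk, because
for_each_vma() never yields a NULL VMA:

        for_each_vma(vmi, vmm) {
                if (addr + len <= vm_start_gap(vmm))
                        break;          /* the gap before vmm fits the request */
                addr = vmm->vm_end;     /* skip past vmm and keep looking */
        }

        if (TASK_SIZE - len < addr)     /* ran off the end of the address space */
                return -ENOMEM;
        return addr;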

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
---
arch/xtensa/kernel/syscall.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c
index 201356faa7e6..b3c2450d6f23 100644
--- a/arch/xtensa/kernel/syscall.c
+++ b/arch/xtensa/kernel/syscall.c
@@ -58,6 +58,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct vm_area_struct *vmm;
+ struct vma_iterator vmi;

if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
@@ -79,15 +80,20 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
else
addr = PAGE_ALIGN(addr);

- for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
- /* At this point: (!vmm || addr < vmm->vm_end). */
- if (TASK_SIZE - len < addr)
- return -ENOMEM;
- if (!vmm || addr + len <= vm_start_gap(vmm))
- return addr;
+ vma_iter_init(&vmi, current->mm, addr);
+ for_each_vma(vmi, vmm) {
+ /* At this point: (addr < vmm->vm_end). */
+ if (addr + len <= vm_start_gap(vmm))
+ break;
+
addr = vmm->vm_end;
if (flags & MAP_SHARED)
addr = COLOUR_ALIGN(addr, pgoff);
}
+
+ if (TASK_SIZE - len < addr)
+ return -ENOMEM;
+
+ return addr;
}
#endif
--
2.35.1

2022-05-06 03:04:46

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v9 46/69] acct: use VMA iterator instead of linked list

From: "Matthew Wilcox (Oracle)" <[email protected]>

The VMA iterator is faster than the linked list.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
kernel/acct.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/acct.c b/kernel/acct.c
index 3df53cf1dcd5..2e7bf8d41f04 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -535,15 +535,14 @@ void acct_collect(long exitcode, int group_dead)
unsigned long vsize = 0;

if (group_dead && current->mm) {
+ struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;

- mmap_read_lock(current->mm);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}

spin_lock_irq(&current->sighand->siglock);
--
2.35.1

2022-05-10 12:38:20

by SeongJae Park

[permalink] [raw]
Subject: Re: [PATCH v9 15/69] damon: Convert __damon_va_three_regions to use the VMA iterator

On Wed, 4 May 2022 01:12:26 +0000 Liam Howlett <[email protected]> wrote:

> From: "Liam R. Howlett" <[email protected]>
>
> This rather specialised walk can use the VMA iterator. If this proves to
> be too slow, we can write a custom routine to find the two largest gaps,
> but it will be somewhat complicated, so let's see if we need it first.
>
> Update the kunit test case to use the maple tree. This also fixes an
> issue with the kunit testcase not adding the last VMA to the list.
>
> Fixes: 17ccae8bb5c9 (mm/damon: add kunit tests)
> Signed-off-by: Liam R. Howlett <[email protected]>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> Reviewed-by: SeongJae Park <[email protected]>
> ---
> mm/damon/vaddr-test.h | 37 +++++++++++-------------------
> mm/damon/vaddr.c | 53 ++++++++++++++++++++++---------------------
> 2 files changed, 40 insertions(+), 50 deletions(-)
>
> diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
> index 5431da4fe9d4..dbf2b8759607 100644
> --- a/mm/damon/vaddr-test.h
> +++ b/mm/damon/vaddr-test.h
> @@ -13,34 +13,21 @@
> #define _DAMON_VADDR_TEST_H
>
> #include <kunit/test.h>
> +#include "../../mm/internal.h"

The v9 maple tree patchset has moved the definition of vma_mas_store() from
internal.h to mmap.c, so the inclusion of internal.h wouldn't be needed here,
right?

If we end up moving the definitions back to internal.h, then because this file
is under mm/damon/, we can also use the shorter include path, "../internal.h".


Thanks,
SJ

[...]

2022-05-10 21:58:50

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH v9 15/69] damon: Convert __damon_va_three_regions to use the VMA iterator

* SeongJae Park <[email protected]> [220510 03:44]:
> On Wed, 4 May 2022 01:12:26 +0000 Liam Howlett <[email protected]> wrote:
>
> > From: "Liam R. Howlett" <[email protected]>
> >
> > This rather specialised walk can use the VMA iterator. If this proves to
> > be too slow, we can write a custom routine to find the two largest gaps,
> > but it will be somewhat complicated, so let's see if we need it first.
> >
> > Update the kunit test case to use the maple tree. This also fixes an
> > issue with the kunit testcase not adding the last VMA to the list.
> >
> > Fixes: 17ccae8bb5c9 (mm/damon: add kunit tests)
> > Signed-off-by: Liam R. Howlett <[email protected]>
> > Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> > Reviewed-by: SeongJae Park <[email protected]>
> > ---
> > mm/damon/vaddr-test.h | 37 +++++++++++-------------------
> > mm/damon/vaddr.c | 53 ++++++++++++++++++++++---------------------
> > 2 files changed, 40 insertions(+), 50 deletions(-)
> >
> > diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
> > index 5431da4fe9d4..dbf2b8759607 100644
> > --- a/mm/damon/vaddr-test.h
> > +++ b/mm/damon/vaddr-test.h
> > @@ -13,34 +13,21 @@
> > #define _DAMON_VADDR_TEST_H
> >
> > #include <kunit/test.h>
> > +#include "../../mm/internal.h"
>
> The v9 maple tree patchset has moved the definition of vma_mas_store() from
> internal.h to mmap.c, so the inclusion of internal.h wouldn't be needed here,
> right?
>
> If we end up moving the definitions back to internal.h, then because this file
> is under mm/damon/, we can also use the shorter include path, "../internal.h".

Yeah, that seems like a good plan.

I will be moving it back to internal to restore functionality to nommu.

2022-05-11 04:43:07

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v9 15/69] damon: Convert __damon_va_three_regions to use the VMA iterator

On Tue, 10 May 2022 10:44:28 +0000 SeongJae Park <[email protected]> wrote:

> On Wed, 4 May 2022 01:12:26 +0000 Liam Howlett <[email protected]> wrote:
>
> > From: "Liam R. Howlett" <[email protected]>
> >
> > This rather specialised walk can use the VMA iterator. If this proves to
> > be too slow, we can write a custom routine to find the two largest gaps,
> > but it will be somewhat complicated, so let's see if we need it first.
> >
> > Update the kunit test case to use the maple tree. This also fixes an
> > issue with the kunit testcase not adding the last VMA to the list.
> >
> > Fixes: 17ccae8bb5c9 (mm/damon: add kunit tests)
> > Signed-off-by: Liam R. Howlett <[email protected]>
> > Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> > Reviewed-by: SeongJae Park <[email protected]>
> > ---
> > mm/damon/vaddr-test.h | 37 +++++++++++-------------------
> > mm/damon/vaddr.c | 53 ++++++++++++++++++++++---------------------
> > 2 files changed, 40 insertions(+), 50 deletions(-)
> >
> > diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
> > index 5431da4fe9d4..dbf2b8759607 100644
> > --- a/mm/damon/vaddr-test.h
> > +++ b/mm/damon/vaddr-test.h
> > @@ -13,34 +13,21 @@
> > #define _DAMON_VADDR_TEST_H
> >
> > #include <kunit/test.h>
> > +#include "../../mm/internal.h"
>
> The v9 maple tree patchset has moved the definition of vma_mas_store() from
> internal.h to mmap.c, so the inclusion of internal.h wouldn't be needed here,
> right?
>
> If we end up moving the definitions back to internal.h, then because this file
> is under mm/damon/, we can also use the shorter include path, "../internal.h".

I put the vma_mas_store() and vma_mas_remove() declarations into
include/linux/mm.h so yes, internal.h is no longer required. I queued
a fixlet against
damon-convert-__damon_va_three_regions-to-use-the-vma-iterator.patch


--- a/mm/damon/vaddr-test.h~damon-convert-__damon_va_three_regions-to-use-the-vma-iterator-fix
+++ a/mm/damon/vaddr-test.h
@@ -13,7 +13,6 @@
#define _DAMON_VADDR_TEST_H

#include <kunit/test.h>
-#include "../../mm/internal.h"

static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas,
ssize_t nr_vmas)
_


2022-05-13 09:28:16

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH 1/1] mips: rename mt_init to mips_mt_init

On 04.05.22 02:26, Liam Howlett wrote:
> From: "Liam R. Howlett" <[email protected]>
>
> Move mt_init out of the way for the maple tree. Use mips_mt prefix to
> match the rest of the functions in the file.
>
> Signed-off-by: Liam R. Howlett <[email protected]>


Reviewed-by: David Hildenbrand <[email protected]>

--
Thanks,

David / dhildenb