2012-06-18 22:05:48

by Rik van Riel

Subject: [PATCH -mm 0/7] mm: scalable and unified arch_get_unmapped_area

[actually include all 7 patches]

A long time ago, we decided to limit the number of VMAs per
process to 64k. As it turns out, there actually are programs
using tens of thousands of VMAs.

The linear search in arch_get_unmapped_area and
arch_get_unmapped_area_topdown can be a real issue for
those programs.

This patch series aims to fix the scalability issue by
tracking the size of each free hole in the VMA rbtree,
propagating the free hole info up the tree.
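
To illustrate the bookkeeping (a stand-alone user-space sketch, not
the kernel code; the struct, helper and example addresses are made
up), each node caches the largest free gap anywhere in its subtree,
recomputed from its own gap and its children whenever the tree
changes:

#include <stdio.h>

/* One tree node: a mapping plus the largest free gap in its subtree. */
struct demo_vma {
        unsigned long vm_start, vm_end;  /* this mapping */
        unsigned long prev_end;          /* end of the mapping just below it */
        unsigned long free_gap;          /* largest free gap in this subtree */
        struct demo_vma *left, *right;   /* children (rb colour omitted) */
};

/* Recompute free_gap for one node from its own gap and its children's. */
static void update_free_gap(struct demo_vma *v)
{
        unsigned long gap = v->vm_start - v->prev_end;

        if (v->left && v->left->free_gap > gap)
                gap = v->left->free_gap;
        if (v->right && v->right->free_gap > gap)
                gap = v->right->free_gap;
        v->free_gap = gap;
}

int main(void)
{
        /* Mappings at [0x1000,0x2000), [0x8000,0x9000), [0xf000,0x10000). */
        struct demo_vma lo   = { 0x1000, 0x2000,  0x0000, 0, NULL, NULL };
        struct demo_vma hi   = { 0xf000, 0x10000, 0x9000, 0, NULL, NULL };
        struct demo_vma root = { 0x8000, 0x9000,  0x2000, 0, &lo, &hi };

        /* Recompute from the changed nodes up towards the root, the way
           the kernel walks the rb-tree path after an insert or erase. */
        update_free_gap(&lo);
        update_free_gap(&hi);
        update_free_gap(&root);

        printf("largest free gap in the tree: %#lx\n", root.free_gap);
        return 0;
}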

Another major goal is to put the bulk of the necessary
arch_get_unmapped_area(_topdown) functionality into one
set of functions, so we can eliminate the large custom
functions in each architecture and stick to a few much
smaller architecture-specific functions instead.

In this version I have only gotten rid of the x86, ARM
and MIPS arch-specific code, and am already showing a
fairly promising diffstat:

arch/arm/include/asm/pgtable.h | 6
arch/arm/mm/init.c | 3
arch/arm/mm/mmap.c | 217 ------------------
arch/mips/include/asm/page.h | 2
arch/mips/include/asm/pgtable.h | 7
arch/mips/mm/mmap.c | 177 --------------
arch/x86/include/asm/elf.h | 3
arch/x86/include/asm/pgtable_64.h | 4
arch/x86/kernel/sys_x86_64.c | 200 ++--------------
arch/x86/vdso/vma.c | 2
include/linux/mm_types.h | 8
include/linux/sched.h | 13 +
mm/internal.h | 5
mm/mmap.c | 455 ++++++++++++++++++++++++++++++--------
14 files changed, 420 insertions(+), 682 deletions(-)

TODO:
- eliminate arch-specific functions for more architectures
- integrate hugetlbfs alignment (with Andi Kleen's patch?)

Performance

Testing performance with a benchmark that allocates tens
of thousands of VMAs, unmaps them and mmaps them some more
in a loop, shows promising results.

Vanilla 3.4 kernel:
$ ./agua_frag_test_64
..........

Min Time (ms): 6
Avg. Time (ms): 294.0000
Max Time (ms): 609
Std Dev (ms): 113.1664
Standard deviation exceeds 10

With patches:
$ ./agua_frag_test_64
..........

Min Time (ms): 14
Avg. Time (ms): 38.0000
Max Time (ms): 60
Std Dev (ms): 3.9312
All checks pass

The total run time of the test goes down by about a
factor of 4. More importantly, the worst-case performance
of the loop (which is what really hurt some applications)
has gone down by about a factor of 10.


2012-06-18 22:06:07

by Rik van Riel

Subject: [PATCH -mm 3/7] Allow each architecture to specify the address range that can be used for this allocation.

From: Rik van Riel <[email protected]>

On x86-64, this is used to implement MAP_32BIT semantics.

On PPC and IA64, allocations using certain page sizes need to be
restricted to certain virtual address ranges. This callback could
be used to implement such address restrictions with minimal hassle.
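
For illustration only, here is a compile-only user-space sketch of
what such a hook can look like; the DEMO_* constants, the restricted
window and the function name are invented, and only the calling
convention matches the arch_get_address_range() hook added below:

enum mmap_allocation_direction { ALLOC_UP, ALLOC_DOWN };

#define DEMO_UNMAPPED_BASE   0x40000000UL  /* stand-in for TASK_UNMAPPED_BASE */
#define DEMO_TASK_SIZE       0x80000000UL  /* stand-in for TASK_SIZE */
#define DEMO_MMAP_BASE       0x7f000000UL  /* stand-in for mm->mmap_base */
#define DEMO_MAP_RESTRICTED  0x4000        /* stand-in for an arch MAP_ flag */

/* Same shape as arch_get_address_range(): clamp the window the generic
   search is allowed to use, depending on flags and search direction. */
void demo_get_address_range(unsigned long flags, unsigned long *begin,
                            unsigned long *end,
                            enum mmap_allocation_direction direction)
{
        if (flags & DEMO_MAP_RESTRICTED) {
                /* a restricted window, as MAP_32BIT gives on x86-64 */
                *begin = 0x100000;
                *end   = 0x40000000;
        } else if (direction == ALLOC_UP) {
                *begin = DEMO_UNMAPPED_BASE;
                *end   = DEMO_TASK_SIZE;
        } else {  /* ALLOC_DOWN */
                *begin = 0;
                *end   = DEMO_MMAP_BASE;
        }
}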

Signed-off-by: Rik van Riel <[email protected]>
---
arch/mips/mm/mmap.c | 8 ++----
arch/x86/include/asm/pgtable_64.h | 1 +
arch/x86/kernel/sys_x86_64.c | 11 ++++++---
include/linux/sched.h | 7 ++++++
mm/mmap.c | 38 ++++++++++++++++++++++++++++++++++--
5 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 302d779..3f8af17 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -61,8 +61,6 @@ static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
((((addr) + shm_align_mask) & ~shm_align_mask) + \
(((pgoff) << PAGE_SHIFT) & shm_align_mask))

-enum mmap_allocation_direction {UP, DOWN};
-
static unsigned long arch_get_unmapped_area_common(struct file *filp,
unsigned long addr0, unsigned long len, unsigned long pgoff,
unsigned long flags, enum mmap_allocation_direction dir)
@@ -107,7 +105,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
return addr;
}

- if (dir == UP) {
+ if (dir == ALLOC_UP) {
addr = mm->mmap_base;
if (do_color_align)
addr = COLOUR_ALIGN(addr, pgoff);
@@ -204,7 +202,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
return arch_get_unmapped_area_common(filp,
- addr0, len, pgoff, flags, UP);
+ addr0, len, pgoff, flags, ALLOC_UP);
}

/*
@@ -216,7 +214,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
unsigned long flags)
{
return arch_get_unmapped_area_common(filp,
- addr0, len, pgoff, flags, DOWN);
+ addr0, len, pgoff, flags, ALLOC_DOWN);
}

void arch_pick_mmap_layout(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709..8af36f6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -169,6 +169,7 @@ extern void cleanup_highmap(void);

#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+#define HAVE_ARCH_GET_ADDRESS_RANGE

#define pgtable_cache_init() do { } while (0)
#define check_pgt_cache() do { } while (0)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index b4d3c39..2595a5e 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -95,8 +95,8 @@ out:
return error;
}

-static void find_start_end(unsigned long flags, unsigned long *begin,
- unsigned long *end)
+void arch_get_address_range(unsigned long flags, unsigned long *begin,
+ unsigned long *end, enum mmap_allocation_direction direction)
{
if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
unsigned long new_begin;
@@ -114,9 +114,12 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
if (new_begin)
*begin = new_begin;
}
- } else {
+ } else if (direction == ALLOC_UP) {
*begin = TASK_UNMAPPED_BASE;
*end = TASK_SIZE;
+ } else /* direction == ALLOC_DOWN */ {
+ *begin = 0;
+ *end = current->mm->mmap_base;
}
}

@@ -132,7 +135,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (flags & MAP_FIXED)
return addr;

- find_start_end(flags, &begin, &end);
+ arch_get_address_range(flags, &begin, &end, ALLOC_UP);

if (len > end)
return -ENOMEM;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4059c0f..fc76318 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -388,7 +388,14 @@ extern int sysctl_max_map_count;
#include <linux/aio.h>

#ifdef CONFIG_MMU
+enum mmap_allocation_direction {
+ ALLOC_UP,
+ ALLOC_DOWN
+};
extern void arch_pick_mmap_layout(struct mm_struct *mm);
+extern void
+arch_get_address_range(unsigned long flags, unsigned long *begin,
+ unsigned long *end, enum mmap_allocation_direction direction);
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
diff --git a/mm/mmap.c b/mm/mmap.c
index 40c848e..92cf0bf 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1465,6 +1465,20 @@ unacct_error:
return error;
}

+#ifndef HAVE_ARCH_GET_ADDRESS_RANGE
+void arch_get_address_range(unsigned long flags, unsigned long *begin,
+ unsigned long *end, enum mmap_allocation_direction direction)
+{
+ if (direction == ALLOC_UP) {
+ *begin = TASK_UNMAPPED_BASE;
+ *end = TASK_SIZE;
+ } else /* direction == ALLOC_DOWN */ {
+ *begin = 0;
+ *end = current->mm->mmap_base;
+ }
+}
+#endif
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -1499,7 +1513,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct rb_node *rb_node;
- unsigned long lower_limit = TASK_UNMAPPED_BASE;
+ unsigned long lower_limit, upper_limit;
+
+ arch_get_address_range(flags, &lower_limit, &upper_limit, ALLOC_UP);

if (len > TASK_SIZE)
return -ENOMEM;
@@ -1546,6 +1562,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
continue;
}

+ /* We have gone too far right, and can not go left. */
+ if (vma->vm_end + len > upper_limit) {
+ if (!addr)
+ return -ENOMEM;
+ goto found_addr;
+ }
+
if (!found_here && node_free_hole(rb_node->rb_right) >= len) {
/* Last known hole is to the right of this subtree. */
rb_node = rb_node->rb_right;
@@ -1625,7 +1648,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
struct rb_node *rb_node = NULL;
- unsigned long upper_limit = mm->mmap_base;
+ unsigned long lower_limit, upper_limit;
+
+ arch_get_address_range(flags, &lower_limit, &upper_limit, ALLOC_DOWN);

/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -1644,7 +1669,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}

/* requested length too big; prevent integer underflow below */
- if (len > upper_limit)
+ if (len > upper_limit - lower_limit)
return -ENOMEM;

/*
@@ -1681,6 +1706,13 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
}

+ /* We have gone too far left, and can not go right. */
+ if (vma->vm_start < lower_limit + len) {
+ if (!addr)
+ return -ENOMEM;
+ goto found_addr;
+ }
+
if (!found_here && node_free_hole(rb_node->rb_left) >= len) {
/* Last known hole is to the left of this subtree. */
rb_node = rb_node->rb_left;
--
1.7.7.6

2012-06-18 22:06:11

by Rik van Riel

Subject: [PATCH -mm 4/7] mm: make page colouring code generic

From: Rik van Riel <[email protected]>

Fix the x86-64 page colouring code to take pgoff into account.
Use the x86 and MIPS page colouring code as the basis for a generic
page colouring function.

Teach the generic arch_get_unmapped_area(_topdown) code to call the
page colouring code.

Make sure that ALIGN_DOWN always aligns down, and ends up at the
right page colour.
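
The colouring arithmetic itself is small. Here is a user-space sketch
of the intent (the DEMO_* constants are made up, and the align-down
helper shows one way to get the "always align down, keep the colour"
behaviour described above, rather than being a verbatim copy of the
patch):

#include <stdio.h>

#define DEMO_PAGE_SHIFT  12
#define DEMO_SHMLBA      0x4000UL            /* pretend colour period (16K) */
#define DEMO_MASK        (DEMO_SHMLBA - 1)

/* Align up to the colour dictated by pgoff (same idea as COLOUR_ALIGN()). */
static unsigned long colour_align_up(unsigned long addr, unsigned long pgoff)
{
        unsigned long colour = (pgoff << DEMO_PAGE_SHIFT) & DEMO_MASK;

        return ((addr + DEMO_MASK) & ~DEMO_MASK) + colour;
}

/* Align down to the same colour without ever ending up above addr.
   (Underflow handling for very low addresses is omitted here.) */
static unsigned long colour_align_down(unsigned long addr, unsigned long pgoff)
{
        unsigned long colour = (pgoff << DEMO_PAGE_SHIFT) & DEMO_MASK;
        unsigned long aligned = (addr & ~DEMO_MASK) + colour;

        if (aligned > addr)
                aligned -= DEMO_SHMLBA;      /* step back one colour period */
        return aligned;
}

int main(void)
{
        unsigned long addr = 0x12345000, pgoff = 3;

        printf("up:   %#lx\n", colour_align_up(addr, pgoff));    /* 0x1234b000 */
        printf("down: %#lx\n", colour_align_down(addr, pgoff));  /* 0x12343000 */
        return 0;
}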

Signed-off-by: Rik van Riel <[email protected]>
---
arch/mips/include/asm/page.h | 2 -
arch/mips/include/asm/pgtable.h | 1 +
arch/x86/include/asm/elf.h | 3 -
arch/x86/include/asm/pgtable_64.h | 1 +
arch/x86/kernel/sys_x86_64.c | 35 +++++++++-----
arch/x86/vdso/vma.c | 2 +-
include/linux/sched.h | 8 +++-
mm/mmap.c | 91 ++++++++++++++++++++++++++++++++-----
8 files changed, 111 insertions(+), 32 deletions(-)

diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h
index da9bd7d..459cc25 100644
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -63,8 +63,6 @@ extern void build_copy_page(void);
extern void clear_page(void * page);
extern void copy_page(void * to, void * from);

-extern unsigned long shm_align_mask;
-
static inline unsigned long pages_do_alias(unsigned long addr1,
unsigned long addr2)
{
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index b2202a6..f133a4c 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -415,6 +415,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
*/
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+#define HAVE_ARCH_ALIGN_ADDR

/*
* No page table caches to initialise
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 5939f44..dc2d0bf 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -358,8 +358,6 @@ static inline int mmap_is_ia32(void)
enum align_flags {
ALIGN_VA_32 = BIT(0),
ALIGN_VA_64 = BIT(1),
- ALIGN_VDSO = BIT(2),
- ALIGN_TOPDOWN = BIT(3),
};

struct va_alignment {
@@ -368,5 +366,4 @@ struct va_alignment {
} ____cacheline_aligned;

extern struct va_alignment va_align;
-extern unsigned long align_addr(unsigned long, struct file *, enum align_flags);
#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 8af36f6..8408ccd 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -170,6 +170,7 @@ extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
#define HAVE_ARCH_GET_ADDRESS_RANGE
+#define HAVE_ARCH_ALIGN_ADDR

#define pgtable_cache_init() do { } while (0)
#define check_pgt_cache() do { } while (0)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 2595a5e..ac0afb8 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -25,31 +25,40 @@
* @flags denotes the allocation direction - bottomup or topdown -
* or vDSO; see call sites below.
*/
-unsigned long align_addr(unsigned long addr, struct file *filp,
- enum align_flags flags)
+unsigned long arch_align_addr(unsigned long addr, struct file *filp,
+ unsigned long pgoff, unsigned long flags,
+ enum mmap_allocation_direction direction)
{
- unsigned long tmp_addr;
+ unsigned long tmp_addr = PAGE_ALIGN(addr);

/* handle 32- and 64-bit case with a single conditional */
if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
- return addr;
+ return tmp_addr;

- if (!(current->flags & PF_RANDOMIZE))
- return addr;
+ /* Always allow MAP_FIXED. Colouring is a performance thing only. */
+ if (flags & MAP_FIXED)
+ return tmp_addr;

- if (!((flags & ALIGN_VDSO) || filp))
- return addr;
+ if (!(current->flags & PF_RANDOMIZE))
+ return tmp_addr;

- tmp_addr = addr;
+ if (!(filp || direction == ALLOC_VDSO))
+ return tmp_addr;

/*
* We need an address which is <= than the original
* one only when in topdown direction.
*/
- if (!(flags & ALIGN_TOPDOWN))
+ if (direction == ALLOC_UP)
tmp_addr += va_align.mask;

tmp_addr &= ~va_align.mask;
+ tmp_addr += ((pgoff << PAGE_SHIFT) & va_align.mask);
+
+ if (direction == ALLOC_DOWN && tmp_addr > addr) {
+ tmp_addr -= va_align.mask;
+ tmp_addr &= ~va_align.mask;
+ }

return tmp_addr;
}
@@ -159,7 +168,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,

full_search:

- addr = align_addr(addr, filp, 0);
+ addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);

for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
@@ -186,7 +195,7 @@ full_search:
mm->cached_hole_size = vma->vm_start - addr;

addr = vma->vm_end;
- addr = align_addr(addr, filp, 0);
+ addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
}
}

@@ -235,7 +244,7 @@ try_again:

addr -= len;
do {
- addr = align_addr(addr, filp, ALIGN_TOPDOWN);
+ addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);

/*
* Lookup failure means no vma is above this address,
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 00aaf04..83e0355 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
* unaligned here as a result of stack start randomization.
*/
addr = PAGE_ALIGN(addr);
- addr = align_addr(addr, NULL, ALIGN_VDSO);
+ addr = arch_align_addr(addr, NULL, 0, 0, ALLOC_VDSO);

return addr;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc76318..18f9326 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -390,12 +390,18 @@ extern int sysctl_max_map_count;
#ifdef CONFIG_MMU
enum mmap_allocation_direction {
ALLOC_UP,
- ALLOC_DOWN
+ ALLOC_DOWN,
+ ALLOC_VDSO,
};
extern void arch_pick_mmap_layout(struct mm_struct *mm);
extern void
arch_get_address_range(unsigned long flags, unsigned long *begin,
unsigned long *end, enum mmap_allocation_direction direction);
+extern unsigned long shm_align_mask;
+extern unsigned long
+arch_align_addr(unsigned long addr, struct file *filp,
+ unsigned long pgoff, unsigned long flags,
+ enum mmap_allocation_direction direction);
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
diff --git a/mm/mmap.c b/mm/mmap.c
index 92cf0bf..0314cb1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1465,6 +1465,51 @@ unacct_error:
return error;
}

+#ifndef HAVE_ARCH_ALIGN_ADDR
+/* Each architecture is responsible for setting this to the required value. */
+unsigned long shm_align_mask = PAGE_SIZE - 1;
+EXPORT_SYMBOL(shm_align_mask);
+
+unsigned long arch_align_addr(unsigned long addr, struct file *filp,
+ unsigned long pgoff, unsigned long flags,
+ enum mmap_allocation_direction direction)
+{
+ unsigned long tmp_addr = PAGE_ALIGN(addr);
+
+ if (shm_align_mask <= PAGE_SIZE)
+ return tmp_addr;
+
+ /* Allow MAP_FIXED without MAP_SHARED at any address. */
+ if ((flags & (MAP_FIXED|MAP_SHARED)) == MAP_FIXED)
+ return tmp_addr;
+
+ /* Enforce page colouring for any file or MAP_SHARED mapping. */
+ if (!(filp || (flags & MAP_SHARED)))
+ return tmp_addr;
+
+ /*
+ * We need an address which is <= than the original
+ * one only when in topdown direction.
+ */
+ if (direction == ALLOC_UP)
+ tmp_addr += shm_align_mask;
+
+ tmp_addr &= ~shm_align_mask;
+ tmp_addr += ((pgoff << PAGE_SHIFT) & shm_align_mask);
+
+ /*
+ * When aligning down, make sure we did not accidentally go up.
+ * The caller will check for underflow.
+ */
+ if (direction == ALLOC_DOWN && tmp_addr > addr) {
+ tmp_addr -= shm_align_mask;
+ tmp_addr &= ~shm_align_mask;
+ }
+
+ return tmp_addr;
+}
+#endif
+
#ifndef HAVE_ARCH_GET_ADDRESS_RANGE
void arch_get_address_range(unsigned long flags, unsigned long *begin,
unsigned long *end, enum mmap_allocation_direction direction)
@@ -1513,18 +1558,22 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct rb_node *rb_node;
- unsigned long lower_limit, upper_limit;
+ unsigned long lower_limit, upper_limit, tmp_addr;

arch_get_address_range(flags, &lower_limit, &upper_limit, ALLOC_UP);

if (len > TASK_SIZE)
return -ENOMEM;

- if (flags & MAP_FIXED)
+ if (flags & MAP_FIXED) {
+ tmp_addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
+ if (tmp_addr != PAGE_ALIGN(addr))
+ return -EINVAL;
return addr;
+ }

if (addr) {
- addr = PAGE_ALIGN(addr);
+ addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
@@ -1533,7 +1582,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,

/* Find the left-most free area of sufficient size. */
for (addr = 0, rb_node = mm->mm_rb.rb_node; rb_node; ) {
- unsigned long vma_start;
+ unsigned long vma_start, tmp_addr;
int found_here = 0;

vma = rb_to_vma(rb_node);
@@ -1541,13 +1590,17 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (vma->vm_start > len) {
if (!vma->vm_prev) {
/* This is the left-most VMA. */
- if (vma->vm_start - len >= lower_limit) {
- addr = lower_limit;
+ tmp_addr = arch_align_addr(lower_limit, filp,
+ pgoff, flags, ALLOC_UP);
+ if (vma->vm_start - len >= tmp_addr) {
+ addr = tmp_addr;
goto found_addr;
}
} else {
/* Is this hole large enough? Remember it. */
vma_start = max(vma->vm_prev->vm_end, lower_limit);
+ vma_start = arch_align_addr(vma_start, filp,
+ pgoff, flags, ALLOC_UP);
if (vma->vm_start - len >= vma_start) {
addr = vma_start;
found_here = 1;
@@ -1599,6 +1652,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr < lower_limit)
addr = lower_limit;

+ addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
+
found_addr:
if (TASK_SIZE - len < addr)
return -ENOMEM;
@@ -1656,12 +1711,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (len > TASK_SIZE)
return -ENOMEM;

- if (flags & MAP_FIXED)
+ if (flags & MAP_FIXED) {
+ unsigned long tmp_addr;
+ tmp_addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);
+ if (tmp_addr != PAGE_ALIGN(addr))
+ return -EINVAL;
return addr;
+ }

/* requesting a specific address */
if (addr) {
- addr = PAGE_ALIGN(addr);
+ addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
@@ -1678,7 +1738,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
*/
if (upper_limit - len > mm->highest_vma) {
addr = upper_limit - len;
- goto found_addr;
+ addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);
+ if (addr > mm->highest_vma)
+ goto found_addr;
}

/* Find the right-most free area of sufficient size. */
@@ -1691,9 +1753,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* Is this hole large enough? Remember it. */
vma_start = min(vma->vm_start, upper_limit);
if (vma_start > len) {
- if (!vma->vm_prev ||
- (vma_start - len >= vma->vm_prev->vm_end)) {
- addr = vma_start - len;
+ unsigned long tmp_addr = vma_start - len;
+ tmp_addr = arch_align_addr(tmp_addr, filp,
+ pgoff, flags, ALLOC_DOWN);
+ /* No underflow? Does it still fit the hole? */
+ if (tmp_addr && tmp_addr <= vma_start - len &&
+ (!vma->vm_prev ||
+ tmp_addr >= vma->vm_prev->vm_end)) {
+ addr = tmp_addr;
found_here = 1;
}
}
--
1.7.7.6

2012-06-18 22:06:16

by Rik van Riel

Subject: [PATCH -mm 2/7] mm: get unmapped area from VMA tree

From: Rik van Riel <[email protected]>

Change the generic implementations of arch_get_unmapped_area(_topdown)
to use the free space info in the VMA rbtree. This makes it possible
to find free address space in O(log(N)) complexity.

For bottom-up allocations, we pick the lowest hole that is large
enough for our allocation. For topdown allocations, we pick the
highest hole of sufficient size.
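
The bottom-up case then boils down to a pruned tree walk. Below is a
simplified recursive user-space model (illustrative only; the kernel
code in this patch is iterative and also honours the lower and upper
address limits):

/* One node: a mapping plus the largest free gap in its subtree
   (free_gap is the field added in patch 1/7; the rest is simplified). */
struct demo_vma {
        unsigned long vm_start, vm_end;
        unsigned long prev_end;   /* end of the mapping just below this one */
        unsigned long free_gap;   /* largest free gap anywhere in this subtree */
        struct demo_vma *left, *right;
};

/* Return the start of the lowest free gap of at least len bytes, or 0. */
unsigned long lowest_hole(struct demo_vma *v, unsigned long len)
{
        unsigned long addr;

        if (!v || v->free_gap < len)
                return 0;                      /* nothing big enough below */

        addr = lowest_hole(v->left, len);      /* prefer lower addresses */
        if (addr)
                return addr;

        if (v->vm_start - v->prev_end >= len)  /* the gap just below this VMA */
                return v->prev_end;

        return lowest_hole(v->right, len);
}

Each step either rejects a subtree in O(1) using free_gap or descends
into one that is guaranteed to contain a large enough hole, so the
search visits O(log(N)) nodes on a balanced tree; the topdown case
mirrors this from the right-hand side.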

For topdown allocations, we need to keep track of the highest
mapped VMA address, because it might be below mm->mmap_base,
and we only keep track of free space to the left of each VMA
in the VMA tree. It is tempting to try and keep track of
the free space to the right of each VMA when running in
topdown mode, but that gets us into trouble when running on
x86, where a process can switch direction in the middle of
execve.

We have to leave the mm->free_area_cache and mm->largest_hole_size
in place for now, because the architecture specific versions still
use those.

Signed-off-by: Rik van Riel <[email protected]>
---
include/linux/mm_types.h | 1 +
mm/mmap.c | 270 +++++++++++++++++++++++++++++++---------------
2 files changed, 184 insertions(+), 87 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bf56d66..8ccb4e1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -307,6 +307,7 @@ struct mm_struct {
unsigned long task_size; /* size of task vm space */
unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
+ unsigned long highest_vma; /* highest vma end address */
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
diff --git a/mm/mmap.c b/mm/mmap.c
index 1963ef9..40c848e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -4,6 +4,7 @@
* Written by obz.
*
* Address space accounting code <[email protected]>
+ * Rbtree get_unmapped_area Copyright (C) 2012 Rik van Riel
*/

#include <linux/slab.h>
@@ -250,6 +251,17 @@ static void adjust_free_gap(struct vm_area_struct *vma)
rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
}

+static unsigned long node_free_hole(struct rb_node *node)
+{
+ struct vm_area_struct *vma;
+
+ if (!node)
+ return 0;
+
+ vma = container_of(node, struct vm_area_struct, vm_rb);
+ return vma->free_gap;
+}
+
/*
* Unlink a file-based vm structure from its prio_tree, to hide
* vma from rmap and vmtruncate before freeing its page tables.
@@ -386,12 +398,16 @@ void validate_mm(struct mm_struct *mm)
int bug = 0;
int i = 0;
struct vm_area_struct *tmp = mm->mmap;
+ unsigned long highest_address = 0;
while (tmp) {
if (tmp->free_gap != max_free_space(&tmp->vm_rb))
printk("free space %lx, correct %lx\n", tmp->free_gap, max_free_space(&tmp->vm_rb)), bug = 1;
+ highest_address = tmp->vm_end;
tmp = tmp->vm_next;
i++;
}
+ if (highest_address != mm->highest_vma)
+ printk("mm->highest_vma %lx, found %lx\n", mm->highest_vma, highest_address), bug = 1;
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
i = browse_rb(&mm->mm_rb);
@@ -449,6 +465,9 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
/* Propagate the new free gap between next and us up the tree. */
if (vma->vm_next)
adjust_free_gap(vma->vm_next);
+ else
+ /* This is the VMA with the highest address. */
+ mm->highest_vma = vma->vm_end;
}

static void __vma_link_file(struct vm_area_struct *vma)
@@ -648,6 +667,8 @@ again: remove_next = 1 + (end > next->vm_end);
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
+ if (!next)
+ mm->highest_vma = end;
if (adjust_next) {
next->vm_start += adjust_next << PAGE_SHIFT;
next->vm_pgoff += adjust_next;
@@ -1456,13 +1477,29 @@ unacct_error:
* This function "knows" that -ENOMEM has the bits set.
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA
+struct rb_node *continue_next_right(struct rb_node *node)
+{
+ struct rb_node *prev;
+
+ while ((prev = node) && (node = rb_parent(node))) {
+ if (prev == node->rb_right)
+ continue;
+
+ if (node->rb_right)
+ return node->rb_right;
+ }
+
+ return NULL;
+}
+
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long start_addr;
+ struct vm_area_struct *vma = NULL;
+ struct rb_node *rb_node;
+ unsigned long lower_limit = TASK_UNMAPPED_BASE;

if (len > TASK_SIZE)
return -ENOMEM;
@@ -1477,40 +1514,76 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
- if (len > mm->cached_hole_size) {
- start_addr = addr = mm->free_area_cache;
- } else {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }

-full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- addr = TASK_UNMAPPED_BASE;
- start_addr = addr;
- mm->cached_hole_size = 0;
- goto full_search;
+ /* Find the left-most free area of sufficient size. */
+ for (addr = 0, rb_node = mm->mm_rb.rb_node; rb_node; ) {
+ unsigned long vma_start;
+ int found_here = 0;
+
+ vma = rb_to_vma(rb_node);
+
+ if (vma->vm_start > len) {
+ if (!vma->vm_prev) {
+ /* This is the left-most VMA. */
+ if (vma->vm_start - len >= lower_limit) {
+ addr = lower_limit;
+ goto found_addr;
+ }
+ } else {
+ /* Is this hole large enough? Remember it. */
+ vma_start = max(vma->vm_prev->vm_end, lower_limit);
+ if (vma->vm_start - len >= vma_start) {
+ addr = vma_start;
+ found_here = 1;
+ }
}
- return -ENOMEM;
}
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
+
+ /* Go left if it looks promising. */
+ if (node_free_hole(rb_node->rb_left) >= len &&
+ vma->vm_start - len >= lower_limit) {
+ rb_node = rb_node->rb_left;
+ continue;
}
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
+
+ if (!found_here && node_free_hole(rb_node->rb_right) >= len) {
+ /* Last known hole is to the right of this subtree. */
+ rb_node = rb_node->rb_right;
+ continue;
+ } else if (!addr) {
+ rb_node = continue_next_right(rb_node);
+ continue;
+ }
+
+ /* This is the left-most hole. */
+ goto found_addr;
}
+
+ /*
+ * There is not enough space to the left of any VMA.
+ * Check the far right-hand side of the VMA tree.
+ */
+ rb_node = mm->mm_rb.rb_node;
+ while (rb_node->rb_right)
+ rb_node = rb_node->rb_right;
+ vma = rb_to_vma(rb_node);
+ addr = vma->vm_end;
+
+ /*
+ * The right-most VMA ends below the lower limit. Can only happen
+ * if a binary personality loads the stack below the executable.
+ */
+ if (addr < lower_limit)
+ addr = lower_limit;
+
+ found_addr:
+ if (TASK_SIZE - len < addr)
+ return -ENOMEM;
+
+ /* This "free area" was not really free. Tree corrupted? */
+ VM_BUG_ON(find_vma_intersection(mm, addr, addr+len));
+
+ return addr;
}
#endif

@@ -1528,14 +1601,31 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
* stack's low limit (the base):
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+struct rb_node *continue_next_left(struct rb_node *node)
+{
+ struct rb_node *prev;
+
+ while ((prev = node) && (node = rb_parent(node))) {
+ if (prev == node->rb_left)
+ continue;
+
+ if (node->rb_left)
+ return node->rb_left;
+ }
+
+ return NULL;
+}
+
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
const unsigned long flags)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = NULL;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0, start_addr;
+ unsigned long addr = addr0;
+ struct rb_node *rb_node = NULL;
+ unsigned long upper_limit = mm->mmap_base;

/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -1553,68 +1643,65 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}

- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
+ /* requested length too big; prevent integer underflow below */
+ if (len > upper_limit)
+ return -ENOMEM;

-try_again:
- /* either no address requested or can't fit in requested address hole */
- start_addr = addr = mm->free_area_cache;
+ /*
+ * Does the highest VMA end far enough below the upper limit
+ * of our search space?
+ */
+ if (upper_limit - len > mm->highest_vma) {
+ addr = upper_limit - len;
+ goto found_addr;
+ }

- if (addr < len)
- goto fail;
+ /* Find the right-most free area of sufficient size. */
+ for (addr = 0, rb_node = mm->mm_rb.rb_node; rb_node; ) {
+ unsigned long vma_start;
+ int found_here = 0;

- addr -= len;
- do {
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
+ vma = container_of(rb_node, struct vm_area_struct, vm_rb);

- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
+ /* Is this hole large enough? Remember it. */
+ vma_start = min(vma->vm_start, upper_limit);
+ if (vma_start > len) {
+ if (!vma->vm_prev ||
+ (vma_start - len >= vma->vm_prev->vm_end)) {
+ addr = vma_start - len;
+ found_here = 1;
+ }
+ }

- /* try just below the current vma->vm_start */
- addr = vma->vm_start-len;
- } while (len < vma->vm_start);
+ /* Go right if it looks promising. */
+ if (node_free_hole(rb_node->rb_right) >= len) {
+ if (upper_limit - len > vma->vm_end) {
+ rb_node = rb_node->rb_right;
+ continue;
+ }
+ }

-fail:
- /*
- * if hint left us with no space for the requested
- * mapping then try again:
- *
- * Note: this is different with the case of bottomup
- * which does the fully line-search, but we use find_vma
- * here that causes some holes skipped.
- */
- if (start_addr != mm->mmap_base) {
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = 0;
- goto try_again;
+ if (!found_here && node_free_hole(rb_node->rb_left) >= len) {
+ /* Last known hole is to the left of this subtree. */
+ rb_node = rb_node->rb_left;
+ continue;
+ } else if (!addr) {
+ rb_node = continue_next_left(rb_node);
+ continue;
+ }
+
+ /* This is the right-most hole. */
+ goto found_addr;
}

- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
+ return -ENOMEM;
+
+ found_addr:
+ if (TASK_SIZE - len < addr)
+ return -ENOMEM;
+
+ /* This "free area" was not really free. Tree corrupted? */
+ VM_BUG_ON(find_vma_intersection(mm, addr, addr+len));

return addr;
}
@@ -1828,6 +1915,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
vma->vm_end = address;
if (vma->vm_next)
adjust_free_gap(vma->vm_next);
+ if (!vma->vm_next)
+ vma->vm_mm->highest_vma = vma->vm_end;
perf_event_mmap(vma);
}
}
@@ -2013,6 +2102,13 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
*insertion_point = vma;
if (vma)
vma->vm_prev = prev;
+ else {
+ /* We just unmapped the highest VMA. */
+ if (prev)
+ mm->highest_vma = prev->vm_end;
+ else
+ mm->highest_vma = 0;
+ }
if (vma)
rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
tail_vma->vm_next = NULL;
--
1.7.7.6

2012-06-18 22:06:55

by Rik van Riel

Subject: [PATCH -mm 1/7] mm: track free size between VMAs in VMA rbtree

From: Rik van Riel <[email protected]>

Track the size of free areas between VMAs in the VMA rbtree.

This will allow get_unmapped_area_* to find a free area of the
right size in O(log(N)) time, instead of potentially having to
do a linear walk across all the VMAs.

Signed-off-by: Rik van Riel <[email protected]>
---
include/linux/mm_types.h | 7 ++++
mm/internal.h | 5 +++
mm/mmap.c | 76 +++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 87 insertions(+), 1 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index dad95bd..bf56d66 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -213,6 +213,13 @@ struct vm_area_struct {
struct rb_node vm_rb;

/*
+ * Largest free memory gap "behind" this VMA (in the direction mmap
+ * grows from), or of VMAs down the rb tree below us. This helps
+ * get_unmapped_area find a free area of the right size.
+ */
+ unsigned long free_gap;
+
+ /*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap prio tree, or
* linkage to the list of like vmas hanging off its node, or
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fb..f59f97a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -159,6 +159,11 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
}

+static inline struct vm_area_struct *rb_to_vma(struct rb_node *node)
+{
+ return container_of(node, struct vm_area_struct, vm_rb);
+}
+
/*
* Called only in fault path via page_evictable() for a new page
* to determine if it's being mapped into a LOCKED vma.
diff --git a/mm/mmap.c b/mm/mmap.c
index 3edfcdf..1963ef9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -205,6 +205,51 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
flush_dcache_mmap_unlock(mapping);
}

+static unsigned long max_free_space(struct rb_node *node)
+{
+ struct vm_area_struct *vma, *prev, *left = NULL, *right = NULL;
+ unsigned long largest = 0;
+
+ if (node->rb_left)
+ left = rb_to_vma(node->rb_left);
+ if (node->rb_right)
+ right = rb_to_vma(node->rb_right);
+
+ /*
+ * Calculate the free gap size between us and the
+ * VMA to our left.
+ */
+ vma = rb_to_vma(node);
+ prev = vma->vm_prev;
+
+ if (prev)
+ largest = vma->vm_start - prev->vm_end;
+ else
+ largest = vma->vm_start;
+
+ /* We propagate the largest of our own, or our children's free gaps. */
+ if (left)
+ largest = max(largest, left->free_gap);
+ if (right)
+ largest = max(largest, right->free_gap);
+
+ return largest;
+}
+
+static void vma_rb_augment_cb(struct rb_node *node, void *__unused)
+{
+ struct vm_area_struct *vma;
+
+ vma = rb_to_vma(node);
+
+ vma->free_gap = max_free_space(node);
+}
+
+static void adjust_free_gap(struct vm_area_struct *vma)
+{
+ rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
+}
+
/*
* Unlink a file-based vm structure from its prio_tree, to hide
* vma from rmap and vmtruncate before freeing its page tables.
@@ -342,6 +387,8 @@ void validate_mm(struct mm_struct *mm)
int i = 0;
struct vm_area_struct *tmp = mm->mmap;
while (tmp) {
+ if (tmp->free_gap != max_free_space(&tmp->vm_rb))
+ printk("free space %lx, correct %lx\n", tmp->free_gap, max_free_space(&tmp->vm_rb)), bug = 1;
tmp = tmp->vm_next;
i++;
}
@@ -398,6 +445,10 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
{
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+ adjust_free_gap(vma);
+ /* Propagate the new free gap between next and us up the tree. */
+ if (vma->vm_next)
+ adjust_free_gap(vma->vm_next);
}

static void __vma_link_file(struct vm_area_struct *vma)
@@ -473,11 +524,17 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
struct vm_area_struct *next = vma->vm_next;
+ struct rb_node *deepest;

prev->vm_next = next;
- if (next)
+ if (next) {
next->vm_prev = prev;
+ adjust_free_gap(next);
+ }
+ deepest = rb_augment_erase_begin(&vma->vm_rb);
rb_erase(&vma->vm_rb, &mm->mm_rb);
+ rb_augment_erase_end(deepest, vma_rb_augment_cb, NULL);
+
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
}
@@ -657,6 +714,15 @@ again: remove_next = 1 + (end > next->vm_end);
if (insert && file)
uprobe_mmap(insert);

+ /* Adjust the rb tree for changes in the free gaps between VMAs. */
+ adjust_free_gap(vma);
+ if (insert)
+ adjust_free_gap(insert);
+ if (vma->vm_next && vma->vm_next != insert)
+ adjust_free_gap(vma->vm_next);
+ if (insert && insert->vm_next && insert->vm_next != vma)
+ adjust_free_gap(insert->vm_next);
+
validate_mm(mm);

return 0;
@@ -1760,6 +1826,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
error = acct_stack_growth(vma, size, grow);
if (!error) {
vma->vm_end = address;
+ if (vma->vm_next)
+ adjust_free_gap(vma->vm_next);
perf_event_mmap(vma);
}
}
@@ -1811,6 +1879,7 @@ int expand_downwards(struct vm_area_struct *vma,
if (!error) {
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ adjust_free_gap(vma);
perf_event_mmap(vma);
}
}
@@ -1933,7 +2002,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
+ struct rb_node *deepest;
+ deepest = rb_augment_erase_begin(&vma->vm_rb);
rb_erase(&vma->vm_rb, &mm->mm_rb);
+ rb_augment_erase_end(deepest, vma_rb_augment_cb, NULL);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
@@ -1941,6 +2013,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
*insertion_point = vma;
if (vma)
vma->vm_prev = prev;
+ if (vma)
+ rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
--
1.7.7.6

2012-06-18 22:06:53

by Rik van Riel

Subject: [PATCH -mm 5/7] mm: remove x86 arch_get_unmapped_area(_topdown)

From: Rik van Riel <[email protected]>

The generic arch_get_unmapped_area(_topdown) should now be able
to do everything x86 needs. Remove the x86 specific functions.

TODO: make the hugetlbfs arch_get_unmapped_area call the generic
code with proper alignment info.

Cc: Andi Kleen <[email protected]>
Signed-off-by: Rik van Riel <[email protected]>
---
arch/x86/include/asm/pgtable_64.h | 2 -
arch/x86/kernel/sys_x86_64.c | 162 -------------------------------------
2 files changed, 0 insertions(+), 164 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 8408ccd..0ff6500 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -167,8 +167,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);

-#define HAVE_ARCH_UNMAPPED_AREA
-#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
#define HAVE_ARCH_GET_ADDRESS_RANGE
#define HAVE_ARCH_ALIGN_ADDR

diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index ac0afb8..0243c58 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -131,165 +131,3 @@ void arch_get_address_range(unsigned long flags, unsigned long *begin,
*end = current->mm->mmap_base;
}
}
-
-unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long start_addr;
- unsigned long begin, end;
-
- if (flags & MAP_FIXED)
- return addr;
-
- arch_get_address_range(flags, &begin, &end, ALLOC_UP);
-
- if (len > end)
- return -ENOMEM;
-
- if (addr) {
- addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
- if (end - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
- return addr;
- }
- if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32))
- && len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = begin;
- }
- addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
- start_addr = addr;
-
-full_search:
-
- addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
-
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (end - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != begin) {
- start_addr = addr = begin;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- addr = vma->vm_end;
- addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
- }
-}
-
-
-unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
-{
- struct vm_area_struct *vma;
- struct mm_struct *mm = current->mm;
- unsigned long addr = addr0, start_addr;
-
- /* requested length too big for entire address space */
- if (len > TASK_SIZE)
- return -ENOMEM;
-
- if (flags & MAP_FIXED)
- return addr;
-
- /* for MAP_32BIT mappings we force the legact mmap base */
- if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
- goto bottomup;
-
- /* requesting a specific address */
- if (addr) {
- addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
- return addr;
- }
-
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
-try_again:
- /* either no address requested or can't fit in requested address hole */
- start_addr = addr = mm->free_area_cache;
-
- if (addr < len)
- goto fail;
-
- addr -= len;
- do {
- addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);
-
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return mm->free_area_cache = addr;
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start-len;
- } while (len < vma->vm_start);
-
-fail:
- /*
- * if hint left us with no space for the requested
- * mapping then try again:
- */
- if (start_addr != mm->mmap_base) {
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = 0;
- goto try_again;
- }
-
-bottomup:
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
-
- return addr;
-}
--
1.7.7.6

2012-06-18 22:06:52

by Rik van Riel

Subject: [PATCH -mm 7/7] remove ARM arch_get_unmapped_area functions

From: Rik van Riel <[email protected]>

Remove the ARM special variants of arch_get_unmapped_area since the
generic functions should now be able to handle everything.

Untested because I have no ARM hardware.

Cc: Russell King <[email protected]>
Signed-off-by: Rik van Riel <[email protected]>
---
arch/arm/include/asm/pgtable.h | 6 -
arch/arm/mm/init.c | 3 +
arch/arm/mm/mmap.c | 217 +---------------------------------------
3 files changed, 4 insertions(+), 222 deletions(-)

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index f66626d..6754183 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -296,12 +296,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
#include <asm-generic/pgtable.h>

/*
- * We provide our own arch_get_unmapped_area to cope with VIPT caches.
- */
-#define HAVE_ARCH_UNMAPPED_AREA
-#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-
-/*
* remap a physical page `pfn' of size `size' with page protection `prot'
* into virtual address `from'
*/
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index f54d592..534dd96 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -600,6 +600,9 @@ void __init mem_init(void)
extern u32 itcm_end;
#endif

+ /* Tell the page colouring code what we need. */
+ shm_align_mask = SHMLBA - 1;
+
max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;

/* this will put all unused low memory onto the freelists */
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index ce8cb19..2b1f881 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -11,21 +11,7 @@
#include <linux/random.h>
#include <asm/cachetype.h>

-static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
- unsigned long pgoff)
-{
- unsigned long base = addr & ~(SHMLBA-1);
- unsigned long off = (pgoff << PAGE_SHIFT) & (SHMLBA-1);
-
- if (base + off <= addr)
- return base + off;
-
- return base - off;
-}
-
-#define COLOUR_ALIGN(addr,pgoff) \
- ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \
- (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
+unsigned long shm_align_mask = SHMLBA - 1;

/* gap between mmap and stack */
#define MIN_GAP (128*1024*1024UL)
@@ -54,207 +40,6 @@ static unsigned long mmap_base(unsigned long rnd)
return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}

-/*
- * We need to ensure that shared mappings are correctly aligned to
- * avoid aliasing issues with VIPT caches. We need to ensure that
- * a specific page of an object is always mapped at a multiple of
- * SHMLBA bytes.
- *
- * We unconditionally provide this function for all cases, however
- * in the VIVT case, we optimise out the alignment rules.
- */
-unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long start_addr;
- int do_align = 0;
- int aliasing = cache_is_vipt_aliasing();
-
- /*
- * We only need to do colour alignment if either the I or D
- * caches alias.
- */
- if (aliasing)
- do_align = filp || (flags & MAP_SHARED);
-
- /*
- * We enforce the MAP_FIXED case.
- */
- if (flags & MAP_FIXED) {
- if (aliasing && flags & MAP_SHARED &&
- (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))
- return -EINVAL;
- return addr;
- }
-
- if (len > TASK_SIZE)
- return -ENOMEM;
-
- if (addr) {
- if (do_align)
- addr = COLOUR_ALIGN(addr, pgoff);
- else
- addr = PAGE_ALIGN(addr);
-
- vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
- return addr;
- }
- if (len > mm->cached_hole_size) {
- start_addr = addr = mm->free_area_cache;
- } else {
- start_addr = addr = mm->mmap_base;
- mm->cached_hole_size = 0;
- }
-
-full_search:
- if (do_align)
- addr = COLOUR_ALIGN(addr, pgoff);
- else
- addr = PAGE_ALIGN(addr);
-
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- if (do_align)
- addr = COLOUR_ALIGN(addr, pgoff);
- }
-}
-
-unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
-{
- struct vm_area_struct *vma;
- struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
- int do_align = 0;
- int aliasing = cache_is_vipt_aliasing();
-
- /*
- * We only need to do colour alignment if either the I or D
- * caches alias.
- */
- if (aliasing)
- do_align = filp || (flags & MAP_SHARED);
-
- /* requested length too big for entire address space */
- if (len > TASK_SIZE)
- return -ENOMEM;
-
- if (flags & MAP_FIXED) {
- if (aliasing && flags & MAP_SHARED &&
- (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))
- return -EINVAL;
- return addr;
- }
-
- /* requesting a specific address */
- if (addr) {
- if (do_align)
- addr = COLOUR_ALIGN(addr, pgoff);
- else
- addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
- return addr;
- }
-
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
- /* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
- if (do_align) {
- unsigned long base = COLOUR_ALIGN_DOWN(addr - len, pgoff);
- addr = base + len;
- }
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
- }
-
- if (mm->mmap_base < len)
- goto bottomup;
-
- addr = mm->mmap_base - len;
- if (do_align)
- addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-
- do {
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start - len;
- if (do_align)
- addr = COLOUR_ALIGN_DOWN(addr, pgoff);
- } while (len < vma->vm_start);
-
-bottomup:
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
-
- return addr;
-}
-
void arch_pick_mmap_layout(struct mm_struct *mm)
{
unsigned long random_factor = 0UL;
--
1.7.7.6

2012-06-18 22:06:51

by Rik van Riel

Subject: [PATCH -mm 6/7] remove MIPS arch_get_unmapped_area code

From: Rik van Riel <[email protected]>

Remove all the MIPS specific arch_get_unmapped_area(_topdown) and
page colouring code, now that the generic code should be able to
handle things.

Untested, because I do not have any MIPS systems.

Cc: Ralf Baechle <[email protected]>
Cc: [email protected]
Signed-off-by: Rik van Riel <[email protected]>
---
arch/mips/include/asm/pgtable.h | 8 --
arch/mips/mm/mmap.c | 175 ---------------------------------------
2 files changed, 0 insertions(+), 183 deletions(-)

diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index f133a4c..5f9c49a 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -410,14 +410,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
#endif

/*
- * We provide our own get_unmapped area to cope with the virtual aliasing
- * constraints placed on us by the cache architecture.
- */
-#define HAVE_ARCH_UNMAPPED_AREA
-#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#define HAVE_ARCH_ALIGN_ADDR
-
-/*
* No page table caches to initialise
*/
#define pgtable_cache_init() do { } while (0)
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 3f8af17..ac342bd 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -15,9 +15,6 @@
#include <linux/random.h>
#include <linux/sched.h>

-unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */
-EXPORT_SYMBOL(shm_align_mask);
-
/* gap between mmap and stack */
#define MIN_GAP (128*1024*1024UL)
#define MAX_GAP ((TASK_SIZE)/6*5)
@@ -45,178 +42,6 @@ static unsigned long mmap_base(unsigned long rnd)
return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}

-static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
- unsigned long pgoff)
-{
- unsigned long base = addr & ~shm_align_mask;
- unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
-
- if (base + off <= addr)
- return base + off;
-
- return base - off;
-}
-
-#define COLOUR_ALIGN(addr, pgoff) \
- ((((addr) + shm_align_mask) & ~shm_align_mask) + \
- (((pgoff) << PAGE_SHIFT) & shm_align_mask))
-
-static unsigned long arch_get_unmapped_area_common(struct file *filp,
- unsigned long addr0, unsigned long len, unsigned long pgoff,
- unsigned long flags, enum mmap_allocation_direction dir)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long addr = addr0;
- int do_color_align;
-
- if (unlikely(len > TASK_SIZE))
- return -ENOMEM;
-
- if (flags & MAP_FIXED) {
- /* Even MAP_FIXED mappings must reside within TASK_SIZE */
- if (TASK_SIZE - len < addr)
- return -EINVAL;
-
- /*
- * We do not accept a shared mapping if it would violate
- * cache aliasing constraints.
- */
- if ((flags & MAP_SHARED) &&
- ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask))
- return -EINVAL;
- return addr;
- }
-
- do_color_align = 0;
- if (filp || (flags & MAP_SHARED))
- do_color_align = 1;
-
- /* requesting a specific address */
- if (addr) {
- if (do_color_align)
- addr = COLOUR_ALIGN(addr, pgoff);
- else
- addr = PAGE_ALIGN(addr);
-
- vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
- return addr;
- }
-
- if (dir == ALLOC_UP) {
- addr = mm->mmap_base;
- if (do_color_align)
- addr = COLOUR_ALIGN(addr, pgoff);
- else
- addr = PAGE_ALIGN(addr);
-
- for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr)
- return -ENOMEM;
- if (!vma || addr + len <= vma->vm_start)
- return addr;
- addr = vma->vm_end;
- if (do_color_align)
- addr = COLOUR_ALIGN(addr, pgoff);
- }
- } else {
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
- /*
- * either no address requested, or the mapping can't fit into
- * the requested address hole
- */
- addr = mm->free_area_cache;
- if (do_color_align) {
- unsigned long base =
- COLOUR_ALIGN_DOWN(addr - len, pgoff);
- addr = base + len;
- }
-
- /* make sure it can fit in the remaining address space */
- if (likely(addr > len)) {
- vma = find_vma(mm, addr - len);
- if (!vma || addr <= vma->vm_start) {
- /* cache the address as a hint for next time */
- return mm->free_area_cache = addr - len;
- }
- }
-
- if (unlikely(mm->mmap_base < len))
- goto bottomup;
-
- addr = mm->mmap_base - len;
- if (do_color_align)
- addr = COLOUR_ALIGN_DOWN(addr, pgoff);
-
- do {
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (likely(!vma || addr + len <= vma->vm_start)) {
- /* cache the address as a hint for next time */
- return mm->free_area_cache = addr;
- }
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start - len;
- if (do_color_align)
- addr = COLOUR_ALIGN_DOWN(addr, pgoff);
- } while (likely(len < vma->vm_start));
-
-bottomup:
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
-
- return addr;
- }
-}
-
-unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- return arch_get_unmapped_area_common(filp,
- addr0, len, pgoff, flags, ALLOC_UP);
-}
-
-/*
- * There is no need to export this but sched.h declares the function as
- * extern so making it static here results in an error.
- */
-unsigned long arch_get_unmapped_area_topdown(struct file *filp,
- unsigned long addr0, unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- return arch_get_unmapped_area_common(filp,
- addr0, len, pgoff, flags, ALLOC_DOWN);
-}
-
void arch_pick_mmap_layout(struct mm_struct *mm)
{
unsigned long random_factor = 0UL;
--
1.7.7.6

2012-06-19 23:20:54

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -mm 0/7] mm: scalable and unified arch_get_unmapped_area

On Mon, 18 Jun 2012 18:05:19 -0400
Rik van Riel <[email protected]> wrote:

> [actually include all 7 patches]
>
> A long time ago, we decided to limit the number of VMAs per
> process to 64k. As it turns out, there actually are programs
> using tens of thousands of VMAs.
>
> The linear search in arch_get_unmapped_area and
> arch_get_unmapped_area_topdown can be a real issue for
> those programs.
>
> This patch series aims to fix the scalability issue by
> tracking the size of each free hole in the VMA rbtree,
> propagating the free hole info up the tree.
>
> Another major goal is to put the bulk of the necessary
> arch_get_unmapped_area(_topdown) functionality into one
> set of functions, so we can eliminate the custom large
> functions per architecture, sticking to a few much smaller
> architecture specific functions instead.
>
> In this version I have only gotten rid of the x86, ARM
> and MIPS arch-specific code, and am already showing a
> fairly promising diffstat:

Looking nice!

> Testing performance with a benchmark that allocates tens
> of thousands of VMAs, unmaps them and mmaps them some more
> in a loop, shows promising results.
>
> Vanilla 3.4 kernel:
> $ ./agua_frag_test_64
> ..........
>
> Min Time (ms): 6
> Avg. Time (ms): 294.0000
> Max Time (ms): 609
> Std Dev (ms): 113.1664
> Standard deviation exceeds 10
>
> With patches:
> $ ./agua_frag_test_64
> ..........
>
> Min Time (ms): 14
> Avg. Time (ms): 38.0000
> Max Time (ms): 60
> Std Dev (ms): 3.9312
> All checks pass
>
> The total run time of the test goes down by about a
> factor 4. More importantly, the worst case performance
> of the loop (which is what really hurt some applications)
> has gone down by about a factor 10.

OK, so you improved the bad case. But what was the impact on the
current good case? kernel compile, shell scripts, some app which
creates 20 vmas then sits in a loop doing munmap(mmap(...))? Try to
think of workloads which might take damage, and quantify that?
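
For concreteness, a minimal userspace sketch of that kind of "good case"
workload might look like this (the VMA count and iteration count here are
arbitrary, chosen only for illustration):

#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
        unsigned int i;
        char *map;

        /*
         * Roughly the small setup described above; adjacent anonymous
         * mappings may merge, so the exact vma count will vary.
         */
        for (i = 0; i < 20; i++)
                mmap(NULL, 1 << 12, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);

        /* the "good case": munmap(mmap(...)) in a tight loop */
        for (i = 0; i < (1 << 20); i++) {
                map = mmap(NULL, 1 << 12, PROT_READ,
                           MAP_PRIVATE | MAP_ANON, -1, 0);
                if (map == MAP_FAILED) {
                        perror("mmap");
                        return 1;
                }
                munmap(map, 1 << 12);
        }
        return 0;
}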

2012-06-19 23:25:49

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -mm 1/7] mm: track free size between VMAs in VMA rbtree

On Mon, 18 Jun 2012 18:05:20 -0400
Rik van Riel <[email protected]> wrote:

> From: Rik van Riel <[email protected]>
>
> Track the size of free areas between VMAs in the VMA rbtree.
>
> This will allow get_unmapped_area_* to find a free area of the
> right size in O(log(N)) time, instead of potentially having to
> do a linear walk across all the VMAs.
>
> ...
>
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -213,6 +213,13 @@ struct vm_area_struct {
> struct rb_node vm_rb;
>
> /*
> + * Largest free memory gap "behind" this VMA (in the direction mmap
> + * grows from), or of VMAs down the rb tree below us. This helps
> + * get_unmapped_area find a free area of the right size.
> + */
> + unsigned long free_gap;

Please mention the units? Seems to be "bytes", not "pages".

> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -205,6 +205,51 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> flush_dcache_mmap_unlock(mapping);
> }
>
> +static unsigned long max_free_space(struct rb_node *node)
> +{
> + struct vm_area_struct *vma, *prev, *left = NULL, *right = NULL;
> + unsigned long largest = 0;
> +
> + if (node->rb_left)
> + left = rb_to_vma(node->rb_left);
> + if (node->rb_right)
> + right = rb_to_vma(node->rb_right);
> +
> + /*
> + * Calculate the free gap size between us and the
> + * VMA to our left.
> + */

Comment will fit in a single line.

> + vma = rb_to_vma(node);
> + prev = vma->vm_prev;
> +
> + if (prev)
> + largest = vma->vm_start - prev->vm_end;
> + else
> + largest = vma->vm_start;
> +
> + /* We propagate the largest of our own, or our children's free gaps. */
> + if (left)
> + largest = max(largest, left->free_gap);
> + if (right)
> + largest = max(largest, right->free_gap);
> +
> + return largest;
> +}

This would be easier to review if it had a nice comment explaining its
role ;)

> +static void vma_rb_augment_cb(struct rb_node *node, void *__unused)
> +{
> + struct vm_area_struct *vma;
> +
> + vma = rb_to_vma(node);
> +
> + vma->free_gap = max_free_space(node);
> +}

Save some trees!

struct vm_area_struct *vma = rb_to_vma(node);
vma->free_gap = max_free_space(node);

or even

rb_to_vma(node)->free_gap = max_free_space(node);


Major stuff, huh?

2012-06-19 23:27:50

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On Mon, 18 Jun 2012 18:05:23 -0400
Rik van Riel <[email protected]> wrote:

> From: Rik van Riel <[email protected]>
>
> Fix the x86-64 page colouring code to take pgoff into account.

Could we please have a full description of what's wrong with the
current code?

> Use the x86 and MIPS page colouring code as the basis for a generic
> page colouring function.
>
> Teach the generic arch_get_unmapped_area(_topdown) code to call the
> page colouring code.
>
> Make sure that ALIGN_DOWN always aligns down, and ends up at the
> right page colour.

Some performance tests on the result would be interesting. iirc, we've
often had trouble demonstrating much or any benefit from coloring.

2012-06-21 09:02:25

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH -mm 2/7] mm: get unmapped area from VMA tree

On Mon, Jun 18, 2012 at 06:05:21PM -0400, Rik van Riel wrote:
> From: Rik van Riel <[email protected]>
>
> Change the generic implementations of arch_get_unmapped_area(_topdown)
> to use the free space info in the VMA rbtree. This makes it possible
> to find free address space in O(log(N)) complexity.
>
> For bottom-up allocations, we pick the lowest hole that is large
> enough for our allocation. For topdown allocations, we pick the
> highest hole of sufficient size.
>
> For topdown allocations, we need to keep track of the highest
> mapped VMA address, because it might be below mm->mmap_base,
> and we only keep track of free space to the left of each VMA
> in the VMA tree. It is tempting to try and keep track of
> the free space to the right of each VMA when running in
> topdown mode, but that gets us into trouble when running on
> x86, where a process can switch direction in the middle of
> execve.
>
> We have to leave the mm->free_area_cache and mm->largest_hole_size
> in place for now, because the architecture specific versions still
> use those.
>
> Signed-off-by: Rik van Riel <[email protected]>
> ---
> include/linux/mm_types.h | 1 +
> mm/mmap.c | 270 +++++++++++++++++++++++++++++++---------------
> 2 files changed, 184 insertions(+), 87 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index bf56d66..8ccb4e1 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -307,6 +307,7 @@ struct mm_struct {
> unsigned long task_size; /* size of task vm space */
> unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
> unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
> + unsigned long highest_vma; /* highest vma end address */

It's not clear from the name that this is an end address. Would
highest_vm_end be better?

> pgd_t * pgd;
> atomic_t mm_users; /* How many users with user space? */
> atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 1963ef9..40c848e 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -4,6 +4,7 @@
> * Written by obz.
> *
> * Address space accounting code <[email protected]>
> + * Rbtree get_unmapped_area Copyright (C) 2012 Rik van Riel
> */
>
> #include <linux/slab.h>
> @@ -250,6 +251,17 @@ static void adjust_free_gap(struct vm_area_struct *vma)
> rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
> }
>
> +static unsigned long node_free_hole(struct rb_node *node)
> +{
> + struct vm_area_struct *vma;
> +
> + if (!node)
> + return 0;
> +
> + vma = container_of(node, struct vm_area_struct, vm_rb);
> + return vma->free_gap;
> +}
> +
> /*
> * Unlink a file-based vm structure from its prio_tree, to hide
> * vma from rmap and vmtruncate before freeing its page tables.
> @@ -386,12 +398,16 @@ void validate_mm(struct mm_struct *mm)
> int bug = 0;
> int i = 0;
> struct vm_area_struct *tmp = mm->mmap;
> + unsigned long highest_address = 0;
> while (tmp) {
> if (tmp->free_gap != max_free_space(&tmp->vm_rb))
> printk("free space %lx, correct %lx\n", tmp->free_gap, max_free_space(&tmp->vm_rb)), bug = 1;
> + highest_address = tmp->vm_end;
> tmp = tmp->vm_next;
> i++;
> }
> + if (highest_address != mm->highest_vma)
> + printk("mm->highest_vma %lx, found %lx\n", mm->highest_vma, highest_address), bug = 1;
> if (i != mm->map_count)
> printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
> i = browse_rb(&mm->mm_rb);
> @@ -449,6 +465,9 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
> /* Propagate the new free gap between next and us up the tree. */
> if (vma->vm_next)
> adjust_free_gap(vma->vm_next);
> + else
> + /* This is the VMA with the highest address. */
> + mm->highest_vma = vma->vm_end;
> }
>
> static void __vma_link_file(struct vm_area_struct *vma)
> @@ -648,6 +667,8 @@ again: remove_next = 1 + (end > next->vm_end);
> vma->vm_start = start;
> vma->vm_end = end;
> vma->vm_pgoff = pgoff;
> + if (!next)
> + mm->highest_vma = end;
> if (adjust_next) {
> next->vm_start += adjust_next << PAGE_SHIFT;
> next->vm_pgoff += adjust_next;
> @@ -1456,13 +1477,29 @@ unacct_error:
> * This function "knows" that -ENOMEM has the bits set.
> */
> #ifndef HAVE_ARCH_UNMAPPED_AREA
> +struct rb_node *continue_next_right(struct rb_node *node)
> +{
> + struct rb_node *prev;
> +
> + while ((prev = node) && (node = rb_parent(node))) {
> + if (prev == node->rb_right)
> + continue;
> +
> + if (node->rb_right)
> + return node->rb_right;
> + }
> +
> + return NULL;
> +}
> +
> unsigned long
> arch_get_unmapped_area(struct file *filp, unsigned long addr,
> unsigned long len, unsigned long pgoff, unsigned long flags)
> {
> struct mm_struct *mm = current->mm;
> - struct vm_area_struct *vma;
> - unsigned long start_addr;
> + struct vm_area_struct *vma = NULL;
> + struct rb_node *rb_node;
> + unsigned long lower_limit = TASK_UNMAPPED_BASE;
>
> if (len > TASK_SIZE)
> return -ENOMEM;
> @@ -1477,40 +1514,76 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
> (!vma || addr + len <= vma->vm_start))
> return addr;
> }
> - if (len > mm->cached_hole_size) {
> - start_addr = addr = mm->free_area_cache;
> - } else {
> - start_addr = addr = TASK_UNMAPPED_BASE;
> - mm->cached_hole_size = 0;
> - }
>
> -full_search:
> - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
> - /* At this point: (!vma || addr < vma->vm_end). */
> - if (TASK_SIZE - len < addr) {
> - /*
> - * Start a new search - just in case we missed
> - * some holes.
> - */
> - if (start_addr != TASK_UNMAPPED_BASE) {
> - addr = TASK_UNMAPPED_BASE;
> - start_addr = addr;
> - mm->cached_hole_size = 0;
> - goto full_search;
> + /* Find the left-most free area of sufficient size. */
> + for (addr = 0, rb_node = mm->mm_rb.rb_node; rb_node; ) {
> + unsigned long vma_start;
> + int found_here = 0;
> +
> + vma = rb_to_vma(rb_node);
> +
> + if (vma->vm_start > len) {

vmas can abut, and vma->vm_end == vma->vm_next->vm_start. Should this
be >=?

> + if (!vma->vm_prev) {
> + /* This is the left-most VMA. */
> + if (vma->vm_start - len >= lower_limit) {
> + addr = lower_limit;
> + goto found_addr;
> + }
> + } else {
> + /* Is this hole large enough? Remember it. */
> + vma_start = max(vma->vm_prev->vm_end, lower_limit);
> + if (vma->vm_start - len >= vma_start) {
> + addr = vma_start;
> + found_here = 1;
> + }
> }
> - return -ENOMEM;
> }
> - if (!vma || addr + len <= vma->vm_start) {
> - /*
> - * Remember the place where we stopped the search:
> - */
> - mm->free_area_cache = addr + len;
> - return addr;
> +
> + /* Go left if it looks promising. */
> + if (node_free_hole(rb_node->rb_left) >= len &&
> + vma->vm_start - len >= lower_limit) {
> + rb_node = rb_node->rb_left;
> + continue;

If we already are at a vma whose start has a lower address than the
overall length, does it make sense to check for a left hole?
I.e. shouldn't this be inside the if (vma->vm_start > len) block?

> }
> - if (addr + mm->cached_hole_size < vma->vm_start)
> - mm->cached_hole_size = vma->vm_start - addr;
> - addr = vma->vm_end;
> +
> + if (!found_here && node_free_hole(rb_node->rb_right) >= len) {
> + /* Last known hole is to the right of this subtree. */
> + rb_node = rb_node->rb_right;
> + continue;
> + } else if (!addr) {
> + rb_node = continue_next_right(rb_node);
> + continue;
> + }
> +
> + /* This is the left-most hole. */
> + goto found_addr;
> }
> +
> + /*
> + * There is not enough space to the left of any VMA.
> + * Check the far right-hand side of the VMA tree.
> + */
> + rb_node = mm->mm_rb.rb_node;
> + while (rb_node->rb_right)
> + rb_node = rb_node->rb_right;
> + vma = rb_to_vma(rb_node);
> + addr = vma->vm_end;

Unless I missed something, we only reach here when
continue_next_right(rb_node) above returned NULL. And if it does, the
rb_node it was passed was the right-most node in the tree, so we could
do something like

        } else if (!addr) {
                struct rb_node *rb_right = continue_next_right(rb_node);
                if (!rb_right)
                        break;
                rb_node = rb_right;
                continue;
        }

above and then save the lookup after the loop.

Also, dereferencing mm->mm_rb.rb_node unconditionally after the loop
assumes that the tree always contains at least one vma. Is this
guaranteed for all architectures?

> -fail:
> - /*
> - * if hint left us with no space for the requested
> - * mapping then try again:
> - *
> - * Note: this is different with the case of bottomup
> - * which does the fully line-search, but we use find_vma
> - * here that causes some holes skipped.
> - */
> - if (start_addr != mm->mmap_base) {
> - mm->free_area_cache = mm->mmap_base;
> - mm->cached_hole_size = 0;
> - goto try_again;
> + if (!found_here && node_free_hole(rb_node->rb_left) >= len) {
> + /* Last known hole is to the right of this subtree. */

"to the left"

So, nothing major from me, either. The patch looks really good!

2012-06-21 10:18:17

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH -mm 0/7] mm: scalable and unified arch_get_unmapped_area

On Mon, Jun 18, 2012 at 06:05:19PM -0400, Rik van Riel wrote:
> [actually include all 7 patches]
>
> A long time ago, we decided to limit the number of VMAs per
> process to 64k. As it turns out, there actually are programs
> using tens of thousands of VMAs.
>
> The linear search in arch_get_unmapped_area and
> arch_get_unmapped_area_topdown can be a real issue for
> those programs.
>
> This patch series aims to fix the scalability issue by
> tracking the size of each free hole in the VMA rbtree,
> propagating the free hole info up the tree.
>
> Another major goal is to put the bulk of the necessary
> arch_get_unmapped_area(_topdown) functionality into one
> set of functions, so we can eliminate the custom large
> functions per architecture, sticking to a few much smaller
> architecture specific functions instead.
>
> In this version I have only gotten rid of the x86, ARM
> and MIPS arch-specific code, and am already showing a
> fairly promising diffstat:
>
> arch/arm/include/asm/pgtable.h | 6
> arch/arm/mm/init.c | 3
> arch/arm/mm/mmap.c | 217 ------------------
> arch/mips/include/asm/page.h | 2
> arch/mips/include/asm/pgtable.h | 7
> arch/mips/mm/mmap.c | 177 --------------
> arch/x86/include/asm/elf.h | 3
> arch/x86/include/asm/pgtable_64.h | 4
> arch/x86/kernel/sys_x86_64.c | 200 ++--------------
> arch/x86/vdso/vma.c | 2
> include/linux/mm_types.h | 8
> include/linux/sched.h | 13 +
> mm/internal.h | 5
> mm/mmap.c | 455 ++++++++++++++++++++++++++++++--------
> 14 files changed, 420 insertions(+), 682 deletions(-)
>
> TODO:
> - eliminate arch-specific functions for more architectures
> - integrate hugetlbfs alignment (with Andi Kleen's patch?)
>
> Performance
>
> Testing performance with a benchmark that allocates tens
> of thousands of VMAs, unmaps them and mmaps them some more
> in a loop, shows promising results.
>
> Vanilla 3.4 kernel:
> $ ./agua_frag_test_64
> ..........
>
> Min Time (ms): 6
> Avg. Time (ms): 294.0000
> Max Time (ms): 609
> Std Dev (ms): 113.1664
> Standard deviation exceeds 10
>
> With patches:
> $ ./agua_frag_test_64
> ..........
>
> Min Time (ms): 14
> Avg. Time (ms): 38.0000
> Max Time (ms): 60
> Std Dev (ms): 3.9312
> All checks pass
>
> The total run time of the test goes down by about a
> factor 4. More importantly, the worst case performance
> of the loop (which is what really hurt some applications)
> has gone down by about a factor 10.

I ran 8 4-job kernel builds on before and after kernels, here are the
results:

                       1x4kernbench-3.5.0-rc3-next-20120619   1x4kernbench-3.5.0-rc3-next-20120619-00007-g594e750
Elapsed time                      273.95 ( +0.00%)                       274.11 ( +0.06%)
Elapsed time (stddev)               0.23 ( +0.00%)                         0.20 ( -2.81%)
User time                         463.38 ( +0.00%)                       463.13 ( -0.05%)
User time (stddev)                  0.16 ( +0.00%)                         0.23 ( +6.66%)
System time                        49.36 ( +0.00%)                        50.16 ( +1.60%)
System time (stddev)                0.24 ( +0.00%)                         0.26 ( +1.43%)

Walltime is unchanged, but system time went up a tiny notch.

I suspect this comes from the freegap propagation, which can be quite
a bit of work if vmas are created, split/merged, or unmapped deep down
in the rb tree. Here is a worst-case scenario that creates a bunch of
vmas next to each other and then unmaps and remaps one in the middle
repeatedly in a tight loop (code below):

vanilla: 0.802003266 seconds time elapsed ( +- 0.66% )
patched: 1.710614276 seconds time elapsed ( +- 0.28% )

vanilla:
7.50% freegap [kernel.kallsyms] [k] perf_event_mmap
6.73% freegap [kernel.kallsyms] [k] __split_vma
6.06% freegap [kernel.kallsyms] [k] unmap_single_vma
5.96% freegap [kernel.kallsyms] [k] vma_adjust
5.55% freegap [kernel.kallsyms] [k] do_munmap
5.24% freegap [kernel.kallsyms] [k] find_vma_prepare
4.21% freegap [kernel.kallsyms] [k] rb_insert_color
4.13% freegap [kernel.kallsyms] [k] vma_merge
3.39% freegap [kernel.kallsyms] [k] find_vma
2.88% freegap [kernel.kallsyms] [k] do_mmap_pgoff
patched:
28.36% freegap [kernel.kallsyms] [k] vma_rb_augment_cb
12.20% freegap [kernel.kallsyms] [k] rb_augment_path
3.85% freegap [kernel.kallsyms] [k] unmap_single_vma
3.68% freegap [kernel.kallsyms] [k] perf_event_mmap
3.56% freegap [kernel.kallsyms] [k] vma_adjust
3.30% freegap [kernel.kallsyms] [k] __split_vma
3.03% freegap [kernel.kallsyms] [k] rb_erase
2.56% freegap [kernel.kallsyms] [k] arch_get_unmapped_area_topdown
1.97% freegap [kernel.kallsyms] [k] rb_insert_color
1.92% freegap [kernel.kallsyms] [k] vma_merge

One idea would be to be a bit more generous with free space and lazily
update the holes. Chain up vmas that need to propagate gaps and do it
in batch. If the same vma is unmapped and remapped, it would first
move to the left-most gap but then the vma that'd need propagation
would always be the same neighboring one, and it can be easily checked
if it's already linked to the lazy-update chain, so no extra work in
this case. When the chain is full (or finding a hole failed), the
batched propagation can at the same time unlink any visited vma that
are also linked to the lazy-update chain, in an effort to avoid repeat
tree path traversals.

And vma_adjust() could probably be more discriminate about splits and
merges where the holes don't change, no?

---

#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
        unsigned int i;
        char *map;

        for (i = 0; i < 512; i++)
                mmap(NULL, 1 << 12, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
        map = mmap(NULL, 1 << 12, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
        sbrk(0);
        for (i = 0; i < 512; i++)
                mmap(NULL, 1 << 12, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
        sbrk(0);
        for (i = 0; i < (1 << 20); i++) {
#if 1
                mprotect(map, 1 << 6, PROT_NONE);
                mprotect(map, 1 << 6, PROT_READ);
#else
                munmap(map, 1 << 12);
                map = mmap(NULL, 1 << 12, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
#endif
        }
        return 0;
}

2012-06-21 11:02:27

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -mm 1/7] mm: track free size between VMAs in VMA rbtree

On Mon, 2012-06-18 at 18:05 -0400, Rik van Riel wrote:
> +static void adjust_free_gap(struct vm_area_struct *vma)
> +{
> + rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
> +}

This is a somewhat unorthodox usage of the rb_augment_erase_end()
function, at the very least that wants a comment.

2012-06-21 11:07:59

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -mm 1/7] mm: track free size between VMAs in VMA rbtree

On Mon, 2012-06-18 at 18:05 -0400, Rik van Riel wrote:
> @@ -473,11 +524,17 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
> struct vm_area_struct *prev)
> {
> struct vm_area_struct *next = vma->vm_next;
> + struct rb_node *deepest;
>
> prev->vm_next = next;
> - if (next)
> + if (next) {
> next->vm_prev = prev;
> + adjust_free_gap(next);
> + }
> + deepest = rb_augment_erase_begin(&vma->vm_rb);
> rb_erase(&vma->vm_rb, &mm->mm_rb);
> + rb_augment_erase_end(deepest, vma_rb_augment_cb, NULL);
> +
> if (mm->mmap_cache == vma)
> mm->mmap_cache = prev;
> }


> @@ -1933,7 +2002,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
> insertion_point = (prev ? &prev->vm_next : &mm->mmap);
> vma->vm_prev = NULL;
> do {
> + struct rb_node *deepest;
> + deepest = rb_augment_erase_begin(&vma->vm_rb);
> rb_erase(&vma->vm_rb, &mm->mm_rb);
> + rb_augment_erase_end(deepest, vma_rb_augment_cb, NULL);


---
include/linux/rbtree.h | 8 ++++++++
1 file changed, 8 insertions(+)

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 033b507..07c5843 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -155,6 +155,14 @@ extern struct rb_node *rb_augment_erase_begin(struct rb_node *node);
extern void rb_augment_erase_end(struct rb_node *node,
rb_augment_f func, void *data);

+static inline void rb_augment_erase(struct rb_node *node, struct rb_root *root,
+ rb_augment_f func, void *data)
+{
+ struct rb_node *deepest = rb_augment_erase_begin(node);
+ rb_erase(node, root);
+ rb_augment_erase_end(deepest, func, data);
+}
+
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
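
With a helper like that, each of the quoted erase sites would collapse to a
single call, e.g. in __vma_unlink() (a sketch against the quoted code, not a
tested change):

        rb_augment_erase(&vma->vm_rb, &mm->mm_rb, vma_rb_augment_cb, NULL);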

2012-06-21 11:21:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On Mon, 2012-06-18 at 18:05 -0400, Rik van Riel wrote:
> Fix the x86-64 page colouring code to take pgoff into account.

Shouldn't that be a separate patch?

2012-06-21 12:37:49

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On Mon, Jun 18, 2012 at 06:05:23PM -0400, Rik van Riel wrote:
> From: Rik van Riel <[email protected]>
>
> Fix the x86-64 page colouring code to take pgoff into account.
> Use the x86 and MIPS page colouring code as the basis for a generic
> page colouring function.
>
> Teach the generic arch_get_unmapped_area(_topdown) code to call the
> page colouring code.
>
> Make sure that ALIGN_DOWN always aligns down, and ends up at the
> right page colour.
>
> Signed-off-by: Rik van Riel <[email protected]>
> ---
> arch/mips/include/asm/page.h | 2 -
> arch/mips/include/asm/pgtable.h | 1 +
> arch/x86/include/asm/elf.h | 3 -
> arch/x86/include/asm/pgtable_64.h | 1 +
> arch/x86/kernel/sys_x86_64.c | 35 +++++++++-----
> arch/x86/vdso/vma.c | 2 +-
> include/linux/sched.h | 8 +++-
> mm/mmap.c | 91 ++++++++++++++++++++++++++++++++-----
> 8 files changed, 111 insertions(+), 32 deletions(-)
>
> diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h
> index da9bd7d..459cc25 100644
> --- a/arch/mips/include/asm/page.h
> +++ b/arch/mips/include/asm/page.h
> @@ -63,8 +63,6 @@ extern void build_copy_page(void);
> extern void clear_page(void * page);
> extern void copy_page(void * to, void * from);
>
> -extern unsigned long shm_align_mask;
> -
> static inline unsigned long pages_do_alias(unsigned long addr1,
> unsigned long addr2)
> {
> diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> index b2202a6..f133a4c 100644
> --- a/arch/mips/include/asm/pgtable.h
> +++ b/arch/mips/include/asm/pgtable.h
> @@ -415,6 +415,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
> */
> #define HAVE_ARCH_UNMAPPED_AREA
> #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
> +#define HAVE_ARCH_ALIGN_ADDR
>
> /*
> * No page table caches to initialise
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index 5939f44..dc2d0bf 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -358,8 +358,6 @@ static inline int mmap_is_ia32(void)
> enum align_flags {
> ALIGN_VA_32 = BIT(0),
> ALIGN_VA_64 = BIT(1),
> - ALIGN_VDSO = BIT(2),
> - ALIGN_TOPDOWN = BIT(3),
> };
>
> struct va_alignment {
> @@ -368,5 +366,4 @@ struct va_alignment {
> } ____cacheline_aligned;
>
> extern struct va_alignment va_align;
> -extern unsigned long align_addr(unsigned long, struct file *, enum align_flags);
> #endif /* _ASM_X86_ELF_H */
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> index 8af36f6..8408ccd 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -170,6 +170,7 @@ extern void cleanup_highmap(void);
> #define HAVE_ARCH_UNMAPPED_AREA
> #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
> #define HAVE_ARCH_GET_ADDRESS_RANGE
> +#define HAVE_ARCH_ALIGN_ADDR
>
> #define pgtable_cache_init() do { } while (0)
> #define check_pgt_cache() do { } while (0)
> diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
> index 2595a5e..ac0afb8 100644
> --- a/arch/x86/kernel/sys_x86_64.c
> +++ b/arch/x86/kernel/sys_x86_64.c
> @@ -25,31 +25,40 @@
> * @flags denotes the allocation direction - bottomup or topdown -
> * or vDSO; see call sites below.
> */
> -unsigned long align_addr(unsigned long addr, struct file *filp,
> - enum align_flags flags)
> +unsigned long arch_align_addr(unsigned long addr, struct file *filp,
> + unsigned long pgoff, unsigned long flags,
> + enum mmap_allocation_direction direction)

Arguments vertical alignment too, not only addr alignment :-)

> {
> - unsigned long tmp_addr;
> + unsigned long tmp_addr = PAGE_ALIGN(addr);

I'm guessing addr coming from arch_get_unmapped_area(_topdown) might not
be page-aligned in all cases?

>
> /* handle 32- and 64-bit case with a single conditional */
> if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
> - return addr;
> + return tmp_addr;
>
> - if (!(current->flags & PF_RANDOMIZE))
> - return addr;
> + /* Always allow MAP_FIXED. Colouring is a performance thing only. */
> + if (flags & MAP_FIXED)
> + return tmp_addr;

Why here? Maybe we should push this MAP_FIXED check up in the
arch_get_unmapped_area(_topdown) and not call arch_align_addr() for
MAP_FIXED requests?

Or do you want to save some code duplication?

> - if (!((flags & ALIGN_VDSO) || filp))
> - return addr;
> + if (!(current->flags & PF_RANDOMIZE))
> + return tmp_addr;
>
> - tmp_addr = addr;
> + if (!(filp || direction == ALLOC_VDSO))
> + return tmp_addr;
>
> /*
> * We need an address which is <= than the original
> * one only when in topdown direction.
> */
> - if (!(flags & ALIGN_TOPDOWN))
> + if (direction == ALLOC_UP)
> tmp_addr += va_align.mask;
>
> tmp_addr &= ~va_align.mask;
> + tmp_addr += ((pgoff << PAGE_SHIFT) & va_align.mask);
> +
> + if (direction == ALLOC_DOWN && tmp_addr > addr) {
> + tmp_addr -= va_align.mask;
> + tmp_addr &= ~va_align.mask;
> + }
>
> return tmp_addr;
> }
> @@ -159,7 +168,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
>
> full_search:
>
> - addr = align_addr(addr, filp, 0);
> + addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
>
> for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
> /* At this point: (!vma || addr < vma->vm_end). */
> @@ -186,7 +195,7 @@ full_search:
> mm->cached_hole_size = vma->vm_start - addr;
>
> addr = vma->vm_end;
> - addr = align_addr(addr, filp, 0);
> + addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
> }
> }
>
> @@ -235,7 +244,7 @@ try_again:
>
> addr -= len;
> do {
> - addr = align_addr(addr, filp, ALIGN_TOPDOWN);
> + addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);
>
> /*
> * Lookup failure means no vma is above this address,
> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
> index 00aaf04..83e0355 100644
> --- a/arch/x86/vdso/vma.c
> +++ b/arch/x86/vdso/vma.c
> @@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
> * unaligned here as a result of stack start randomization.
> */
> addr = PAGE_ALIGN(addr);
> - addr = align_addr(addr, NULL, ALIGN_VDSO);
> + addr = arch_align_addr(addr, NULL, 0, 0, ALLOC_VDSO);
>
> return addr;
> }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index fc76318..18f9326 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -390,12 +390,18 @@ extern int sysctl_max_map_count;
> #ifdef CONFIG_MMU
> enum mmap_allocation_direction {
> ALLOC_UP,
> - ALLOC_DOWN
> + ALLOC_DOWN,
> + ALLOC_VDSO,
> };
> extern void arch_pick_mmap_layout(struct mm_struct *mm);
> extern void
> arch_get_address_range(unsigned long flags, unsigned long *begin,
> unsigned long *end, enum mmap_allocation_direction direction);
> +extern unsigned long shm_align_mask;
> +extern unsigned long
> +arch_align_addr(unsigned long addr, struct file *filp,
> + unsigned long pgoff, unsigned long flags,
> + enum mmap_allocation_direction direction);
> extern unsigned long
> arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
> unsigned long, unsigned long);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 92cf0bf..0314cb1 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1465,6 +1465,51 @@ unacct_error:
> return error;
> }
>
> +#ifndef HAVE_ARCH_ALIGN_ADDR
> +/* Each architecture is responsible for setting this to the required value. */
> +unsigned long shm_align_mask = PAGE_SIZE - 1;
> +EXPORT_SYMBOL(shm_align_mask);
> +
> +unsigned long arch_align_addr(unsigned long addr, struct file *filp,
> + unsigned long pgoff, unsigned long flags,
> + enum mmap_allocation_direction direction)

Ditto on arguments alignment as above.

> +{
> + unsigned long tmp_addr = PAGE_ALIGN(addr);
> +
> + if (shm_align_mask <= PAGE_SIZE)
> + return tmp_addr;
> +
> + /* Allow MAP_FIXED without MAP_SHARED at any address. */
> + if ((flags & (MAP_FIXED|MAP_SHARED)) == MAP_FIXED)
> + return tmp_addr;
> +
> + /* Enforce page colouring for any file or MAP_SHARED mapping. */
> + if (!(filp || (flags & MAP_SHARED)))
> + return tmp_addr;
> +
> + /*
> + * We need an address which is <= than the original
> + * one only when in topdown direction.
> + */
> + if (direction == ALLOC_UP)
> + tmp_addr += shm_align_mask;
> +
> + tmp_addr &= ~shm_align_mask;
> + tmp_addr += ((pgoff << PAGE_SHIFT) & shm_align_mask);
> +
> + /*
> + * When aligning down, make sure we did not accidentally go up.
> + * The caller will check for underflow.
> + */

Can we add this comment to the x86-64 version of arch_align_addr too pls?

> + if (direction == ALLOC_DOWN && tmp_addr > addr) {
> + tmp_addr -= shm_align_mask;
> + tmp_addr &= ~shm_align_mask;
> + }
> +
> + return tmp_addr;
> +}
> +#endif
> +
> #ifndef HAVE_ARCH_GET_ADDRESS_RANGE
> void arch_get_address_range(unsigned long flags, unsigned long *begin,
> unsigned long *end, enum mmap_allocation_direction direction)
> @@ -1513,18 +1558,22 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma = NULL;
> struct rb_node *rb_node;
> - unsigned long lower_limit, upper_limit;
> + unsigned long lower_limit, upper_limit, tmp_addr;
>
> arch_get_address_range(flags, &lower_limit, &upper_limit, ALLOC_UP);
>
> if (len > TASK_SIZE)
> return -ENOMEM;
>
> - if (flags & MAP_FIXED)
> + if (flags & MAP_FIXED) {
> + tmp_addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
> + if (tmp_addr != PAGE_ALIGN(addr))
> + return -EINVAL;
> return addr;
> + }
>
> if (addr) {
> - addr = PAGE_ALIGN(addr);
> + addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
> vma = find_vma(mm, addr);
> if (TASK_SIZE - len >= addr &&
> (!vma || addr + len <= vma->vm_start))
> @@ -1533,7 +1582,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
>
> /* Find the left-most free area of sufficient size. */
> for (addr = 0, rb_node = mm->mm_rb.rb_node; rb_node; ) {
> - unsigned long vma_start;
> + unsigned long vma_start, tmp_addr;
> int found_here = 0;
>
> vma = rb_to_vma(rb_node);
> @@ -1541,13 +1590,17 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
> if (vma->vm_start > len) {
> if (!vma->vm_prev) {
> /* This is the left-most VMA. */
> - if (vma->vm_start - len >= lower_limit) {
> - addr = lower_limit;
> + tmp_addr = arch_align_addr(lower_limit, filp,
> + pgoff, flags, ALLOC_UP);
> + if (vma->vm_start - len >= tmp_addr) {
> + addr = tmp_addr;
> goto found_addr;
> }
> } else {
> /* Is this hole large enough? Remember it. */
> vma_start = max(vma->vm_prev->vm_end, lower_limit);
> + vma_start = arch_align_addr(vma_start, filp,
> + pgoff, flags, ALLOC_UP);
> if (vma->vm_start - len >= vma_start) {
> addr = vma_start;
> found_here = 1;
> @@ -1599,6 +1652,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
> if (addr < lower_limit)
> addr = lower_limit;
>
> + addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_UP);
> +
> found_addr:
> if (TASK_SIZE - len < addr)
> return -ENOMEM;
> @@ -1656,12 +1711,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> if (len > TASK_SIZE)
> return -ENOMEM;
>
> - if (flags & MAP_FIXED)
> + if (flags & MAP_FIXED) {
> + unsigned long tmp_addr;
> + tmp_addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);
> + if (tmp_addr != PAGE_ALIGN(addr))
> + return -EINVAL;
> return addr;
> + }
>
> /* requesting a specific address */
> if (addr) {
> - addr = PAGE_ALIGN(addr);
> + addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);
> vma = find_vma(mm, addr);
> if (TASK_SIZE - len >= addr &&
> (!vma || addr + len <= vma->vm_start))
> @@ -1678,7 +1738,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> */
> if (upper_limit - len > mm->highest_vma) {
> addr = upper_limit - len;
> - goto found_addr;
> + addr = arch_align_addr(addr, filp, pgoff, flags, ALLOC_DOWN);
> + if (addr > mm->highest_vma);
> + goto found_addr;
> }
>
> /* Find the right-most free area of sufficient size. */
> @@ -1691,9 +1753,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> /* Is this hole large enough? Remember it. */
> vma_start = min(vma->vm_start, upper_limit);
> if (vma_start > len) {
> - if (!vma->vm_prev ||
> - (vma_start - len >= vma->vm_prev->vm_end)) {
> - addr = vma_start - len;
> + unsigned long tmp_addr = vma_start - len;
> + tmp_addr = arch_align_addr(tmp_addr, filp,
> + pgoff, flags, ALLOC_DOWN);
> + /* No underflow? Does it still fit the hole? */
> + if (tmp_addr && tmp_addr <= vma_start - len &&
> + (!vma->vm_prev ||
> + tmp_addr >= vma->vm_prev->vm_end)) {
> + addr = tmp_addr;
> found_here = 1;
> }
> }
> --
> 1.7.7.6
>

--
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

2012-06-21 13:18:28

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH -mm 2/7] mm: get unmapped area from VMA tree

On 06/21/2012 05:01 AM, Johannes Weiner wrote:

>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>> index bf56d66..8ccb4e1 100644
>> --- a/include/linux/mm_types.h
>> +++ b/include/linux/mm_types.h
>> @@ -307,6 +307,7 @@ struct mm_struct {
>> unsigned long task_size; /* size of task vm space */
>> unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
>> unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
>> + unsigned long highest_vma; /* highest vma end address */
>
> It's not clear from the name that this is an end address. Would
> highest_vm_end be better?

Good idea. Will fix.

>> + /* Find the left-most free area of sufficient size. */
>> + for (addr = 0, rb_node = mm->mm_rb.rb_node; rb_node; ) {
>> + unsigned long vma_start;
>> + int found_here = 0;
>> +
>> + vma = rb_to_vma(rb_node);
>> +
>> + if (vma->vm_start > len) {
>
> vmas can abut, and vma->vm_end == vma->vm_next->vm_start. Should this
> be >=?

We do not want to mmap at address 0.

>> + /* Go left if it looks promising. */
>> + if (node_free_hole(rb_node->rb_left) >= len &&
>> + vma->vm_start - len >= lower_limit) {
>> + rb_node = rb_node->rb_left;
>> + continue;
>
> If we already are at a vma whose start has a lower address than the
> overall length, does it make sense to check for a left hole?
> I.e. shouldn't this be inside the if (vma->vm_start > len) block?

I am trying to preserve the same fragmentation
semantics as the current code, so we do not
get any regressions in that area.

>> + /*
>> + * There is not enough space to the left of any VMA.
>> + * Check the far right-hand side of the VMA tree.
>> + */
>> + rb_node = mm->mm_rb.rb_node;
>> + while (rb_node->rb_right)
>> + rb_node = rb_node->rb_right;
>> + vma = rb_to_vma(rb_node);
>> + addr = vma->vm_end;
>
> Unless I missed something, we only reach here when
> continue_next_right(rb_node) above returned NULL. And if it does, the
> rb_node it was passed was the right-most node in the tree, so we could
> do something like

We break out of the large while() loop once rb_node
is NULL, due to falling off the end of the tree.

>         } else if (!addr) {
>                 struct rb_node *rb_right = continue_next_right(rb_node);
>                 if (!rb_right)
>                         break;
>                 rb_node = rb_right;
>                 continue;
>         }
>
> above and then save the lookup after the loop.

That might work, but I expect the situation to be rare
enough that I would rather pick the more readable option.

> Also, dereferencing mm->mm_rb.rb_node unconditionally after the loop
> assumes that the tree always contains at least one vma. Is this
> guaranteed for all architectures?

When a process is execve'd, a stack VMA is set up.
This means every process has at least one VMA by the
time we can get to this code.

>> -fail:
>> - /*
>> - * if hint left us with no space for the requested
>> - * mapping then try again:
>> - *
>> - * Note: this is different with the case of bottomup
>> - * which does the fully line-search, but we use find_vma
>> - * here that causes some holes skipped.
>> - */
>> - if (start_addr != mm->mmap_base) {
>> - mm->free_area_cache = mm->mmap_base;
>> - mm->cached_hole_size = 0;
>> - goto try_again;
>> + if (!found_here && node_free_hole(rb_node->rb_left) >= len) {
>> + /* Last known hole is to the right of this subtree. */
>
> "to the left"

Thanks, will fix.

--
All rights reversed

2012-06-21 13:25:48

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On 06/21/2012 08:37 AM, Borislav Petkov wrote:

>> -unsigned long align_addr(unsigned long addr, struct file *filp,
>> - enum align_flags flags)
>> +unsigned long arch_align_addr(unsigned long addr, struct file *filp,
>> + unsigned long pgoff, unsigned long flags,
>> + enum mmap_allocation_direction direction)
>
> Arguments vertical alignment too, not only addr alignment :-)

Will do.

>> {
>> - unsigned long tmp_addr;
>> + unsigned long tmp_addr = PAGE_ALIGN(addr);
>
> I'm guessing addr coming from arch_get_unmapped_area(_topdown) might not
> be page-aligned in all cases?

That is my guess, too :)

In some places arch_get_unmapped_area(_topdown) called
PAGE_ALIGN(addr), so we should make sure it is called.

It is probably masking bugs in some old old application,
and calling it here really should not hurt.

>> - if (!(current->flags & PF_RANDOMIZE))
>> - return addr;
>> + /* Always allow MAP_FIXED. Colouring is a performance thing only. */
>> + if (flags & MAP_FIXED)
>> + return tmp_addr;
>
> Why here? Maybe we should push this MAP_FIXED check up in the
> arch_get_unmapped_area(_topdown) and not call arch_align_addr() for
> MAP_FIXED requests?
>
> Or do you want to save some code duplication?

The problem is that certain other architectures have
data cache alignment requirements, where mis-aligning
somebody's mmap of a file could result in actual data
corruption.

This means that, for those architectures, we have to
refuse non-colour-aligned MAP_FIXED mappings.

On x86 we can allow them, so we do. But that decision
needs to be taken in architecture specific code, not
in the shared arch_get_unmapped_area(_topdown) :)
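
To make the aliasing constraint concrete, here is a tiny userspace sketch of
the colour check the MIPS code earlier in the thread applies to MAP_FIXED +
MAP_SHARED requests; the mask value below is invented for illustration (the
real one comes from the CPU's cache geometry):

#include <stdio.h>

#define PAGE_SHIFT     12
#define SHM_ALIGN_MASK 0x7fffUL    /* e.g. a 32K aliasing window */

/* addr and the file offset must land on the same cache colour */
static int colour_ok(unsigned long addr, unsigned long pgoff)
{
        return ((addr - (pgoff << PAGE_SHIFT)) & SHM_ALIGN_MASK) == 0;
}

int main(void)
{
        /* page offset 3 mapped at 0x40003000: colours match -> allowed */
        printf("%d\n", colour_ok(0x40003000UL, 3));    /* prints 1 */
        /* the same offset mapped at 0x40004000: wrong colour -> refused */
        printf("%d\n", colour_ok(0x40004000UL, 3));    /* prints 0 */
        return 0;
}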

>> + /*
>> + * When aligning down, make sure we did not accidentally go up.
>> + * The caller will check for underflow.
>> + */
>
> Can we add this comment to the x86-64 version of arch_align_addr too pls?

Will do.

--
All rights reversed

2012-06-21 14:32:33

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On 06/21/2012 07:20 AM, Peter Zijlstra wrote:
> On Mon, 2012-06-18 at 18:05 -0400, Rik van Riel wrote:
>> Fix the x86-64 page colouring code to take pgoff into account.
>
> Shouldn't that be a separate patch?

My idea was that it would be easier to review
these two nearly identical functions together.

Andrew, do you have any strong opinions?

2012-06-21 14:47:21

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH -mm 1/7] mm: track free size between VMAs in VMA rbtree

On Mon, Jun 18, 2012 at 06:05:20PM -0400, Rik van Riel wrote:
> From: Rik van Riel <[email protected]>
>
> Track the size of free areas between VMAs in the VMA rbtree.
>
> This will allow get_unmapped_area_* to find a free area of the
> right size in O(log(N)) time, instead of potentially having to
> do a linear walk across all the VMAs.
>
> Signed-off-by: Rik van Riel <[email protected]>
> ---
> include/linux/mm_types.h | 7 ++++
> mm/internal.h | 5 +++
> mm/mmap.c | 76 +++++++++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 87 insertions(+), 1 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index dad95bd..bf56d66 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -213,6 +213,13 @@ struct vm_area_struct {
> struct rb_node vm_rb;
>
> /*
> + * Largest free memory gap "behind" this VMA (in the direction mmap
> + * grows from), or of VMAs down the rb tree below us. This helps
> + * get_unmapped_area find a free area of the right size.
> + */
> + unsigned long free_gap;
> +
> + /*
> * For areas with an address space and backing store,
> * linkage into the address_space->i_mmap prio tree, or
> * linkage to the list of like vmas hanging off its node, or
> diff --git a/mm/internal.h b/mm/internal.h
> index 2ba87fb..f59f97a 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -159,6 +159,11 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
> munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
> }
>
> +static inline struct vm_area_struct *rb_to_vma(struct rb_node *node)
> +{
> + return container_of(node, struct vm_area_struct, vm_rb);
> +}
> +
> /*
> * Called only in fault path via page_evictable() for a new page
> * to determine if it's being mapped into a LOCKED vma.
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 3edfcdf..1963ef9 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -205,6 +205,51 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> flush_dcache_mmap_unlock(mapping);
> }
>
> +static unsigned long max_free_space(struct rb_node *node)
> +{

Ok, this is not impenetrable but it took me a while to work out what it
means and even then I'm not sure I have it right. Certainly I'll have it
all forgotten in a week's time.

At the *very least*, this function does not return "max free space" at
all. It instead gives information on the largest gap within a local part
of the rb tree. When used in conjunction with rb_augment_erase_end, the
end result is that free space information is propagated throughout the tree.

How about this?

/*
 * local_largest_gap - Returns the largest gap "nearby"
 *
 * This function is called when propagating "up" the rbtree, for each
 * node encountered during the ascent, to propagate free space
 * information.
 *
 * Take a simple example of a VMA in an rb tree
 *
 *   G1  LEFT      G2       VMA         G3       RIGHT G4
 * H----|oooooo|----------|ooooooo|-------------|ooooo|----H
 *
 * This function returns the largest "local" gap to VMA. It will return
 * one of the following
 * 1. The gap between this VMA and the prev (G2)
 * 2. The largest gap left of "LEFT"
 * 3. The largest gap left of "RIGHT"
 *
 * The gap is always searched to the left to bias what direction we are
 * searching for free space in. This left-most packing of VMAs is an
 * attempt to reduce address space fragmentation.
 */

There are two consequences of this that I can see.

1. Insertion time must be higher now because a tree propagation of
information is necessary. Searching time for adding a new VMA is
lower but insertion time in the tree is higher. How high depends on
the number of VMAs because it'll be related to the height of the RB
tree. Please mention this in the changelog if I'm right. If I'm wrong,
slap your knee heartily and laugh all the day long then write down
what is right and stick that in the changelog or a comment.

2. How this largest_gap information is used is important. As I write
this I am having serious trouble visualising how an RB tree walk that is
ordered by address space layout can always find the smallest hole. It'll
find *a* small hole and it should be able to do some packing but I
fear that there is an increased risk of an adverse workload externally
fragmenting the address space.

> + struct vm_area_struct *vma, *prev, *left = NULL, *right = NULL;
> + unsigned long largest = 0;
> +
> + if (node->rb_left)
> + left = rb_to_vma(node->rb_left);
> + if (node->rb_right)
> + right = rb_to_vma(node->rb_right);
> +
> + /*
> + * Calculate the free gap size between us and the
> + * VMA to our left.
> + */
> + vma = rb_to_vma(node);
> + prev = vma->vm_prev;
> +
> + if (prev)
> + largest = vma->vm_start - prev->vm_end;
> + else
> + largest = vma->vm_start;
> +
> + /* We propagate the largest of our own, or our children's free gaps. */
> + if (left)
> + largest = max(largest, left->free_gap);
> + if (right)
> + largest = max(largest, right->free_gap);
> +
> + return largest;
> +}
> +
> +static void vma_rb_augment_cb(struct rb_node *node, void *__unused)
> +{
> + struct vm_area_struct *vma;
> +
> + vma = rb_to_vma(node);
> +
> + vma->free_gap = max_free_space(node);
> +}
> +
> +static void adjust_free_gap(struct vm_area_struct *vma)
> +{
> + rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
> +}
> +

That thing needs a comment too saying how it uses local_largest_gap (or
whatever you call it) to propagate information throughout the tree.

> /*
> * Unlink a file-based vm structure from its prio_tree, to hide
> * vma from rmap and vmtruncate before freeing its page tables.
> @@ -342,6 +387,8 @@ void validate_mm(struct mm_struct *mm)
> int i = 0;
> struct vm_area_struct *tmp = mm->mmap;
> while (tmp) {
> + if (tmp->free_gap != max_free_space(&tmp->vm_rb))
> + printk("free space %lx, correct %lx\n", tmp->free_gap, max_free_space(&tmp->vm_rb)), bug = 1;
> tmp = tmp->vm_next;
> i++;
> }
> @@ -398,6 +445,10 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
> {
> rb_link_node(&vma->vm_rb, rb_parent, rb_link);
> rb_insert_color(&vma->vm_rb, &mm->mm_rb);
> + adjust_free_gap(vma);
> + /* Propagate the new free gap between next and us up the tree. */
> + if (vma->vm_next)
> + adjust_free_gap(vma->vm_next);
> }
>
> static void __vma_link_file(struct vm_area_struct *vma)
> @@ -473,11 +524,17 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
> struct vm_area_struct *prev)
> {
> struct vm_area_struct *next = vma->vm_next;
> + struct rb_node *deepest;
>
> prev->vm_next = next;
> - if (next)
> + if (next) {
> next->vm_prev = prev;
> + adjust_free_gap(next);
> + }
> + deepest = rb_augment_erase_begin(&vma->vm_rb);
> rb_erase(&vma->vm_rb, &mm->mm_rb);
> + rb_augment_erase_end(deepest, vma_rb_augment_cb, NULL);
> +
> if (mm->mmap_cache == vma)
> mm->mmap_cache = prev;
> }
> @@ -657,6 +714,15 @@ again: remove_next = 1 + (end > next->vm_end);
> if (insert && file)
> uprobe_mmap(insert);
>
> + /* Adjust the rb tree for changes in the free gaps between VMAs. */
> + adjust_free_gap(vma);
> + if (insert)
> + adjust_free_gap(insert);
> + if (vma->vm_next && vma->vm_next != insert)
> + adjust_free_gap(vma->vm_next);
> + if (insert && insert->vm_next && insert->vm_next != vma)
> + adjust_free_gap(insert->vm_next);
> +

I feel this should move into a helper that sits right below adjust_free_gap,
with bonus points for explaining why each of the three additional
adjust_free_gap() calls is necessary.
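
A minimal sketch of such a helper, simply factoring out the quoted calls (the
name and the comments are invented for illustration, not taken from the
patch):

static void adjust_free_gaps(struct vm_area_struct *vma,
                             struct vm_area_struct *insert)
{
        /* our own gap to the previous VMA may have changed */
        adjust_free_gap(vma);
        /* a newly inserted VMA gets a free gap of its own */
        if (insert)
                adjust_free_gap(insert);
        /* the VMA after us sees a new gap, unless it is the inserted one */
        if (vma->vm_next && vma->vm_next != insert)
                adjust_free_gap(vma->vm_next);
        /* likewise for the VMA following the inserted one */
        if (insert && insert->vm_next && insert->vm_next != vma)
                adjust_free_gap(insert->vm_next);
}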

> validate_mm(mm);
>
> return 0;
> @@ -1760,6 +1826,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> error = acct_stack_growth(vma, size, grow);
> if (!error) {
> vma->vm_end = address;
> + if (vma->vm_next)
> + adjust_free_gap(vma->vm_next);
> perf_event_mmap(vma);
> }
> }
> @@ -1811,6 +1879,7 @@ int expand_downwards(struct vm_area_struct *vma,
> if (!error) {
> vma->vm_start = address;
> vma->vm_pgoff -= grow;
> + adjust_free_gap(vma);
> perf_event_mmap(vma);
> }
> }
> @@ -1933,7 +2002,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
> insertion_point = (prev ? &prev->vm_next : &mm->mmap);
> vma->vm_prev = NULL;
> do {
> + struct rb_node *deepest;
> + deepest = rb_augment_erase_begin(&vma->vm_rb);
> rb_erase(&vma->vm_rb, &mm->mm_rb);
> + rb_augment_erase_end(deepest, vma_rb_augment_cb, NULL);
> mm->map_count--;
> tail_vma = vma;
> vma = vma->vm_next;
> @@ -1941,6 +2013,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
> *insertion_point = vma;
> if (vma)
> vma->vm_prev = prev;
> + if (vma)
> + rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
> tail_vma->vm_next = NULL;
> if (mm->unmap_area == arch_unmap_area)
> addr = prev ? prev->vm_end : mm->mmap_base;
> --
> 1.7.7.6
>

--
Mel Gorman
SUSE Labs

2012-06-21 16:16:56

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH -mm 2/7] mm: get unmapped area from VMA tree

On Mon, Jun 18, 2012 at 06:05:21PM -0400, Rik van Riel wrote:
> From: Rik van Riel <[email protected]>
>
> Change the generic implementations of arch_get_unmapped_area(_topdown)
> to use the free space info in the VMA rbtree. This makes it possible
> to find free address space in O(log(N)) complexity.
>
> For bottom-up allocations, we pick the lowest hole that is large
> enough for our allocation. For topdown allocations, we pick the
> highest hole of sufficient size.
>
> For topdown allocations, we need to keep track of the highest
> mapped VMA address, because it might be below mm->mmap_base,
> and we only keep track of free space to the left of each VMA
> in the VMA tree. It is tempting to try and keep track of
> the free space to the right of each VMA when running in
> topdown mode, but that gets us into trouble when running on
> x86, where a process can switch direction in the middle of
> execve.
>
> We have to leave the mm->free_area_cache and mm->largest_hole_size
> in place for now, because the architecture specific versions still
> use those.
>

Stick them under a config ARCH_USES_GENERIC_GET_UNMAPPED?

Worth mentioning in the changelog that the reduced search time means that
we no longer need free_area_cache and instead do a search from the root
RB node every time.

> Signed-off-by: Rik van Riel <[email protected]>
> ---
> include/linux/mm_types.h | 1 +
> mm/mmap.c | 270 +++++++++++++++++++++++++++++++---------------
> 2 files changed, 184 insertions(+), 87 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index bf56d66..8ccb4e1 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -307,6 +307,7 @@ struct mm_struct {
> unsigned long task_size; /* size of task vm space */
> unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
> unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
> + unsigned long highest_vma; /* highest vma end address */

With a name like highest_vma I would expect it to be a struct
vm_area_struct. Name it highest_mapped_addr or something.

> pgd_t * pgd;
> atomic_t mm_users; /* How many users with user space? */
> atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 1963ef9..40c848e 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -4,6 +4,7 @@
> * Written by obz.
> *
> * Address space accounting code <[email protected]>
> + * Rbtree get_unmapped_area Copyright (C) 2012 Rik van Riel
> */
>
> #include <linux/slab.h>
> @@ -250,6 +251,17 @@ static void adjust_free_gap(struct vm_area_struct *vma)
> rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
> }
>
> +static unsigned long node_free_hole(struct rb_node *node)
> +{
> + struct vm_area_struct *vma;
> +
> + if (!node)
> + return 0;
> +
> + vma = container_of(node, struct vm_area_struct, vm_rb);

What's wrong with using rb_to_vma?

return node ? rb_to_vma(node)->free_gap : 0;

> + return vma->free_gap;
> +}

function calls it free_hole but vma calls it free_gap. Pick one or
replace the function with an inlined one-liner.

> +
> /*
> * Unlink a file-based vm structure from its prio_tree, to hide
> * vma from rmap and vmtruncate before freeing its page tables.
> @@ -386,12 +398,16 @@ void validate_mm(struct mm_struct *mm)
> int bug = 0;
> int i = 0;
> struct vm_area_struct *tmp = mm->mmap;
> + unsigned long highest_address = 0;
> while (tmp) {
> if (tmp->free_gap != max_free_space(&tmp->vm_rb))
> printk("free space %lx, correct %lx\n", tmp->free_gap, max_free_space(&tmp->vm_rb)), bug = 1;
> + highest_address = tmp->vm_end;
> tmp = tmp->vm_next;
> i++;
> }
> + if (highest_address != mm->highest_vma)
> + printk("mm->highest_vma %lx, found %lx\n", mm->highest_vma, highest_address), bug = 1;
> if (i != mm->map_count)
> printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
> i = browse_rb(&mm->mm_rb);
> @@ -449,6 +465,9 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
> /* Propagate the new free gap between next and us up the tree. */
> if (vma->vm_next)
> adjust_free_gap(vma->vm_next);
> + else
> + /* This is the VMA with the highest address. */
> + mm->highest_vma = vma->vm_end;
> }
>
> static void __vma_link_file(struct vm_area_struct *vma)
> @@ -648,6 +667,8 @@ again: remove_next = 1 + (end > next->vm_end);
> vma->vm_start = start;
> vma->vm_end = end;
> vma->vm_pgoff = pgoff;
> + if (!next)
> + mm->highest_vma = end;
> if (adjust_next) {
> next->vm_start += adjust_next << PAGE_SHIFT;
> next->vm_pgoff += adjust_next;
> @@ -1456,13 +1477,29 @@ unacct_error:
> * This function "knows" that -ENOMEM has the bits set.
> */
> #ifndef HAVE_ARCH_UNMAPPED_AREA
> +struct rb_node *continue_next_right(struct rb_node *node)
> +{

I get the very strong impression that you were translating a lot of notes
and diagrams into code because you appear to be immune to commenting :)

This is not returning the next right node because to me that means you
are walking down the tree. Here you are walking "up" the tree finding a
node to the right

/*
* find_next_right_uncle - Find an "uncle" node to the "right"
*
* An "uncle" node is a sibling node of your parent and this function
* returns an uncle to the right. Given the following basic tree, the
* node U is an uncle of node C. P is C's parent and G is C's grandparent
*
*
* G
* / \
* P U
* \
* C
* This is necessary when searching
* for a larger gap in the address space.
*/

MUWUHAhaha, watch as I destroy your attempts to reduce line count in
your diffstat.

> + struct rb_node *prev;
> +
> + while ((prev = node) && (node = rb_parent(node))) {
> + if (prev == node->rb_right)
> + continue;
> +
> + if (node->rb_right)
> + return node->rb_right;
> + }
> +
> + return NULL;
> +}
> +
> unsigned long
> arch_get_unmapped_area(struct file *filp, unsigned long addr,
> unsigned long len, unsigned long pgoff, unsigned long flags)
> {
> struct mm_struct *mm = current->mm;
> - struct vm_area_struct *vma;
> - unsigned long start_addr;
> + struct vm_area_struct *vma = NULL;
> + struct rb_node *rb_node;
> + unsigned long lower_limit = TASK_UNMAPPED_BASE;
>
> if (len > TASK_SIZE)
> return -ENOMEM;
> @@ -1477,40 +1514,76 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
> (!vma || addr + len <= vma->vm_start))
> return addr;
> }
> - if (len > mm->cached_hole_size) {
> - start_addr = addr = mm->free_area_cache;
> - } else {
> - start_addr = addr = TASK_UNMAPPED_BASE;
> - mm->cached_hole_size = 0;
> - }
>
> -full_search:
> - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
> - /* At this point: (!vma || addr < vma->vm_end). */
> - if (TASK_SIZE - len < addr) {
> - /*
> - * Start a new search - just in case we missed
> - * some holes.
> - */
> - if (start_addr != TASK_UNMAPPED_BASE) {
> - addr = TASK_UNMAPPED_BASE;
> - start_addr = addr;
> - mm->cached_hole_size = 0;
> - goto full_search;
> + /* Find the left-most free area of sufficient size. */
> + for (addr = 0, rb_node = mm->mm_rb.rb_node; rb_node; ) {

This is where we always search from the root RB node. I tried to think
of a good reason to keep the cached information and did not find a
compelling one.

> + unsigned long vma_start;
> + int found_here = 0;
> +

bool.

> + vma = rb_to_vma(rb_node);
> +
> + if (vma->vm_start > len) {
> + if (!vma->vm_prev) {
> + /* This is the left-most VMA. */
> + if (vma->vm_start - len >= lower_limit) {
> + addr = lower_limit;
> + goto found_addr;
> + }
> + } else {
> + /* Is this hole large enough? Remember it. */
> + vma_start = max(vma->vm_prev->vm_end, lower_limit);
> + if (vma->vm_start - len >= vma_start) {
> + addr = vma_start;
> + found_here = 1;
> + }
> }
> - return -ENOMEM;
> }
> - if (!vma || addr + len <= vma->vm_start) {
> - /*
> - * Remember the place where we stopped the search:
> - */
> - mm->free_area_cache = addr + len;
> - return addr;
> +
> + /* Go left if it looks promising. */
> + if (node_free_hole(rb_node->rb_left) >= len &&
> + vma->vm_start - len >= lower_limit) {
> + rb_node = rb_node->rb_left;
> + continue;
> }
> - if (addr + mm->cached_hole_size < vma->vm_start)
> - mm->cached_hole_size = vma->vm_start - addr;
> - addr = vma->vm_end;
> +
> + if (!found_here && node_free_hole(rb_node->rb_right) >= len) {
> + /* Last known hole is to the right of this subtree. */
> + rb_node = rb_node->rb_right;
> + continue;
> + } else if (!addr) {
> + rb_node = continue_next_right(rb_node);
> + continue;
> + }
> +
> + /* This is the left-most hole. */
> + goto found_addr;
> }
> +
> + /*
> + * There is not enough space to the left of any VMA.
> + * Check the far right-hand side of the VMA tree.
> + */
> + rb_node = mm->mm_rb.rb_node;
> + while (rb_node->rb_right)
> + rb_node = rb_node->rb_right;
> + vma = rb_to_vma(rb_node);
> + addr = vma->vm_end;
> +
> + /*
> + * The right-most VMA ends below the lower limit. Can only happen
> + * if a binary personality loads the stack below the executable.
> + */
> + if (addr < lower_limit)
> + addr = lower_limit;
> +
> + found_addr:
> + if (TASK_SIZE - len < addr)
> + return -ENOMEM;
> +
> + /* This "free area" was not really free. Tree corrupted? */
> + VM_BUG_ON(find_vma_intersection(mm, addr, addr+len));
> +
> + return addr;

Ok, I didn't find any flaws and it should work. I don't think it is
necessarily O(log(n)) complexity because we walk up the tree a bit but
that is not worth sweating over. O(log(n))ish is good enough.

In terms of packing I think you should be ok. You find, relatively
quickly, a small gap on the left side of the address space. This is not
necessarily the smallest possible gap nor the leftmost, but as the tree
rebalances I think it will work out. At least, I couldn't think of an
adverse workload that would result in a badly externally fragmented
address space on 32-bit.

> }
> #endif
>
> @@ -1528,14 +1601,31 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
> * stack's low limit (the base):
> */
> #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
> +struct rb_node *continue_next_left(struct rb_node *node)
> +{

find_next_left_uncle and point to the other comment.

> + struct rb_node *prev;
> +
> + while ((prev = node) && (node = rb_parent(node))) {
> + if (prev == node->rb_left)
> + continue;
> +
> + if (node->rb_left)
> + return node->rb_left;
> + }
> +
> + return NULL;
> +}
> +
> unsigned long
> arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> const unsigned long len, const unsigned long pgoff,
> const unsigned long flags)
> {
> - struct vm_area_struct *vma;
> + struct vm_area_struct *vma = NULL;
> struct mm_struct *mm = current->mm;
> - unsigned long addr = addr0, start_addr;
> + unsigned long addr = addr0;
> + struct rb_node *rb_node = NULL;
> + unsigned long upper_limit = mm->mmap_base;
>
> /* requested length too big for entire address space */
> if (len > TASK_SIZE)
> @@ -1553,68 +1643,65 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> return addr;
> }
>
> - /* check if free_area_cache is useful for us */
> - if (len <= mm->cached_hole_size) {
> - mm->cached_hole_size = 0;
> - mm->free_area_cache = mm->mmap_base;
> - }
> + /* requested length too big; prevent integer underflow below */
> + if (len > upper_limit)
> + return -ENOMEM;
>
> -try_again:
> - /* either no address requested or can't fit in requested address hole */
> - start_addr = addr = mm->free_area_cache;
> + /*
> + * Does the highest VMA end far enough below the upper limit
> + * of our search space?
> + */
> + if (upper_limit - len > mm->highest_vma) {
> + addr = upper_limit - len;
> + goto found_addr;
> + }
>
> - if (addr < len)
> - goto fail;
> + /* Find the right-most free area of sufficient size. */
> + for (addr = 0, rb_node = mm->mm_rb.rb_node; rb_node; ) {
> + unsigned long vma_start;
> + int found_here = 0;
>
> - addr -= len;
> - do {
> - /*
> - * Lookup failure means no vma is above this address,
> - * else if new region fits below vma->vm_start,
> - * return with success:
> - */
> - vma = find_vma(mm, addr);
> - if (!vma || addr+len <= vma->vm_start)
> - /* remember the address as a hint for next time */
> - return (mm->free_area_cache = addr);
> + vma = container_of(rb_node, struct vm_area_struct, vm_rb);
>

rb_to_vma

> - /* remember the largest hole we saw so far */
> - if (addr + mm->cached_hole_size < vma->vm_start)
> - mm->cached_hole_size = vma->vm_start - addr;
> + /* Is this hole large enough? Remember it. */
> + vma_start = min(vma->vm_start, upper_limit);
> + if (vma_start > len) {
> + if (!vma->vm_prev ||
> + (vma_start - len >= vma->vm_prev->vm_end)) {
> + addr = vma_start - len;
> + found_here = 1;
> + }
> + }
>
> - /* try just below the current vma->vm_start */
> - addr = vma->vm_start-len;
> - } while (len < vma->vm_start);
> + /* Go right if it looks promising. */
> + if (node_free_hole(rb_node->rb_right) >= len) {
> + if (upper_limit - len > vma->vm_end) {
> + rb_node = rb_node->rb_right;
> + continue;
> + }
> + }
>
> -fail:
> - /*
> - * if hint left us with no space for the requested
> - * mapping then try again:
> - *
> - * Note: this is different with the case of bottomup
> - * which does the fully line-search, but we use find_vma
> - * here that causes some holes skipped.
> - */
> - if (start_addr != mm->mmap_base) {
> - mm->free_area_cache = mm->mmap_base;
> - mm->cached_hole_size = 0;
> - goto try_again;
> + if (!found_here && node_free_hole(rb_node->rb_left) >= len) {
> + /* Last known hole is to the right of this subtree. */
> + rb_node = rb_node->rb_left;
> + continue;
> + } else if (!addr) {
> + rb_node = continue_next_left(rb_node);
> + continue;
> + }
> +
> + /* This is the right-most hole. */
> + goto found_addr;
> }
>
> - /*
> - * A failed mmap() very likely causes application failure,
> - * so fall back to the bottom-up function here. This scenario
> - * can happen with large stack limits and large mmap()
> - * allocations.
> - */
> - mm->cached_hole_size = ~0UL;
> - mm->free_area_cache = TASK_UNMAPPED_BASE;
> - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
> - /*
> - * Restore the topdown base:
> - */
> - mm->free_area_cache = mm->mmap_base;
> - mm->cached_hole_size = ~0UL;
> + return -ENOMEM;
> +
> + found_addr:
> + if (TASK_SIZE - len < addr)
> + return -ENOMEM;
> +
> + /* This "free area" was not really free. Tree corrupted? */
> + VM_BUG_ON(find_vma_intersection(mm, addr, addr+len));
>

I didn't read this quite as carefully but could find nothing obviously
wrong.

> return addr;
> }
> @@ -1828,6 +1915,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> vma->vm_end = address;
> if (vma->vm_next)
> adjust_free_gap(vma->vm_next);
> + if (!vma->vm_next)
> + vma->vm_mm->highest_vma = vma->vm_end;
> perf_event_mmap(vma);
> }
> }
> @@ -2013,6 +2102,13 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
> *insertion_point = vma;
> if (vma)
> vma->vm_prev = prev;
> + else {
> + /* We just unmapped the highest VMA. */
> + if (prev)
> + mm->highest_vma = prev->vm_end;
> + else
> + mm->highest_vma = 0;
> + }
> if (vma)
> rb_augment_erase_end(&vma->vm_rb, vma_rb_augment_cb, NULL);
> tail_vma->vm_next = NULL;
> --
> 1.7.7.6
>

--
Mel Gorman
SUSE Labs

2012-06-21 17:39:30

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On Thu, 21 Jun 2012 10:30:26 -0400 Rik van Riel <[email protected]> wrote:

> On 06/21/2012 07:20 AM, Peter Zijlstra wrote:
> > On Mon, 2012-06-18 at 18:05 -0400, Rik van Riel wrote:
> >> Fix the x86-64 page colouring code to take pgoff into account.
> >
> > Shouldn't that be a separate patch?
>
> My idea was that it would be easier to review
> these two nearly identical functions together.
>
> Andrew, do you have any strong opinions?

It depends on the significance of the change. I suspect it's one of
those things which speeds up many workloads by 1.5% and slows down a few
weird/important ones by 11%. Which makes it a thing to be put under
the microscope and poked at. Some people might end up reverting it,
making it tunable/configurable etc etc.

If any of that is true then yes, I guess it should be a standalone thing.

2012-06-21 17:47:35

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On 06/21/2012 01:40 PM, Andrew Morton wrote:
> On Thu, 21 Jun 2012 10:30:26 -0400 Rik van Riel <[email protected]> wrote:
>
>> On 06/21/2012 07:20 AM, Peter Zijlstra wrote:
>>> On Mon, 2012-06-18 at 18:05 -0400, Rik van Riel wrote:
>>>> Fix the x86-64 page colouring code to take pgoff into account.
>>>
>>> Shouldn't that be a separate patch?
>>
>> My idea was that it would be easier to review
>> these two nearly identical functions together.
>>
>> Andrew, do you have any strong opinions?
>
> It depends on the significance of the change. I suspect it's one of
> those things which speeds up many workloads by 1.5% and slows down a few
> weird/important ones by 11%. Which makes it a thing to be put under
> the microscope and poked at. Some people might end up reverting it,
> making it tunable/configurable etc etc.
>
> If any of that is true then yes, I guess it should be a standalone thing.

Behaviour is not changed by this patch, except
for taking pgoff into account - which should not
matter a whole lot in practice, because mmap of
files is generally done starting at offset 0.

2012-06-21 18:09:50

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH -mm 2/7] mm: get unmapped area from VMA tree

On 06/21/2012 12:16 PM, Mel Gorman wrote:
> On Mon, Jun 18, 2012 at 06:05:21PM -0400, Rik van Riel wrote:
>> From: Rik van Riel <[email protected]>
>>
>> Change the generic implementations of arch_get_unmapped_area(_topdown)
>> to use the free space info in the VMA rbtree. This makes it possible
>> to find free address space in O(log(N)) complexity.
>>
>> For bottom-up allocations, we pick the lowest hole that is large
>> enough for our allocation. For topdown allocations, we pick the
>> highest hole of sufficient size.
>>
>> For topdown allocations, we need to keep track of the highest
>> mapped VMA address, because it might be below mm->mmap_base,
>> and we only keep track of free space to the left of each VMA
>> in the VMA tree. It is tempting to try and keep track of
>> the free space to the right of each VMA when running in
>> topdown mode, but that gets us into trouble when running on
>> x86, where a process can switch direction in the middle of
>> execve.
>>
>> We have to leave the mm->free_area_cache and mm->largest_hole_size
>> in place for now, because the architecture specific versions still
>> use those.
>>
>
> Stick them under a config ARCH_USES_GENERIC_GET_UNMAPPED?

I cannot do that yet, because hugetlbfs still
uses its own get_unmapped_area.

Once Andi Kleen's "[PATCH] MM: Support more
pagesizes for MAP_HUGETLB/SHM_HUGETLB" patch is
in, I can get rid of the separate get_unmapped_area
functions for hugetlbfs.

> Worth mentioning in the changelog that the reduced search time means that
> we no longer need free_area_cache and instead do a search from the root
> RB node every time.

Will do.

>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>> index bf56d66..8ccb4e1 100644
>> --- a/include/linux/mm_types.h
>> +++ b/include/linux/mm_types.h
>> @@ -307,6 +307,7 @@ struct mm_struct {
>> unsigned long task_size; /* size of task vm space */
>> unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
>> unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
>> + unsigned long highest_vma; /* highest vma end address */
>
> With a name like highest_vma I would expect it to be a struct
> vm_area_struct. Name it highest_mapped_addr or something.

I went with the name Johannes came up with,
highest_vm_end :)

>> +static unsigned long node_free_hole(struct rb_node *node)
>> +{
>> + struct vm_area_struct *vma;
>> +
>> + if (!node)
>> + return 0;
>> +
>> + vma = container_of(node, struct vm_area_struct, vm_rb);
>
> What's wrong with using rb_to_vma?

Absolutely nothing. I just wrote this function before
I realized I would end up using container_of way too
much and created the rb_to_vma helper.

Fixed now. Thank you for spotting this one.

> return node ? rb_to_vma(node)->free_gap : 0;
>
>> + return vma->free_gap;
>> +}
>
> function calls it free_hole but vma calls it free_gap. Pick one or
> replace the function with an inlined one-liner.

I am now using free_gap everywhere. Thanks for
pointing out this inconsistency.

>> @@ -1456,13 +1477,29 @@ unacct_error:
>> * This function "knows" that -ENOMEM has the bits set.
>> */
>> #ifndef HAVE_ARCH_UNMAPPED_AREA
>> +struct rb_node *continue_next_right(struct rb_node *node)
>> +{
>
> I get the very strong impression that you were translating a lot of notes
> and diagrams into code because you appear to be immune to commenting :)
>
> This is not returning the next right node because to me that means you
> are walking down the tree. Here you are walking "up" the tree finding a
> node to the right
>
> /*
> * find_next_right_uncle - Find an "uncle" node to the "right"
> *
> * An "uncle" node is a sibling node of your parent and this function
> * returns an uncle to the right. Given the following basic tree, the
> * node U is an uncle of node C. P is C's parent and G is C's grandparent
> *
> *
> * G
> * / \
> * P U
> * \
> * C
> * This is necessary when searching
> * for a larger gap in the address space.
> */
>
> MUWUHAhaha, watch as I destroy your attempts to reduce line count in
> your diffstat.

Better yet, you have convinced me to add an additional
patch to the series, because this code is useful to have
in the generic augmented rbtree code.

Surely there must be other augmented rbtree users that
want to find something in the tree based on the augmented
data, i.e. not using the sort key as the primary search
criterion.
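
Roughly along these lines, as a first sketch (the helper name is made
up by me; the real thing would live next to the augmented rbtree code):

static struct rb_node *rb_next_right_subtree(struct rb_node *node)
{
        struct rb_node *prev;

        /*
         * Climb towards the root and return the first right-hand
         * subtree hanging off an ancestor, i.e. the "uncle to the
         * right" from your diagram.  Callers then descend into that
         * subtree guided by the augmented data.
         */
        while ((prev = node) && (node = rb_parent(node))) {
                if (prev == node->rb_right)
                        continue;       /* came up from the right, keep climbing */

                if (node->rb_right)
                        return node->rb_right;
        }

        return NULL;
}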

>> + unsigned long vma_start;
>> + int found_here = 0;
>> +
>
> bool.

Consider it done.

>> + vma = container_of(rb_node, struct vm_area_struct, vm_rb);
>>
>
> rb_to_vma

Got it.

2012-06-21 19:06:52

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH -mm 2/7] mm: get unmapped area from VMA tree

On 06/21/2012 05:01 AM, Johannes Weiner wrote:

>> + /* Go left if it looks promising. */
>> + if (node_free_hole(rb_node->rb_left) >= len &&
>> + vma->vm_start - len >= lower_limit) {
>> + rb_node = rb_node->rb_left;
>> + continue;
>
> If we already are at a vma whose start has a lower address than the
> overall length, does it make sense to check for a left hole?
> I.e. shouldn't this be inside the if (vma->vm_start > len) block?

You are right, I can move this in under the
conditional.
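
Roughly like this, I think - a sketch only, reusing the names from the
hunk above and assuming found_here has already been switched to bool:

        if (vma->vm_start > len) {
                if (!vma->vm_prev) {
                        /* Left-most VMA; the hole below it starts at lower_limit. */
                        if (vma->vm_start - len >= lower_limit) {
                                addr = lower_limit;
                                goto found_addr;
                        }
                } else {
                        /* Is the hole between vm_prev and this VMA large enough? */
                        vma_start = max(vma->vm_prev->vm_end, lower_limit);
                        if (vma->vm_start - len >= vma_start) {
                                addr = vma_start;
                                found_here = true;
                        }
                }

                /* Only now can a hole to the left possibly fit us. */
                if (node_free_hole(rb_node->rb_left) >= len &&
                    vma->vm_start - len >= lower_limit) {
                        rb_node = rb_node->rb_left;
                        continue;
                }
        }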

>> + if (!found_here && node_free_hole(rb_node->rb_left) >= len) {
>> + /* Last known hole is to the right of this subtree. */
>
> "to the left"

Actually, it is to the right. We walked left from
our parent to get here, so the holes found so far
are to the right of here.

2012-06-21 19:06:54

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On 06/19/2012 07:27 PM, Andrew Morton wrote:
> On Mon, 18 Jun 2012 18:05:23 -0400
> Rik van Riel <[email protected]> wrote:
>
>> From: Rik van Riel <[email protected]>
>>
>> Fix the x86-64 page colouring code to take pgoff into account.
>
> Could we please have a full description of what's wrong with the
> current code?

Here is a copy of the text I added to the changelog:


The old x86 code will always align the mmap
to aliasing boundaries, even if the program mmaps
the file with a non-zero pgoff.

If program A mmaps the file with pgoff 0, and
program B mmaps the file with pgoff 1, the old
code would align the mmaps, resulting in misaligned
pages:

A: 0123
B: 123

After this patch, they are aligned so the pages
line up:

A: 0123
B:  123
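
In other words, the alignment has to preserve the colour implied by
pgoff. A minimal sketch of the idea (the names here are placeholders,
not the actual code in the patch):

static unsigned long cache_align(unsigned long addr, unsigned long pgoff,
                                 unsigned long aliasing_mask)
{
        /* Round up to an aliasing boundary... */
        unsigned long aligned = (addr + aliasing_mask) & ~aliasing_mask;

        /* ...then add the colour that the file offset dictates. */
        return aligned + ((pgoff << PAGE_SHIFT) & aliasing_mask);
}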

>> Use the x86 and MIPS page colouring code as the basis for a generic
>> page colouring function.

Renamed to "cache alignment" at Andi's request.

>> Teach the generic arch_get_unmapped_area(_topdown) code to call the
>> page colouring code.
>>
>> Make sure that ALIGN_DOWN always aligns down, and ends up at the
>> right page colour.
>
> Some performance tests on the result would be interesting. iirc, we've
> often had trouble demonstrating much or any benefit from coloring.

On AMD Bulldozer, I do not know what the benefits are.

On ARM, MIPS, SPARC and SH, the main benefit is avoiding
data corruption :)

These architectures have VIPT caches on some CPU models,
and MAP_SHARED read-write mappings have to be properly
aligned to guarantee data consistency.
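
The constraint boils down to something like this (illustration only;
the mask name is a placeholder for whatever the architecture defines):

/*
 * On a VIPT cache with aliasing, two virtual mappings of the same
 * physical page stay coherent only if they land in the same cache
 * colour, i.e. if the address bits covered by the aliasing mask match.
 */
static bool same_cache_colour(unsigned long va1, unsigned long va2,
                              unsigned long aliasing_mask)
{
        return ((va1 ^ va2) & aliasing_mask) == 0;
}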

2012-06-21 19:22:58

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH -mm 4/7] mm: make page colouring code generic

On Thu, Jun 21, 2012 at 01:52:14PM -0400, Rik van Riel wrote:
> >Some performance tests on the result would be interesting. iirc, we've
> >often had trouble demonstrating much or any benefit from coloring.
>
> On AMD Bulldozer, I do not know what the benefits are.

I have arranged to run a bunch of benchmarks with and without your
patchset on Bulldozer, basically everything you can get in autotest.

Also, if you have any other microbenchmarks or tests you want run, ping
me.

Thanks.

--
Regards/Gruss,
Boris.

2012-06-21 21:06:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -mm 2/7] mm: get unmapped area from VMA tree

On Mon, 2012-06-18 at 18:05 -0400, Rik van Riel wrote:
> + /* Find the left-most free area of sufficient size. */


Just because there's nothing better than writing it yourself... I tried
writing something that does the above. The below is the result; it
doesn't use your uncle functions and is clearly limited to two
traversals, and thus trivially still O(log n). [ Although I think with a
bit of effort you can prove the same for your version. ]

---

static inline struct vm_area_struct *vma_of(struct rb_node *node)
{
return container_of(node, struct vm_area_struct, vm_rb);
}

static inline unsigned long max_gap_of(struct rb_node *node)
{
return vma_of(node)->free_gap;
}

static unsigned long gap_of(struct rb_node *node)
{
struct vm_area_struct *vma = vma_of(node);

if (!vma->vm_prev)
return vma->vm_start;

return vma->vm_start - vma->vm_prev->vm_end;
}

static bool node_better(struct rb_node *node, struct rb_node *best)
{
if (!best)
return true;

return vma_of(node)->vm_start < vma_of(best)->vm_start;
}

unsigned long find_leftmost_gap(struct mm_struct *mm, unsigned long len)
{
struct rb_node *node = mm->mm_rb.rb_node, *best = NULL, *tree = NULL;

/*
* Do a search for TASK_UNMAPPED_BASE + len; all nodes right of this
* boundary should be considered. Path nodes are immediate candidates;
* their right sub-tree is stored for later consideration in case
* the immediate path doesn't yield a suitable node.
*/
while (node) {
if (vma_of(node)->vm_start - len >= TASK_UNMAPPED_BASE) {
/*
* If our node has a big enough hole, track it.
*/
if (gap_of(node) >= len && node_better(node, best))
best = node;

/*
* In case we flunk out on the path nodes, keep track
* of the right sub-trees which have big enough holes.
*/
if (node->rb_right && max_gap_of(node->rb_right) >= len &&
node_better(node->rb_right, tree))
tree = node->rb_right;

node = node->rb_left;
continue;
}
node = node->rb_right;
}

if (best)
return vma_of(best)->vm_start - len;

/*
* Our stored subtree must be entirely right of TASK_UNMAPPED_BASE + len
* so do a simple search for leftmost hole of appropriate size.
*/
while (tree) {
if (gap_of(tree) >= len && node_better(tree, best))
best = tree;

if (tree->rb_left && max_gap_of(tree->rb_left) >= len) {
tree = tree->rb_left;
continue;
}

tree = tree->rb_right;
}

if (best)
return vma_of(best)->vm_start - len;

/*
* Ok, so neither a path node nor a stored right sub-tree had a properly
* sized hole we could use; use the rightmost address in the tree.
*/
node = mm->mm_rb.rb_node;
while (node && node->rb_right)
node = node->rb_right;

return max(vma_of(node)->vm_end, TASK_UNMAPPED_BASE);
}
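
For completeness, a sketch of how this could be wired into the
bottom-up path (MAP_FIXED and the address hint are ignored here, so
this is illustrative rather than a drop-in replacement):

unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                       unsigned long len, unsigned long pgoff,
                       unsigned long flags)
{
        struct mm_struct *mm = current->mm;

        if (len > TASK_SIZE)
                return -ENOMEM;

        addr = find_leftmost_gap(mm, len);

        /* The gap we found must still leave room below TASK_SIZE. */
        if (TASK_SIZE - len < addr)
                return -ENOMEM;

        return addr;
}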