2004-04-04 12:35:53

by Hugh Dickins

Subject: [PATCH] anobjrmap 9 priority mjb tree

This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
of anon_vma, and of course each tree has its own additional features.
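
For anyone skimming the diff below: the conversion repeated at each flush,
truncate and rmap site follows one pattern, sketched here. This is only an
illustration, not part of the patch; the helper name count_vmas_mapping_page
is invented, and the caller is assumed to hold mapping->i_shared_sem just as
the real call sites do. It uses the __vma_prio_tree_first/__vma_prio_tree_next
helpers added to include/linux/mm.h to do a [pgoff, pgoff] stabbing query,
instead of walking the whole i_mmap_shared list and skipping vmas that do not
cover the page.

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Illustrative only: count the vmas that map one file page. */
static int count_vmas_mapping_page(struct address_space *mapping,
				   struct page *page)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int count = 0;

	/* caller holds mapping->i_shared_sem */
	vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
					&iter, pgoff, pgoff);
	while (vma) {
		/* vma->vm_pgoff <= pgoff <= vma's last file page here */
		count++;
		vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
					&iter, pgoff, pgoff);
	}
	return count;
}

Nonlinear vmas are not on either tree; they sit on the new i_mmap_nonlinear
list and still have to be walked in full.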

arch/arm/mm/fault-armv.c | 80 ++---
arch/mips/mm/cache.c | 9
arch/parisc/kernel/cache.c | 86 ++---
arch/parisc/kernel/sys_parisc.c | 14
arch/s390/kernel/compat_exec.c | 2
arch/sparc64/mm/init.c | 8
arch/x86_64/ia32/ia32_binfmt.c | 2
fs/exec.c | 2
fs/hugetlbfs/inode.c | 14
fs/inode.c | 5
fs/locks.c | 8
fs/proc/task_mmu.c | 2
fs/xfs/linux/xfs_vnode.h | 5
include/asm-arm/cacheflush.h | 8
include/asm-parisc/cacheflush.h | 10
include/asm-sh/pgalloc.h | 5
include/linux/fs.h | 6
include/linux/mm.h | 167 +++++++++++
include/linux/prio_tree.h | 78 +++++
init/main.c | 2
kernel/fork.c | 4
kernel/kexec.c | 2
mm/Makefile | 3
mm/filemap.c | 3
mm/fremap.c | 14
mm/memory.c | 15 -
mm/mmap.c | 100 +++---
mm/mremap.c | 42 ++
mm/page_io.c | 4
mm/prio_tree.c | 577 ++++++++++++++++++++++++++++++++++++++++
mm/rmap.c | 164 ++++++-----
mm/shmem.c | 3
mm/vmscan.c | 6
33 files changed, 1172 insertions(+), 278 deletions(-)

--- 2.6.5-rc3-mjb2/arch/arm/mm/fault-armv.c 2004-04-02 21:01:43.725406016 +0100
+++ anobjrmap9/arch/arm/mm/fault-armv.c 2004-04-04 13:05:41.369476056 +0100
@@ -16,6 +16,7 @@
#include <linux/bitops.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
+#include <linux/pagemap.h>

#include <asm/cacheflush.h>
#include <asm/io.h>
@@ -186,47 +187,47 @@ no_pmd:

void __flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
struct mm_struct *mm = current->active_mm;
- struct list_head *l;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;

__cpuc_flush_dcache_page(page_address(page));

- if (!page_mapping(page))
+ if (!mapping)
return;

/*
* With a VIVT cache, we need to also write back
* and invalidate any user data.
*/
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
*/
- if (mpnt->vm_mm != mm)
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
+ if (mpnt->vm_mm == mm) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
}

static void
make_coherent(struct vm_area_struct *vma, unsigned long addr, struct page *page, int dirty)
{
- struct list_head *l;
+ struct address_space *mapping = page->mapping;
struct mm_struct *mm = vma->vm_mm;
- unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;
int aliases = 0;

/*
@@ -234,36 +235,21 @@ make_coherent(struct vm_area_struct *vma
* space, then we need to handle them specially to maintain
* cache coherency.
*/
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
+ pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
- * Note that we intentionally don't mask out the VMA
+ * Note that we intentionally mask out the VMA
* that we are fixing up.
*/
- if (mpnt->vm_mm != mm || mpnt == vma)
- continue;
-
- /*
- * If the page isn't in this VMA, we can also ignore it.
- */
- if (pgoff < mpnt->vm_pgoff)
- continue;
-
- off = pgoff - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- off = mpnt->vm_start + (off << PAGE_SHIFT);
-
- /*
- * Ok, it is within mpnt. Fix it up.
- */
- aliases += adjust_pte(mpnt, off);
+ if (mpnt->vm_mm == mm && mpnt != vma) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
if (aliases)
adjust_pte(vma, addr);
--- 2.6.5-rc3-mjb2/arch/mips/mm/cache.c 2004-04-02 21:01:44.347311472 +0100
+++ anobjrmap9/arch/mips/mm/cache.c 2004-04-04 13:05:41.370475904 +0100
@@ -55,13 +55,14 @@ asmlinkage int sys_cacheflush(void *addr

void flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
unsigned long addr;

- if (page_mapping(page) &&
- list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (mapping &&
+ prio_tree_empty(&mapping->i_mmap) &&
+ prio_tree_empty(&mapping->i_mmap_shared) &&
+ list_empty(&mapping->i_mmap_nonlinear)) {
SetPageDcacheDirty(page);
-
return;
}

--- 2.6.5-rc3-mjb2/arch/parisc/kernel/cache.c 2004-04-02 21:01:44.356310104 +0100
+++ anobjrmap9/arch/parisc/kernel/cache.c 2004-04-04 13:05:41.371475752 +0100
@@ -17,6 +17,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/seq_file.h>
+#include <linux/pagemap.h>

#include <asm/pdc.h>
#include <asm/cache.h>
@@ -229,67 +230,60 @@ void disable_sr_hashing(void)

void __flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
struct mm_struct *mm = current->active_mm;
- struct list_head *l;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;

flush_kernel_dcache_page(page_address(page));

- if (!page_mapping(page))
+ if (!mapping)
return;
- /* check shared list first if it's not empty...it's usually
- * the shortest */
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;

- mpnt = list_entry(l, struct vm_area_struct, shared);
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

+ /* check shared list first if it's not empty...it's usually
+ * the shortest */
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
*/
- if (mpnt->vm_mm != mm)
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
-
- /* All user shared mappings should be equivalently mapped,
- * so once we've flushed one we should be ok
- */
- return;
+ if (mpnt->vm_mm == mm) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+
+ /* All user shared mappings should be equivalently
+ * mapped, so once we've flushed one we should be ok
+ */
+ return;
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}

/* then check private mapping list for read only shared mappings
* which are flagged by VM_MAYSHARE */
- list_for_each(l, &page->mapping->i_mmap) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
-
- if (mpnt->vm_mm != mm || !(mpnt->vm_flags & VM_MAYSHARE))
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
-
- /* All user shared mappings should be equivalently mapped,
- * so once we've flushed one we should be ok
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
+ /*
+ * If this VMA is not in our MM, we can ignore it.
*/
- break;
+ if (mpnt->vm_mm == mm && (mpnt->vm_flags & VM_MAYSHARE)) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+
+ /* All user shared mappings should be equivalently
+ * mapped, so once we've flushed one we should be ok
+ */
+ return;
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}
}
EXPORT_SYMBOL(__flush_dcache_page);
--- 2.6.5-rc3-mjb2/arch/parisc/kernel/sys_parisc.c 2004-03-30 13:04:00.000000000 +0100
+++ anobjrmap9/arch/parisc/kernel/sys_parisc.c 2004-04-04 13:05:41.372475600 +0100
@@ -68,17 +68,8 @@ static unsigned long get_unshared_area(u
* existing mapping and use the same offset. New scheme is to use the
* address of the kernel data structure as the seed for the offset.
* We'll see how that works...
- */
-#if 0
-static int get_offset(struct address_space *mapping)
-{
- struct vm_area_struct *vma = list_entry(mapping->i_mmap_shared.next,
- struct vm_area_struct, shared);
- return (vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT)) &
- (SHMLBA - 1);
-}
-#else
-/* The mapping is cacheline aligned, so there's no information in the bottom
+ *
+ * The mapping is cacheline aligned, so there's no information in the bottom
* few bits of the address. We're looking for 10 bits (4MB / 4k), so let's
* drop the bottom 8 bits and use bits 8-17.
*/
@@ -87,7 +78,6 @@ static int get_offset(struct address_spa
int offset = (unsigned long) mapping << (PAGE_SHIFT - 8);
return offset & 0x3FF000;
}
-#endif

static unsigned long get_shared_area(struct address_space *mapping,
unsigned long addr, unsigned long len, unsigned long pgoff)
--- 2.6.5-rc3-mjb2/arch/s390/kernel/compat_exec.c 2003-07-10 21:16:28.000000000 +0100
+++ anobjrmap9/arch/s390/kernel/compat_exec.c 2004-04-04 13:05:41.372475600 +0100
@@ -71,7 +71,7 @@ int setup_arg_pages32(struct linux_binpr
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/arch/sparc64/mm/init.c 2004-04-02 21:01:44.535282896 +0100
+++ anobjrmap9/arch/sparc64/mm/init.c 2004-04-04 13:05:41.375475144 +0100
@@ -224,12 +224,14 @@ void update_mmu_cache(struct vm_area_str

void flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
int dirty = test_bit(PG_dcache_dirty, &page->flags);
int dirty_cpu = dcache_dirty_cpu(page);

- if (page_mapping(page) &&
- list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (mapping &&
+ prio_tree_empty(&mapping->i_mmap) &&
+ prio_tree_empty(&mapping->i_mmap_shared) &&
+ list_empty(&mapping->i_mmap_nonlinear)) {
if (dirty) {
if (dirty_cpu == smp_processor_id())
return;
--- 2.6.5-rc3-mjb2/arch/x86_64/ia32/ia32_binfmt.c 2004-03-30 13:04:03.000000000 +0100
+++ anobjrmap9/arch/x86_64/ia32/ia32_binfmt.c 2004-04-04 13:05:41.375475144 +0100
@@ -360,7 +360,7 @@ int setup_arg_pages(struct linux_binprm
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/fs/exec.c 2004-04-02 21:01:45.785092896 +0100
+++ anobjrmap9/fs/exec.c 2004-04-04 13:05:41.377474840 +0100
@@ -423,7 +423,7 @@ int setup_arg_pages(struct linux_binprm
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/fs/hugetlbfs/inode.c 2004-04-02 21:01:45.794091528 +0100
+++ anobjrmap9/fs/hugetlbfs/inode.c 2004-04-04 13:05:41.378474688 +0100
@@ -270,11 +270,13 @@ static void hugetlbfs_drop_inode(struct
* vma->vm_pgoff is in PAGE_SIZE units.
*/
static void
-hugetlb_vmtruncate_list(struct list_head *list, unsigned long h_pgoff)
+hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
{
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;

- list_for_each_entry(vma, list, shared) {
+ vma = __vma_prio_tree_first(root, &iter, h_pgoff, ULONG_MAX);
+ while (vma) {
unsigned long h_vm_pgoff;
unsigned long v_length;
unsigned long h_length;
@@ -306,6 +308,8 @@ hugetlb_vmtruncate_list(struct list_head
zap_hugepage_range(vma,
vma->vm_start + v_offset,
v_length - v_offset);
+
+ vma = __vma_prio_tree_next(vma, root, &iter, h_pgoff, ULONG_MAX);
}
}

@@ -325,9 +329,11 @@ static int hugetlb_vmtruncate(struct ino

inode->i_size = offset;
down(&mapping->i_shared_sem);
- if (!list_empty(&mapping->i_mmap))
+ /* Protect against page fault */
+ atomic_inc(&mapping->truncate_count);
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
- if (!list_empty(&mapping->i_mmap_shared))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
up(&mapping->i_shared_sem);
truncate_hugepages(mapping, offset);
--- 2.6.5-rc3-mjb2/fs/inode.c 2004-03-30 13:04:15.000000000 +0100
+++ anobjrmap9/fs/inode.c 2004-04-04 13:05:41.380474384 +0100
@@ -189,8 +189,9 @@ void inode_init_once(struct inode *inode
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
- INIT_LIST_HEAD(&inode->i_data.i_mmap);
- INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
+ INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap_shared);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
spin_lock_init(&inode->i_lock);
i_size_ordered_init(inode);
}
--- 2.6.5-rc3-mjb2/fs/locks.c 2004-03-11 01:56:12.000000000 +0000
+++ anobjrmap9/fs/locks.c 2004-04-04 13:05:41.382474080 +0100
@@ -1455,8 +1455,8 @@ int fcntl_setlk(struct file *filp, unsig
if (IS_MANDLOCK(inode) &&
(inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
struct address_space *mapping = filp->f_mapping;
-
- if (!list_empty(&mapping->i_mmap_shared)) {
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear)) {
error = -EAGAIN;
goto out;
}
@@ -1593,8 +1593,8 @@ int fcntl_setlk64(struct file *filp, uns
if (IS_MANDLOCK(inode) &&
(inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
struct address_space *mapping = filp->f_mapping;
-
- if (!list_empty(&mapping->i_mmap_shared)) {
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear)) {
error = -EAGAIN;
goto out;
}
--- 2.6.5-rc3-mjb2/fs/proc/task_mmu.c 2004-04-02 21:01:45.864080888 +0100
+++ anobjrmap9/fs/proc/task_mmu.c 2004-04-04 13:05:41.383473928 +0100
@@ -65,7 +65,7 @@ int task_statm(struct mm_struct *mm, int
*shared += pages;
continue;
}
- if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared))
+ if (vma->vm_flags & VM_SHARED || !vma_shared_empty(vma))
*shared += pages;
if (vma->vm_flags & VM_EXECUTABLE)
*text += pages;
--- 2.6.5-rc3-mjb2/fs/xfs/linux/xfs_vnode.h 2004-02-04 02:45:43.000000000 +0000
+++ anobjrmap9/fs/xfs/linux/xfs_vnode.h 2004-04-04 13:05:41.384473776 +0100
@@ -597,8 +597,9 @@ static __inline__ void vn_flagclr(struct
* Some useful predicates.
*/
#define VN_MAPPED(vp) \
- (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
- (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
+ (!prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
+ !prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)) || \
+ !list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_nonlinear)))
#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages)
#define VN_DIRTY(vp) (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED)
--- 2.6.5-rc3-mjb2/include/asm-arm/cacheflush.h 2004-04-02 21:01:45.998060520 +0100
+++ anobjrmap9/include/asm-arm/cacheflush.h 2004-04-04 13:05:41.385473624 +0100
@@ -292,8 +292,12 @@ flush_cache_page(struct vm_area_struct *
* about to change to user space. This is the same method as used on SPARC64.
* See update_mmu_cache for the user space part.
*/
-#define mapping_mapped(map) (!list_empty(&(map)->i_mmap) || \
- !list_empty(&(map)->i_mmap_shared))
+static inline int mapping_mapped(struct address_space *mapping)
+{
+ return !prio_tree_empty(&mapping->i_mmap) ||
+ !prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear);
+}

extern void __flush_dcache_page(struct page *);

--- 2.6.5-rc3-mjb2/include/asm-parisc/cacheflush.h 2004-04-02 21:01:46.794939376 +0100
+++ anobjrmap9/include/asm-parisc/cacheflush.h 2004-04-04 13:05:41.385473624 +0100
@@ -65,12 +65,18 @@ flush_user_icache_range(unsigned long st
#endif
}

+static inline int mapping_mapped(struct address_space *mapping)
+{
+ return !prio_tree_empty(&mapping->i_mmap) ||
+ !prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear);
+}
+
extern void __flush_dcache_page(struct page *page);

static inline void flush_dcache_page(struct page *page)
{
- if (page_mapping(page) && list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (page_mapping(page) && !mapping_mapped(page->mapping)) {
set_bit(PG_dcache_dirty, &page->flags);
} else {
__flush_dcache_page(page);
--- 2.6.5-rc3-mjb2/include/asm-sh/pgalloc.h 2004-04-02 21:01:46.963913688 +0100
+++ anobjrmap9/include/asm-sh/pgalloc.h 2004-04-04 13:05:41.386473472 +0100
@@ -101,8 +101,9 @@ static inline pte_t ptep_get_and_clear(p
unsigned long pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
- if (!page_mapping(page)
- || list_empty(&page->mapping->i_mmap_shared))
+ if (!page_mapping(page) ||
+ (prio_tree_empty(&page->mapping->i_mmap_shared) &&
+ list_empty(&page->mapping->i_mmap_nonlinear)))
__clear_bit(PG_mapped, &page->flags);
}
}
--- 2.6.5-rc3-mjb2/include/linux/fs.h 2004-03-30 13:04:18.000000000 +0100
+++ anobjrmap9/include/linux/fs.h 2004-04-04 13:05:41.388473168 +0100
@@ -18,6 +18,7 @@
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/radix-tree.h>
+#include <linux/prio_tree.h>
#include <linux/kobject.h>
#include <asm/atomic.h>

@@ -329,8 +330,9 @@ struct address_space {
struct list_head io_pages; /* being prepared for I/O */
unsigned long nrpages; /* number of total pages */
struct address_space_operations *a_ops; /* methods */
- struct list_head i_mmap; /* list of private mappings */
- struct list_head i_mmap_shared; /* list of shared mappings */
+ struct prio_tree_root i_mmap; /* tree of private mappings */
+ struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
+ struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
struct semaphore i_shared_sem; /* protect both above lists */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
--- 2.6.5-rc3-mjb2/include/linux/mm.h 2004-04-02 21:01:47.316860032 +0100
+++ anobjrmap9/include/linux/mm.h 2004-04-04 13:05:41.390472864 +0100
@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
+#include <linux/prio_tree.h>
#include <linux/fs.h>

#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
@@ -68,7 +69,26 @@ struct vm_area_struct {
* one of the address_space->i_mmap{,shared} lists,
* for shm areas, the list of attaches, otherwise unused.
*/
- struct list_head shared;
+ union {
+ struct {
+ struct list_head list;
+ void *parent;
+ } vm_set;
+
+ struct prio_tree_node prio_tree_node;
+
+ struct {
+ void *first;
+ void *second;
+ void *parent;
+ } both;
+ } shared;
+
+ /*
+ * shared.vm_set : list of vmas that map exactly the same set of pages
+ * vm_set_head : head of the vm_set list
+ */
+ struct vm_area_struct *vm_set_head;

/* Function pointers to deal with this struct. */
struct vm_operations_struct * vm_ops;
@@ -130,6 +150,150 @@ struct vm_area_struct {
#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ)

/*
+ * The following macros are used for implementing prio_tree for i_mmap{,_shared}
+ */
+
+#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
+#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
+/* avoid overflow */
+#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
+
+#define GET_INDEX_VMA(vma, radix, heap) \
+do { \
+ radix = RADIX_INDEX(vma); \
+ heap = HEAP_INDEX(vma); \
+} while (0)
+
+#define GET_INDEX(node, radix, heap) \
+do { \
+ struct vm_area_struct *__tmp = \
+ prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\
+ GET_INDEX_VMA(__tmp, radix, heap); \
+} while (0)
+
+#define INIT_VMA_SHARED_LIST(vma) \
+do { \
+ INIT_LIST_HEAD(&(vma)->shared.vm_set.list); \
+ (vma)->shared.vm_set.parent = NULL; \
+ (vma)->vm_set_head = NULL; \
+} while (0)
+
+#define INIT_VMA_SHARED(vma) \
+do { \
+ (vma)->shared.both.first = NULL; \
+ (vma)->shared.both.second = NULL; \
+ (vma)->shared.both.parent = NULL; \
+ (vma)->vm_set_head = NULL; \
+} while (0)
+
+extern void __vma_prio_tree_insert(struct prio_tree_root *,
+ struct vm_area_struct *);
+
+extern void __vma_prio_tree_remove(struct prio_tree_root *,
+ struct vm_area_struct *);
+
+static inline int vma_shared_empty(struct vm_area_struct *vma)
+{
+ return vma->shared.both.first == NULL;
+}
+
+/*
+ * Helps to add a new vma that maps the same (identical) set of pages as the
+ * old vma to an i_mmap tree.
+ */
+static inline void __vma_prio_tree_add(struct vm_area_struct *vma,
+ struct vm_area_struct *old)
+{
+ INIT_VMA_SHARED_LIST(vma);
+
+ /* Leave these BUG_ONs till prio_tree patch stabilizes */
+ BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
+ BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
+
+ if (old->shared.both.parent) {
+ if (old->vm_set_head) {
+ list_add_tail(&vma->shared.vm_set.list,
+ &old->vm_set_head->shared.vm_set.list);
+ return;
+ }
+ else {
+ old->vm_set_head = vma;
+ vma->vm_set_head = old;
+ }
+ }
+ else
+ list_add(&vma->shared.vm_set.list, &old->shared.vm_set.list);
+}
+
+/*
+ * We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
+ * already present in an i_mmap{_shared} tree without modifying the tree. The
+ * following helper function should be used when such modifications are
+ * necessary. We should hold the mapping's i_shared_sem.
+ *
+ * This function can be (micro)optimized for some special cases (maybe later).
+ */
+static inline void __vma_modify(struct prio_tree_root *root,
+ struct vm_area_struct *vma, unsigned long start, unsigned long end,
+ unsigned long pgoff)
+{
+ if (root)
+ __vma_prio_tree_remove(root, vma);
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ if (root)
+ __vma_prio_tree_insert(root, vma);
+}
+
+/*
+ * Helper functions to enumerate vmas that map a given file page or a set of
+ * contiguous file pages. The functions return vmas that at least map a single
+ * page in the given range of contiguous file pages.
+ */
+static inline struct vm_area_struct *__vma_prio_tree_first(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+
+ ptr = prio_tree_first(root, iter, begin, end);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ if (vma->shared.both.parent) {
+ if (vma->vm_set_head)
+ return vma->vm_set_head;
+ }
+ else {
+ next = list_entry(vma->shared.vm_set.list.next,
+ struct vm_area_struct, shared.vm_set.list);
+ if (!(next->vm_set_head))
+ return next;
+ }
+
+ ptr = prio_tree_next(root, iter, begin, end);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
*/
@@ -520,7 +684,6 @@ extern void __vma_link_rb(struct mm_stru
struct rb_node **, struct rb_node *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
unsigned long addr, unsigned long len, unsigned long pgoff);
-extern void vma_relink_file(struct vm_area_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
--- 2.6.5-rc3-mjb2/include/linux/prio_tree.h 1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap9/include/linux/prio_tree.h 2004-04-04 13:05:41.391472712 +0100
@@ -0,0 +1,78 @@
+#ifndef _LINUX_PRIO_TREE_H
+#define _LINUX_PRIO_TREE_H
+
+struct prio_tree_node {
+ struct prio_tree_node *left;
+ struct prio_tree_node *right;
+ struct prio_tree_node *parent;
+};
+
+struct prio_tree_root {
+ struct prio_tree_node *prio_tree_node;
+ unsigned int index_bits;
+};
+
+struct prio_tree_iter {
+ struct prio_tree_node *cur;
+ unsigned long mask;
+ unsigned long value;
+ int size_level;
+};
+
+#define PRIO_TREE_ROOT (struct prio_tree_root) {NULL, 1}
+
+#define PRIO_TREE_ROOT_INIT {NULL, 1}
+
+#define INIT_PRIO_TREE_ROOT(ptr) \
+do { \
+ (ptr)->prio_tree_node = NULL; \
+ (ptr)->index_bits = 1; \
+} while (0)
+
+#define PRIO_TREE_NODE_INIT(name) {&(name), &(name), &(name)}
+
+#define PRIO_TREE_NODE(name) \
+ struct prio_tree_node name = PRIO_TREE_NODE_INIT(name)
+
+#define INIT_PRIO_TREE_NODE(ptr) \
+do { \
+ (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \
+} while (0)
+
+#define prio_tree_entry(ptr, type, member) \
+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+#define PRIO_TREE_ITER (struct prio_tree_iter) {NULL, 0UL, 0UL, 0}
+
+static inline int prio_tree_empty(const struct prio_tree_root *root)
+{
+ return root->prio_tree_node == NULL;
+}
+
+static inline int prio_tree_root(const struct prio_tree_node *node)
+{
+ return node->parent == node;
+}
+
+static inline int prio_tree_left_empty(const struct prio_tree_node *node)
+{
+ return node->left == node;
+}
+
+static inline int prio_tree_right_empty(const struct prio_tree_node *node)
+{
+ return node->right == node;
+}
+
+extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *,
+ struct prio_tree_node *);
+
+extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);
+
+extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *,
+ struct prio_tree_iter *, unsigned long, unsigned long);
+
+extern struct prio_tree_node *prio_tree_next(struct prio_tree_root *,
+ struct prio_tree_iter *, unsigned long, unsigned long);
+
+#endif
--- 2.6.5-rc3-mjb2/init/main.c 2004-04-02 21:01:47.345855624 +0100
+++ anobjrmap9/init/main.c 2004-04-04 13:05:41.392472560 +0100
@@ -85,6 +85,7 @@ extern void buffer_init(void);
extern void pidhash_init(void);
extern void pidmap_init(void);
extern void radix_tree_init(void);
+extern void prio_tree_init(void);
extern void free_initmem(void);
extern void populate_rootfs(void);
extern void driver_init(void);
@@ -466,6 +467,7 @@ asmlinkage void __init start_kernel(void
calibrate_delay();
pidmap_init();
pgtable_cache_init();
+ prio_tree_init();
#ifdef CONFIG_X86
if (efi_enabled)
efi_enter_virtual_mode();
--- 2.6.5-rc3-mjb2/kernel/fork.c 2004-04-02 21:01:47.350854864 +0100
+++ anobjrmap9/kernel/fork.c 2004-04-04 13:05:41.394472256 +0100
@@ -314,7 +314,7 @@ static inline int dup_mmap(struct mm_str
tmp->vm_mm = mm;
tmp->vm_next = NULL;
file = tmp->vm_file;
- INIT_LIST_HEAD(&tmp->shared);
+ INIT_VMA_SHARED(tmp);
if (file) {
struct inode *inode = file->f_dentry->d_inode;
get_file(file);
@@ -323,7 +323,7 @@ static inline int dup_mmap(struct mm_str

/* insert tmp into the share list, just after mpnt */
down(&file->f_mapping->i_shared_sem);
- list_add(&tmp->shared, &mpnt->shared);
+ __vma_prio_tree_add(tmp, mpnt);
up(&file->f_mapping->i_shared_sem);
}

--- 2.6.5-rc3-mjb2/kernel/kexec.c 2004-04-02 21:01:47.354854256 +0100
+++ anobjrmap9/kernel/kexec.c 2004-04-04 13:05:41.395472104 +0100
@@ -191,7 +191,7 @@ static int identity_map_pages(struct pag
vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
vma->vm_file = NULL;
vma->vm_private_data = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);
insert_vm_struct(mm, vma);

error = remap_page_range(vma, vma->vm_start, vma->vm_start,
--- 2.6.5-rc3-mjb2/mm/Makefile 2004-04-02 21:01:47.392848480 +0100
+++ anobjrmap9/mm/Makefile 2004-04-04 13:05:41.395472104 +0100
@@ -9,7 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o

obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o readahead.o \
- slab.o swap.o truncate.o vmscan.o $(mmu-y)
+ slab.o swap.o truncate.o vmscan.o prio_tree.o \
+ $(mmu-y)

obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_X86_4G) += usercopy.o
--- 2.6.5-rc3-mjb2/mm/filemap.c 2004-04-02 21:01:47.395848024 +0100
+++ anobjrmap9/mm/filemap.c 2004-04-04 13:05:41.397471800 +0100
@@ -632,7 +632,8 @@ page_ok:
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear))
flush_dcache_page(page);

/*
--- 2.6.5-rc3-mjb2/mm/fremap.c 2004-04-02 21:01:47.397847720 +0100
+++ anobjrmap9/mm/fremap.c 2004-04-04 13:05:41.398471648 +0100
@@ -157,6 +157,8 @@ asmlinkage long sys_remap_file_pages(uns
unsigned long __prot, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct address_space *mapping;
+ unsigned long linear_pgoff;
unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
@@ -197,8 +199,18 @@ asmlinkage long sys_remap_file_pages(uns
end <= vma->vm_end) {

/* Must set VM_NONLINEAR before any pages are populated. */
- if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
+ linear_pgoff = vma->vm_pgoff;
+ linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT);
+ if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) {
+ mapping = vma->vm_file->f_mapping;
+ down(&mapping->i_shared_sem);
vma->vm_flags |= VM_NONLINEAR;
+ __vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
+ INIT_VMA_SHARED_LIST(vma);
+ list_add_tail(&vma->shared.vm_set.list,
+ &mapping->i_mmap_nonlinear);
+ up(&mapping->i_shared_sem);
+ }

/* ->populate can take a long time, so downgrade the lock. */
downgrade_write(&mm->mmap_sem);
--- 2.6.5-rc3-mjb2/mm/memory.c 2004-04-02 21:01:47.402846960 +0100
+++ anobjrmap9/mm/memory.c 2004-04-04 13:05:41.400471344 +0100
@@ -1077,11 +1077,11 @@ no_new_page:
* An hlen of zero blows away the entire portion file after hba.
*/
static void
-invalidate_mmap_range_list(struct list_head *head,
+invalidate_mmap_range_list(struct prio_tree_root *root,
unsigned long const hba,
unsigned long const hlen)
{
- struct list_head *curr;
+ struct prio_tree_iter iter;
unsigned long hea; /* last page of hole. */
unsigned long vba;
unsigned long vea; /* last page of corresponding uva hole. */
@@ -1092,17 +1092,16 @@ invalidate_mmap_range_list(struct list_h
hea = hba + hlen - 1; /* avoid overflow. */
if (hea < hba)
hea = ULONG_MAX;
- list_for_each(curr, head) {
- vp = list_entry(curr, struct vm_area_struct, shared);
+ vp = __vma_prio_tree_first(root, &iter, hba, hea);
+ while(vp) {
vba = vp->vm_pgoff;
vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1;
- if (hea < vba || vea < hba)
- continue; /* Mapping disjoint from hole. */
zba = (hba <= vba) ? vba : hba;
zea = (vea <= hea) ? vea : hea;
zap_page_range(vp,
((zba - vba) << PAGE_SHIFT) + vp->vm_start,
(zea - zba + 1) << PAGE_SHIFT);
+ vp = __vma_prio_tree_next(vp, root, &iter, hba, hea);
}
}

@@ -1137,9 +1136,9 @@ void invalidate_mmap_range(struct addres
down(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
- if (unlikely(!list_empty(&mapping->i_mmap)))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
- if (unlikely(!list_empty(&mapping->i_mmap_shared)))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
up(&mapping->i_shared_sem);
}
--- 2.6.5-rc3-mjb2/mm/mmap.c 2004-04-02 21:01:47.406846352 +0100
+++ anobjrmap9/mm/mmap.c 2004-04-04 13:05:41.403470888 +0100
@@ -68,12 +68,20 @@ int mmap_hugepages_map_sz = 256;
* Requires inode->i_mapping->i_shared_sem
*/
static inline void
-__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
+__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode,
+ struct address_space * mapping)
{
if (inode) {
if (vma->vm_flags & VM_DENYWRITE)
atomic_inc(&inode->i_writecount);
- list_del_init(&vma->shared);
+ if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+ list_del_init(&vma->shared.vm_set.list);
+ INIT_VMA_SHARED(vma);
+ }
+ else if (vma->vm_flags & VM_SHARED)
+ __vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
+ else
+ __vma_prio_tree_remove(&mapping->i_mmap, vma);
}
}

@@ -87,7 +95,8 @@ static void remove_shared_vm_struct(stru
if (file) {
struct address_space *mapping = file->f_mapping;
down(&mapping->i_shared_sem);
- __remove_shared_vm_struct(vma, file->f_dentry->d_inode);
+ __remove_shared_vm_struct(vma, file->f_dentry->d_inode,
+ mapping);
up(&mapping->i_shared_sem);
}
}
@@ -261,10 +270,15 @@ static inline void __vma_link_file(struc
if (vma->vm_flags & VM_DENYWRITE)
atomic_dec(&file->f_dentry->d_inode->i_writecount);

- if (vma->vm_flags & VM_SHARED)
- list_add_tail(&vma->shared, &mapping->i_mmap_shared);
+ if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+ INIT_VMA_SHARED_LIST(vma);
+ list_add_tail(&vma->shared.vm_set.list,
+ &mapping->i_mmap_nonlinear);
+ }
+ else if (vma->vm_flags & VM_SHARED)
+ __vma_prio_tree_insert(&mapping->i_mmap_shared, vma);
else
- list_add_tail(&vma->shared, &mapping->i_mmap);
+ __vma_prio_tree_insert(&mapping->i_mmap, vma);
}
}

@@ -393,7 +407,9 @@ static struct vm_area_struct *vma_merge(
{
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
+ struct address_space *mapping = file ? file->f_mapping : NULL;
struct semaphore *i_shared_sem;
+ struct prio_tree_root *root = NULL;

/*
* We later require that vma->vm_flags == vm_flags, so this tests
@@ -404,6 +420,15 @@ static struct vm_area_struct *vma_merge(

i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;

+ if (mapping) {
+ if (vm_flags & VM_SHARED) {
+ if (likely(!(vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ }
+
if (!prev) {
prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
goto merge_next;
@@ -423,18 +448,18 @@ static struct vm_area_struct *vma_merge(
need_up = 1;
}
spin_lock(lock);
- prev->vm_end = end;

/*
* OK, it did. Can we now merge in the successor as well?
*/
next = prev->vm_next;
- if (next && prev->vm_end == next->vm_start &&
+ if (next && end == next->vm_start &&
can_vma_merge_before(next, vm_flags, file,
pgoff, (end - addr) >> PAGE_SHIFT)) {
- prev->vm_end = next->vm_end;
__vma_unlink(mm, next, prev);
- __remove_shared_vm_struct(next, inode);
+ __vma_modify(root, prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff);
+ __remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
if (need_up)
up(i_shared_sem);
@@ -445,6 +470,8 @@ static struct vm_area_struct *vma_merge(
kmem_cache_free(vm_area_cachep, next);
return prev;
}
+
+ __vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
if (need_up)
up(i_shared_sem);
@@ -464,8 +491,8 @@ static struct vm_area_struct *vma_merge(
if (file)
down(i_shared_sem);
spin_lock(lock);
- prev->vm_start = addr;
- prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
+ __vma_modify(root, prev, addr, prev->vm_end,
+ prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
if (file)
up(i_shared_sem);
@@ -698,7 +725,7 @@ munmap_back:
vma->vm_file = NULL;
vma->vm_private_data = NULL;
vma->vm_next = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);

if (file) {
error = -EINVAL;
@@ -1289,6 +1316,7 @@ int split_vma(struct mm_struct * mm, str
{
struct vm_area_struct *new;
struct address_space *mapping = NULL;
+ struct prio_tree_root *root = NULL;

if (mm->map_count >= MAX_MAP_COUNT)
return -ENOMEM;
@@ -1300,7 +1328,7 @@ int split_vma(struct mm_struct * mm, str
/* most fields are the same, copy all, and then fixup */
*new = *vma;

- INIT_LIST_HEAD(&new->shared);
+ INIT_VMA_SHARED(new);

if (new_below)
new->vm_end = addr;
@@ -1315,18 +1343,25 @@ int split_vma(struct mm_struct * mm, str
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);

- if (vma->vm_file)
+ if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
+ if (vma->vm_flags & VM_SHARED) {
+ if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ }

if (mapping)
down(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);

- if (new_below) {
- vma->vm_start = addr;
- vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
- } else
- vma->vm_end = addr;
+ if (new_below)
+ __vma_modify(root, vma, addr, vma->vm_end,
+ vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT));
+ else
+ __vma_modify(root, vma, vma->vm_start, addr, vma->vm_pgoff);

__insert_vm_struct(mm, new);

@@ -1499,7 +1534,7 @@ unsigned long do_brk(unsigned long addr,
vma->vm_pgoff = 0;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);

vma_link(mm, vma, prev, rb_link, rb_parent);

@@ -1597,7 +1632,7 @@ struct vm_area_struct *copy_vma(struct v
new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (new_vma) {
*new_vma = *vma;
- INIT_LIST_HEAD(&new_vma->shared);
+ INIT_VMA_SHARED(new_vma);
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
@@ -1610,24 +1645,3 @@ struct vm_area_struct *copy_vma(struct v
}
return new_vma;
}
-
-/*
- * Position vma after prev in shared file list:
- * for mremap move error recovery racing against vmtruncate.
- */
-void vma_relink_file(struct vm_area_struct *vma, struct vm_area_struct *prev)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct address_space *mapping;
-
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- if (mapping) {
- down(&mapping->i_shared_sem);
- spin_lock(&mm->page_table_lock);
- list_move(&vma->shared, &prev->shared);
- spin_unlock(&mm->page_table_lock);
- up(&mapping->i_shared_sem);
- }
- }
-}
--- 2.6.5-rc3-mjb2/mm/mremap.c 2004-04-02 21:01:47.409845896 +0100
+++ anobjrmap9/mm/mremap.c 2004-04-04 13:05:41.405470584 +0100
@@ -237,6 +237,7 @@ static int move_page_tables(struct vm_ar
* only a few pages.. This also makes error recovery easier.
*/
while (offset < len) {
+ cond_resched();
ret = move_one_page(vma, old_addr+offset, new_addr+offset);
if (!ret) {
offset += PAGE_SIZE;
@@ -266,6 +267,7 @@ static unsigned long move_vma(struct vm_
unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping = NULL;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
@@ -285,26 +287,31 @@ static unsigned long move_vma(struct vm_
if (!new_vma)
return -ENOMEM;

+ if (vma->vm_file) {
+ /*
+ * Subtle point from Rajesh Venkatasubramanian: before
+ * moving file-based ptes, we must lock vmtruncate out,
+ * since it might clean the dst vma before the src vma,
+ * and we propagate stale pages into the dst afterward.
+ */
+ mapping = vma->vm_file->f_mapping;
+ down(&mapping->i_shared_sem);
+ }
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
- *
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, move new_vma before old vma
- * in the i_mmap or i_mmap_shared list, so when racing
- * against vmtruncate we cannot propagate pages to be
- * truncated back from new_vma into just cleaned old.
*/
- vma_relink_file(vma, new_vma);
move_page_tables(new_vma, old_addr, new_addr, moved_len);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
}
+ if (mapping)
+ up(&mapping->i_shared_sem);

/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -351,6 +358,8 @@ unsigned long do_mremap(unsigned long ad
unsigned long flags, unsigned long new_addr)
{
struct vm_area_struct *vma;
+ struct address_space *mapping = NULL;
+ struct prio_tree_root *root = NULL;
unsigned long ret = -EINVAL;
unsigned long charged = 0;

@@ -458,9 +467,26 @@ unsigned long do_mremap(unsigned long ad
/* can we just expand the current mapping? */
if (max_addr - addr >= new_len) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
+
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ if (vma->vm_flags & VM_SHARED) {
+ if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ down(&mapping->i_shared_sem);
+ }
+
spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_end = addr + new_len;
+ __vma_modify(root, vma, vma->vm_start,
+ addr + new_len, vma->vm_pgoff);
spin_unlock(&vma->vm_mm->page_table_lock);
+
+ if (mapping)
+ up(&mapping->i_shared_sem);
+
current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
current->mm->locked_vm += pages;
--- 2.6.5-rc3-mjb2/mm/page_io.c 2004-04-02 21:01:47.416844832 +0100
+++ anobjrmap9/mm/page_io.c 2004-04-04 13:05:41.405470584 +0100
@@ -135,12 +135,14 @@ out:
int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
{
int ret;
+ unsigned long save_private;
struct writeback_control swap_wbc = {
.sync_mode = WB_SYNC_ALL,
};

lock_page(page);
SetPageSwapCache(page);
+ save_private = page->private;
page->private = entry.val;

if (rw == READ) {
@@ -150,7 +152,9 @@ int rw_swap_page_sync(int rw, swp_entry_
ret = swap_writepage(page, &swap_wbc);
wait_on_page_writeback(page);
}
+
ClearPageSwapCache(page);
+ page->private = save_private;
if (ret == 0 && (!PageUptodate(page) || PageError(page)))
ret = -EIO;
return ret;
--- 2.6.5-rc3-mjb2/mm/prio_tree.c 1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap9/mm/prio_tree.c 2004-04-04 13:05:41.408470128 +0100
@@ -0,0 +1,577 @@
+/*
+ * mm/prio_tree.c - priority search tree for mapping->i_mmap{,_shared}
+ *
+ * Copyright (C) 2004, Rajesh Venkatasubramanian <[email protected]>
+ *
+ * Based on the radix priority search tree proposed by Edward M. McCreight
+ * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
+ *
+ * 02Feb2004 Initial version
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/prio_tree.h>
+
+/*
+ * A clever mix of heap and radix trees forms a radix priority search tree (PST)
+ * which is useful for storing intervals, e.g., we can consider a vma as a closed
+ * interval of file pages [offset_begin, offset_end], and store all vmas that
+ * map a file in a PST. Then, using the PST, we can answer a stabbing query,
+ * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
+ * given input interval X (a set of consecutive file pages), in "O(log n + m)"
+ * time where 'log n' is the height of the PST, and 'm' is the number of stored
+ * intervals (vmas) that overlap (map) with the input interval X (the set of
+ * consecutive file pages).
+ *
+ * In our implementation, we store closed intervals of the form [radix_index,
+ * heap_index]. We always assume that radix_index <= heap_index. McCreight's PST
+ * is designed for storing intervals with unique radix indices, i.e., each
+ * interval has a different radix_index. However, this limitation can be easily
+ * overcome by using the size, i.e., heap_index - radix_index, as part of the
+ * index, so we index the tree using [(radix_index,size), heap_index].
+ *
+ * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
+ * machine, the maximum height of a PST can be 64. We can use a balanced version
+ * of the priority search tree to optimize the tree height, but the balanced
+ * tree proposed by McCreight is too complex and memory-hungry for our purpose.
+ */
+
+static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
+
+/*
+ * Maximum heap_index that can be stored in a PST with index_bits bits
+ */
+static inline unsigned long prio_tree_maxindex(unsigned int bits)
+{
+ return index_bits_to_maxindex[bits - 1];
+}
+
+/*
+ * Extend a priority search tree so that it can store a node with heap_index
+ * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
+ * However, this function is used rarely and the common case performance is
+ * not bad.
+ */
+static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
+ struct prio_tree_node *node, unsigned long max_heap_index)
+{
+ struct prio_tree_node *first = NULL, *prev, *last = NULL;
+
+ if (max_heap_index > prio_tree_maxindex(root->index_bits))
+ root->index_bits++;
+
+ while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
+ root->index_bits++;
+
+ if (prio_tree_empty(root))
+ continue;
+
+ if (first == NULL) {
+ first = root->prio_tree_node;
+ prio_tree_remove(root, root->prio_tree_node);
+ INIT_PRIO_TREE_NODE(first);
+ last = first;
+ }
+ else {
+ prev = last;
+ last = root->prio_tree_node;
+ prio_tree_remove(root, root->prio_tree_node);
+ INIT_PRIO_TREE_NODE(last);
+ prev->left = last;
+ last->parent = prev;
+ }
+ }
+
+ INIT_PRIO_TREE_NODE(node);
+
+ if (first) {
+ node->left = first;
+ first->parent = node;
+ }
+ else
+ last = node;
+
+ if (!prio_tree_empty(root)) {
+ last->left = root->prio_tree_node;
+ last->left->parent = last;
+ }
+
+ root->prio_tree_node = node;
+ return node;
+}
+
+/*
+ * Replace a prio_tree_node with a new node and return the old node
+ */
+static inline struct prio_tree_node *prio_tree_replace(
+ struct prio_tree_root *root, struct prio_tree_node *old,
+ struct prio_tree_node *node)
+{
+ INIT_PRIO_TREE_NODE(node);
+
+ if (prio_tree_root(old)) {
+ BUG_ON(root->prio_tree_node != old);
+ /*
+ * We can reduce root->index_bits here. However, it is complex
+ * and does not help much to improve performance (IMO).
+ */
+ node->parent = node;
+ root->prio_tree_node = node;
+ }
+ else {
+ node->parent = old->parent;
+ if (old->parent->left == old)
+ old->parent->left = node;
+ else {
+ BUG_ON(old->parent->right != old);
+ old->parent->right = node;
+ }
+ }
+
+ if (!prio_tree_left_empty(old)) {
+ node->left = old->left;
+ old->left->parent = node;
+ }
+
+ if (!prio_tree_right_empty(old)) {
+ node->right = old->right;
+ old->right->parent = node;
+ }
+
+ return old;
+}
+
+#undef swap
+#define swap(x,y,z) do {z = x; x = y; y = z; } while (0)
+
+/*
+ * Insert a prio_tree_node @node into a radix priority search tree @root. The
+ * algorithm typically takes O(log n) time where 'log n' is the number of bits
+ * required to represent the maximum heap_index. In the worst case, the algo
+ * can take O((log n)^2) - check prio_tree_expand.
+ *
+ * If a prior node with same radix_index and heap_index is already found in
+ * the tree, then returns the address of the prior node. Otherwise, inserts
+ * @node into the tree and returns @node.
+ */
+
+struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
+ struct prio_tree_node *node)
+{
+ struct prio_tree_node *cur, *res = node;
+ unsigned long radix_index, heap_index;
+ unsigned long r_index, h_index, index, mask;
+ int size_flag = 0;
+
+ GET_INDEX(node, radix_index, heap_index);
+
+ if (prio_tree_empty(root) ||
+ heap_index > prio_tree_maxindex(root->index_bits))
+ return prio_tree_expand(root, node, heap_index);
+
+ cur = root->prio_tree_node;
+ mask = 1UL << (root->index_bits - 1);
+
+ while (mask) {
+ GET_INDEX(cur, r_index, h_index);
+
+ if (r_index == radix_index && h_index == heap_index)
+ return cur;
+
+ if (h_index < heap_index || (h_index == heap_index &&
+ r_index > radix_index))
+ {
+ struct prio_tree_node *tmp = node;
+ node = prio_tree_replace(root, cur, node);
+ cur = tmp;
+ swap(r_index, radix_index, index);
+ swap(h_index, heap_index, index);
+ }
+
+ if (size_flag)
+ index = heap_index - radix_index;
+ else
+ index = radix_index;
+
+ if (index & mask) {
+ if (prio_tree_right_empty(cur)) {
+ INIT_PRIO_TREE_NODE(node);
+ cur->right = node;
+ node->parent = cur;
+ return res;
+ }
+ else
+ cur = cur->right;
+ }
+ else {
+ if (prio_tree_left_empty(cur)) {
+ INIT_PRIO_TREE_NODE(node);
+ cur->left = node;
+ node->parent = cur;
+ return res;
+ }
+ else
+ cur = cur->left;
+ }
+
+ mask >>= 1;
+
+ if (!mask) {
+ mask = 1UL << (root->index_bits - 1);
+ size_flag = 1;
+ }
+ }
+ /* Should not reach here */
+ BUG();
+ return NULL;
+}
+
+/*
+ * Remove a prio_tree_node @node from a radix priority search tree @root. The
+ * algorithm takes O(log n) time where 'log n' is the number of bits required
+ * to represent the maximum heap_index.
+ */
+
+void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node)
+{
+ struct prio_tree_node *cur;
+ unsigned long r_index, h_index_right, h_index_left;
+
+ cur = node;
+
+ while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
+ if (!prio_tree_left_empty(cur))
+ GET_INDEX(cur->left, r_index, h_index_left);
+ else {
+ cur = cur->right;
+ continue;
+ }
+
+ if (!prio_tree_right_empty(cur))
+ GET_INDEX(cur->right, r_index, h_index_right);
+ else {
+ cur = cur->left;
+ continue;
+ }
+
+ /* both h_index_left and h_index_right cannot be 0 */
+ if (h_index_left >= h_index_right)
+ cur = cur->left;
+ else
+ cur = cur->right;
+ }
+
+ if (prio_tree_root(cur)) {
+ BUG_ON(root->prio_tree_node != cur);
+ *root = PRIO_TREE_ROOT;
+ return;
+ }
+
+ if (cur->parent->right == cur)
+ cur->parent->right = cur->parent;
+ else {
+ BUG_ON(cur->parent->left != cur);
+ cur->parent->left = cur->parent;
+ }
+
+ while (cur != node)
+ cur = prio_tree_replace(root, cur->parent, cur);
+
+ return;
+}
+
+/*
+ * Following functions help to enumerate all prio_tree_nodes in the tree that
+ * overlap with the input interval X [radix_index, heap_index]. The enumeration
+ * takes O(log n + m) time where 'log n' is the height of the tree (which is
+ * proportional to # of bits required to represent the maximum heap_index) and
+ * 'm' is the number of prio_tree_nodes that overlap the interval X.
+ */
+
+static inline struct prio_tree_node *__prio_tree_left(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long radix_index, unsigned long heap_index,
+ unsigned long *r_index, unsigned long *h_index)
+{
+ if (prio_tree_left_empty(iter->cur))
+ return NULL;
+
+ GET_INDEX(iter->cur->left, *r_index, *h_index);
+
+ if (radix_index <= *h_index) {
+ iter->cur = iter->cur->left;
+ iter->mask >>= 1;
+ if (iter->mask) {
+ if (iter->size_level)
+ iter->size_level++;
+ }
+ else {
+ iter->size_level = 1;
+ iter->mask = 1UL << (root->index_bits - 1);
+ }
+ return iter->cur;
+ }
+
+ return NULL;
+}
+
+
+static inline struct prio_tree_node *__prio_tree_right(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long radix_index, unsigned long heap_index,
+ unsigned long *r_index, unsigned long *h_index)
+{
+ unsigned long value;
+
+ if (prio_tree_right_empty(iter->cur))
+ return NULL;
+
+ if (iter->size_level)
+ value = iter->value;
+ else
+ value = iter->value | iter->mask;
+
+ if (heap_index < value)
+ return NULL;
+
+ GET_INDEX(iter->cur->right, *r_index, *h_index);
+
+ if (radix_index <= *h_index) {
+ iter->cur = iter->cur->right;
+ iter->mask >>= 1;
+ iter->value = value;
+ if (iter->mask) {
+ if (iter->size_level)
+ iter->size_level++;
+ }
+ else {
+ iter->size_level = 1;
+ iter->mask = 1UL << (root->index_bits - 1);
+ }
+ return iter->cur;
+ }
+
+ return NULL;
+}
+
+static inline struct prio_tree_node *__prio_tree_parent(
+ struct prio_tree_iter *iter)
+{
+ iter->cur = iter->cur->parent;
+ iter->mask <<= 1;
+ if (iter->size_level) {
+ if (iter->size_level == 1)
+ iter->mask = 1UL;
+ iter->size_level--;
+ }
+ else if (iter->value & iter->mask)
+ iter->value ^= iter->mask;
+ return iter->cur;
+}
+
+static inline int overlap(unsigned long radix_index, unsigned long heap_index,
+ unsigned long r_index, unsigned long h_index)
+{
+ if (heap_index < r_index || radix_index > h_index)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * prio_tree_first:
+ *
+ * Get the first prio_tree_node that overlaps with the interval [radix_index,
+ * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
+ * traversal of the tree.
+ */
+struct prio_tree_node *prio_tree_first(struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ unsigned long r_index, h_index;
+
+ *iter = PRIO_TREE_ITER;
+
+ if (prio_tree_empty(root))
+ return NULL;
+
+ GET_INDEX(root->prio_tree_node, r_index, h_index);
+
+ if (radix_index > h_index)
+ return NULL;
+
+ iter->mask = 1UL << (root->index_bits - 1);
+ iter->cur = root->prio_tree_node;
+
+ while (1) {
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ if (__prio_tree_left(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ continue;
+
+ if (__prio_tree_right(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ continue;
+
+ break;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(prio_tree_first);
+
+/* Get the next prio_tree_node that overlaps with the input interval in iter */
+struct prio_tree_node *prio_tree_next(struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ unsigned long r_index, h_index;
+
+repeat:
+ while (__prio_tree_left(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ while (!__prio_tree_right(root, iter, radix_index, heap_index,
+ &r_index, &h_index)) {
+ while (!prio_tree_root(iter->cur) &&
+ iter->cur->parent->right == iter->cur)
+ __prio_tree_parent(iter);
+
+ if (prio_tree_root(iter->cur))
+ return NULL;
+
+ __prio_tree_parent(iter);
+ }
+
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ goto repeat;
+}
+EXPORT_SYMBOL(prio_tree_next);
+
+/*
+ * Radix priority search tree for address_space->i_mmap{,_shared}
+ *
+ * For each vma that maps a unique set of file pages, i.e., a unique
+ * [radix_index, heap_index] value, we have a corresponding priority search
+ * tree node. If
+ * multiple vmas have identical [radix_index, heap_index] value, then one of
+ * them is used as a tree node and others are stored in a vm_set list. The tree
+ * node points to the first vma (head) of the list using vm_set_head.
+ *
+ *      prio_tree_root
+ *           |
+ *           A       vm_set_head
+ *          / \      /
+ *         L   R -> H-I-J-K-M-N-O-P-Q-S
+ *         ^   ^    <-- vm_set.list -->
+ *       tree nodes
+ *
+ * We need some way to identify whether a vma is a tree node, head of a vm_set
+ * list, or just a member of a vm_set list. We cannot use vm_flags to store
+ * such information. The reason is, in the above figure, it is possible that
+ * vm_flags' of R and H are covered by the different mmap_sems. When R is
+ * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
+ * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
+ * That's why some trick involving shared.both.parent is used for identifying
+ * tree nodes and list head nodes. We can possibly use the least significant
+ * bit of the vm_set_head field to mark tree and list head nodes. I was worried
+ * about the alignment of vm_area_struct in various architectures.
+ *
+ * vma radix priority search tree node rules:
+ *
+ * vma->shared.both.parent != NULL ==> a tree node
+ *
+ * vma->shared.both.parent == NULL
+ * vma->vm_set_head != NULL ==> list head of vmas that map same pages
+ * vma->vm_set_head == NULL ==> a list node
+ */
+
+void __vma_prio_tree_insert(struct prio_tree_root *root,
+ struct vm_area_struct *vma)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *old;
+
+ ptr = prio_tree_insert(root, &vma->shared.prio_tree_node);
+
+ if (ptr == &vma->shared.prio_tree_node) {
+ vma->vm_set_head = NULL;
+ return;
+ }
+
+ old = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+
+ __vma_prio_tree_add(vma, old);
+}
+
+void __vma_prio_tree_remove(struct prio_tree_root *root,
+ struct vm_area_struct *vma)
+{
+ struct vm_area_struct *node, *head, *new_head;
+
+ if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) {
+ list_del_init(&vma->shared.vm_set.list);
+ INIT_VMA_SHARED(vma);
+ return;
+ }
+
+ if (vma->vm_set_head) {
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->vm_set_head->vm_set_head != vma);
+ if (vma->shared.both.parent) {
+ head = vma->vm_set_head;
+ if (!list_empty(&head->shared.vm_set.list)) {
+ new_head = list_entry(
+ head->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&head->shared.vm_set.list);
+ }
+ else
+ new_head = NULL;
+
+ prio_tree_replace(root, &vma->shared.prio_tree_node,
+ &head->shared.prio_tree_node);
+ head->vm_set_head = new_head;
+ if (new_head)
+ new_head->vm_set_head = head;
+
+ }
+ else {
+ node = vma->vm_set_head;
+ if (!list_empty(&vma->shared.vm_set.list)) {
+ new_head = list_entry(
+ vma->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&vma->shared.vm_set.list);
+ node->vm_set_head = new_head;
+ new_head->vm_set_head = node;
+ }
+ else
+ node->vm_set_head = NULL;
+ }
+ INIT_VMA_SHARED(vma);
+ return;
+ }
+
+ prio_tree_remove(root, &vma->shared.prio_tree_node);
+ INIT_VMA_SHARED(vma);
+}
+
+void __init prio_tree_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
+ index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
+ index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
+}
--- 2.6.5-rc3-mjb2/mm/rmap.c 2004-04-02 21:01:47.423843768 +0100
+++ anobjrmap9/mm/rmap.c 2004-04-04 13:05:41.411469672 +0100
@@ -154,19 +154,16 @@ static inline void clear_page_anon(struc
**/

/*
- * At what user virtual address is page expected in file-backed vma?
+ * At what user virtual address is pgoff expected in file-backed vma?
*/
-#define NOADDR (~0UL) /* impossible user virtual address */
static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct vm_area_struct *vma, unsigned long pgoff)
{
- unsigned long pgoff;
unsigned long address;

- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- return (address >= vma->vm_start && address < vma->vm_end)?
- address: NOADDR;
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ return address;
}

/**
@@ -182,8 +179,14 @@ static int page_referenced_one(struct pa
pte_t *pte;
int referenced = 0;

- if (!spin_trylock(&mm->page_table_lock))
+ if (!spin_trylock(&mm->page_table_lock)) {
+ /*
+ * For debug we're currently warning if not all found,
+ * but in this case that's expected: suppress warning.
+ */
+ *mapcount = -1;
return 0;
+ }

pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -246,6 +249,8 @@ static inline int page_referenced_anon(s
if (!*mapcount)
goto out;
}
+
+ WARN_ON(*mapcount > 0);
out:
spin_unlock(&anonhd->lock);
return referenced;
@@ -268,45 +273,54 @@ out:
static inline int page_referenced_obj(struct page *page, int *mapcount)
{
struct address_space *mapping = page->mapping;
+ unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
unsigned long address;
int referenced = 0;

if (down_trylock(&mapping->i_shared_sem))
return 0;

- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) ==
- (VM_LOCKED|VM_MAYSHARE)) {
+ vma = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
+ == (VM_LOCKED|VM_MAYSHARE)) {
referenced++;
goto out;
}
- referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
- goto out;
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}

- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (!vma->vm_mm->rss || (vma->vm_flags & VM_NONLINEAR))
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
+ vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (vma) {
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
referenced++;
goto out;
}
- referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
- goto out;
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ WARN_ON(*mapcount > 0);
out:
up(&mapping->i_shared_sem);
return referenced;
@@ -688,7 +702,9 @@ out:
static inline int try_to_unmap_obj(struct page *page, int *mapcount)
{
struct address_space *mapping = page->mapping;
+ unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
unsigned long address;
int ret = SWAP_AGAIN;
unsigned long cursor;
@@ -698,47 +714,50 @@ static inline int try_to_unmap_obj(struc
if (down_trylock(&mapping->i_shared_sem))
return ret;

- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ vma = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}

- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
- /*
- * Defer unmapping nonlinear to the next loop,
- * but take notes while we're here e.g. don't
- * want to loop again when no nonlinear vmas.
- */
- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
- continue;
- cursor = (unsigned long) vma->vm_private_data;
- if (cursor > max_nl_cursor)
- max_nl_cursor = cursor;
- cursor = vma->vm_end - vma->vm_start;
- if (cursor > max_nl_size)
- max_nl_size = cursor;
- continue;
+ vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
}
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}

- if (max_nl_size == 0) /* no nonlinear vmas of this file */
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto out;
+
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+ continue;
+ cursor = (unsigned long) vma->vm_private_data;
+ if (cursor > max_nl_cursor)
+ max_nl_cursor = cursor;
+ cursor = vma->vm_end - vma->vm_start;
+ if (cursor > max_nl_size)
+ max_nl_size = cursor;
+ }
+
+ if (max_nl_size == 0)
goto out;

/*
@@ -755,9 +774,9 @@ static inline int try_to_unmap_obj(struc
max_nl_cursor = CLUSTER_SIZE;

do {
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (VM_NONLINEAR != (vma->vm_flags &
- (VM_NONLINEAR|VM_LOCKED|VM_RESERVED)))
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
continue;
cursor = (unsigned long) vma->vm_private_data;
while (vma->vm_mm->rss &&
@@ -771,6 +790,7 @@ static inline int try_to_unmap_obj(struc
vma->vm_private_data = (void *) cursor;
if (*mapcount <= 0)
goto relock;
+ cond_resched();
}
if (ret != SWAP_FAIL)
vma->vm_private_data =
@@ -785,9 +805,9 @@ static inline int try_to_unmap_obj(struc
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if ((vma->vm_flags & (VM_NONLINEAR|VM_RESERVED)) ==
- VM_NONLINEAR)
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (!(vma->vm_flags & VM_RESERVED))
vma->vm_private_data = 0;
}
relock:
--- 2.6.5-rc3-mjb2/mm/shmem.c 2004-04-02 21:01:47.427843160 +0100
+++ anobjrmap9/mm/shmem.c 2004-04-04 13:05:41.413469368 +0100
@@ -1351,7 +1351,8 @@ static void do_shmem_file_read(struct fi
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear))
flush_dcache_page(page);
/*
* Mark the page accessed if we read the beginning.
--- 2.6.5-rc3-mjb2/mm/vmscan.c 2004-04-02 21:01:47.443840728 +0100
+++ anobjrmap9/mm/vmscan.c 2004-04-04 13:05:41.415469064 +0100
@@ -191,9 +191,11 @@ static inline int page_mapping_inuse(str
return 0;

/* File is mmap'd by somebody. */
- if (!list_empty(&mapping->i_mmap))
+ if (!prio_tree_empty(&mapping->i_mmap))
return 1;
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared))
+ return 1;
+ if (!list_empty(&mapping->i_mmap_nonlinear))
return 1;

return 0;
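
The classification rules in the mm/prio_tree.c comment above reduce to a
three-way test. Purely as an illustration (not part of the patch -- the enum
and helper name are invented; the fields are the ones the patch adds to
vm_area_struct):

#include <linux/mm.h>	/* struct vm_area_struct, as modified by this patch */

enum vma_set_role {
	VMA_TREE_NODE,		/* linked into the prio_tree itself */
	VMA_SET_LIST_HEAD,	/* heads the list of vmas mapping the same pages */
	VMA_SET_LIST_NODE	/* plain member of a vm_set list */
};

static inline enum vma_set_role vma_set_role(struct vm_area_struct *vma)
{
	if (vma->shared.both.parent != NULL)
		return VMA_TREE_NODE;
	if (vma->vm_set_head != NULL)
		return VMA_SET_LIST_HEAD;
	return VMA_SET_LIST_NODE;
}

Note that a tree node may itself have a non-NULL vm_set_head (pointing at the
head of its vm_set list), which is why the parent test must come first.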


2004-04-09 20:39:34

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> of anon_vma, and of course each tree has its own additional features.

This slows down kernel compile a little, but worse, it slows down SDET
by about 25% (on the 16x). I think you did something horrible to sem
contention ... presumably i_shared_sem, which SDET was fighting with
as it was anyway ;-(

Diffprofile shows:


122626 15.7% total
44129 790.0% __down
20988 4.1% default_idle
12101 550.3% __wake_up
11723 489.1% finish_task_switch
6988 77.4% do_wp_page
3983 21.7% copy_page_range
2683 19.2% zap_pte_range
2325 54.3% do_anonymous_page
2293 73.1% copy_mm
1787 68.3% remove_shared_vm_struct
1768 101.6% pte_alloc_one
1564 40.0% do_no_page
1520 50.8% do_page_fault
1376 39.2% clear_page_tables
1282 63.4% __copy_user_intel
926 9.4% page_remove_rmap
878 13.1% __copy_to_user_ll
835 46.8% __block_prepare_write
788 35.8% copy_process
777 0.0% __vma_prio_tree_remove
761 48.8% buffered_rmqueue
740 48.6% free_hot_cold_page
674 128.4% vma_link
641 0.0% __vma_prio_tree_insert
612 941.5% sched_clock
585 0.0% prio_tree_insert
563 60.4% exit_notify
547 225.1% split_vma
539 6.4% release_pages
534 464.3% schedule
495 32.0% release_task
422 148.1% flush_signal_handlers
421 66.6% find_vma
420 79.5% set_page_dirty
409 60.1% fput
359 44.5% __copy_from_user_ll
319 47.6% do_mmap_pgoff
290 254.4% find_vma_prepare
270 167.7% rb_insert_color
254 61.7% pte_alloc_map
251 91.3% exit_mmap
229 23.2% __read_lock_failed
228 9.9% filemap_nopage
...
-100 -29.3% group_reserve_blocks
-107 -53.5% .text.lock.namespace
-107 -18.4% render_sigset_t
-126 -18.7% mmgrab
-146 -10.9% generic_file_open
-166 -9.5% ext2_new_inode
-166 -38.1% d_path
-166 -20.1% __find_get_block_slow
-173 -20.7% proc_pid_status
-182 -19.3% update_atime
-185 -25.8% fd_install
-202 -13.8% .text.lock.highmem
-221 -14.5% __fput
-225 -14.3% number
-257 -14.2% proc_pid_stat
-284 -21.6% file_kill
-290 -35.3% proc_root_link
-300 -36.5% ext2_new_block
-349 -61.7% .text.lock.base
-382 -48.0% proc_check_root
-412 -19.4% path_release
-454 -20.0% file_move
-462 -32.2% lookup_mnt
-515 -4.5% find_get_page
-547 -34.5% .text.lock.dcache
-689 -31.2% follow_mount
-940 -33.8% .text.lock.dec_and_lock
-1043 -51.6% .text.lock.file_table
-1115 -9.9% __d_lookup
-1226 -20.1% path_lookup
-1305 -61.5% grab_block
-2101 -29.8% atomic_dec_and_lock
-2554 -40.3% .text.lock.filemap

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
-> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
race. That may be the reason for the slowdown, I am not sure.

Rajesh

On Fri, 9 Apr 2004, Martin J. Bligh wrote:

> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> > of anon_vma, and of course each tree has its own additional features.
>
> This slows down kernel compile a little, but worse, it slows down SDET
> by about 25% (on the 16x). I think you did something horrible to sem
> contention ... presumably i_shared_sem, which SDET was fighting with
> as it was anyway ;-(
>
> Diffprofile shows:
>
>
> 122626 15.7% total
> 44129 790.0% __down
> 20988 4.1% default_idle
> 12101 550.3% __wake_up
> 11723 489.1% finish_task_switch
> 6988 77.4% do_wp_page
> 3983 21.7% copy_page_range
> 2683 19.2% zap_pte_range
> 2325 54.3% do_anonymous_page
> 2293 73.1% copy_mm
> 1787 68.3% remove_shared_vm_struct
> 1768 101.6% pte_alloc_one
> 1564 40.0% do_no_page
> 1520 50.8% do_page_fault
> 1376 39.2% clear_page_tables
> 1282 63.4% __copy_user_intel
> 926 9.4% page_remove_rmap
> 878 13.1% __copy_to_user_ll
> 835 46.8% __block_prepare_write
> 788 35.8% copy_process
> 777 0.0% __vma_prio_tree_remove
> 761 48.8% buffered_rmqueue
> 740 48.6% free_hot_cold_page
> 674 128.4% vma_link
> 641 0.0% __vma_prio_tree_insert
> 612 941.5% sched_clock
> 585 0.0% prio_tree_insert
> 563 60.4% exit_notify
> 547 225.1% split_vma
> 539 6.4% release_pages
> 534 464.3% schedule
> 495 32.0% release_task
> 422 148.1% flush_signal_handlers
> 421 66.6% find_vma
> 420 79.5% set_page_dirty
> 409 60.1% fput
> 359 44.5% __copy_from_user_ll
> 319 47.6% do_mmap_pgoff
> 290 254.4% find_vma_prepare
> 270 167.7% rb_insert_color
> 254 61.7% pte_alloc_map
> 251 91.3% exit_mmap
> 229 23.2% __read_lock_failed
> 228 9.9% filemap_nopage
> ...
> -100 -29.3% group_reserve_blocks
> -107 -53.5% .text.lock.namespace
> -107 -18.4% render_sigset_t
> -126 -18.7% mmgrab
> -146 -10.9% generic_file_open
> -166 -9.5% ext2_new_inode
> -166 -38.1% d_path
> -166 -20.1% __find_get_block_slow
> -173 -20.7% proc_pid_status
> -182 -19.3% update_atime
> -185 -25.8% fd_install
> -202 -13.8% .text.lock.highmem
> -221 -14.5% __fput
> -225 -14.3% number
> -257 -14.2% proc_pid_stat
> -284 -21.6% file_kill
> -290 -35.3% proc_root_link
> -300 -36.5% ext2_new_block
> -349 -61.7% .text.lock.base
> -382 -48.0% proc_check_root
> -412 -19.4% path_release
> -454 -20.0% file_move
> -462 -32.2% lookup_mnt
> -515 -4.5% find_get_page
> -547 -34.5% .text.lock.dcache
> -689 -31.2% follow_mount
> -940 -33.8% .text.lock.dec_and_lock
> -1043 -51.6% .text.lock.file_table
> -1115 -9.9% __d_lookup
> -1226 -20.1% path_lookup
> -1305 -61.5% grab_block
> -2101 -29.8% atomic_dec_and_lock
> -2554 -40.3% .text.lock.filemap
>
>

2004-04-09 21:51:12

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> > of anon_vma, and of course each tree has its own additional features.
>
> This slows down kernel compile a little, but worse, it slows down SDET
> by about 25% (on the 16x). I think you did something horrible to sem
> contention ... presumably i_shared_sem, which SDET was fighting with
> as it was anyway ;-(
>
> Diffprofile shows:
>
> 122626 15.7% total
> 44129 790.0% __down
> 20988 4.1% default_idle

Many thanks for the good news, Martin ;)
Looks like I've done something very stupid, perhaps a mismerge.
Not found it yet, I'll carry on looking tomorrow.

Hugh

2004-04-09 22:17:29

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

--Hugh Dickins <[email protected]> wrote (on Friday, April 09, 2004 22:51:03 +0100):

> On Fri, 9 Apr 2004, Martin J. Bligh wrote:
>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>
> Many thanks for the good news, Martin ;)
> Looks like I've done something very stupid, perhaps a mismerge.
> Not found it yet, I'll carry on looking tomorrow.

I don't think so ... I ran across a similar problem when I tried to convert
the vma list to a list-of-lists for what you're trying to use prio_tree
for, I think. The sorry fact is that that thing is heavily contended,
and doing complex manipulations under it makes it worse ;-(

I vaguely considered being a smarty-pants and doing RCU list-of-lists, but
never got around to it. I'll shove in some backtraces and check out the
sem activity to be sure.

M.

2004-04-09 22:17:26

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
> -> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
> race. That may be the reason for the slowdown, I am not sure.

I don't think so .... I presume you're just holding the sem for longer
during normal operations due to the more complex data structure.
This was just with vs without the prio tree patch, so Hugh's changes
were in both sets ...

M.

> Rajesh
>
> On Fri, 9 Apr 2004, Martin J. Bligh wrote:
>
>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>> 12101 550.3% __wake_up
>> 11723 489.1% finish_task_switch
>> 6988 77.4% do_wp_page
>> 3983 21.7% copy_page_range
>> 2683 19.2% zap_pte_range
>> 2325 54.3% do_anonymous_page
>> 2293 73.1% copy_mm
>> 1787 68.3% remove_shared_vm_struct
>> 1768 101.6% pte_alloc_one
>> 1564 40.0% do_no_page
>> 1520 50.8% do_page_fault
>> 1376 39.2% clear_page_tables
>> 1282 63.4% __copy_user_intel
>> 926 9.4% page_remove_rmap
>> 878 13.1% __copy_to_user_ll
>> 835 46.8% __block_prepare_write
>> 788 35.8% copy_process
>> 777 0.0% __vma_prio_tree_remove
>> 761 48.8% buffered_rmqueue
>> 740 48.6% free_hot_cold_page
>> 674 128.4% vma_link
>> 641 0.0% __vma_prio_tree_insert
>> 612 941.5% sched_clock
>> 585 0.0% prio_tree_insert
>> 563 60.4% exit_notify
>> 547 225.1% split_vma
>> 539 6.4% release_pages
>> 534 464.3% schedule
>> 495 32.0% release_task
>> 422 148.1% flush_signal_handlers
>> 421 66.6% find_vma
>> 420 79.5% set_page_dirty
>> 409 60.1% fput
>> 359 44.5% __copy_from_user_ll
>> 319 47.6% do_mmap_pgoff
>> 290 254.4% find_vma_prepare
>> 270 167.7% rb_insert_color
>> 254 61.7% pte_alloc_map
>> 251 91.3% exit_mmap
>> 229 23.2% __read_lock_failed
>> 228 9.9% filemap_nopage
>> ...
>> -100 -29.3% group_reserve_blocks
>> -107 -53.5% .text.lock.namespace
>> -107 -18.4% render_sigset_t
>> -126 -18.7% mmgrab
>> -146 -10.9% generic_file_open
>> -166 -9.5% ext2_new_inode
>> -166 -38.1% d_path
>> -166 -20.1% __find_get_block_slow
>> -173 -20.7% proc_pid_status
>> -182 -19.3% update_atime
>> -185 -25.8% fd_install
>> -202 -13.8% .text.lock.highmem
>> -221 -14.5% __fput
>> -225 -14.3% number
>> -257 -14.2% proc_pid_stat
>> -284 -21.6% file_kill
>> -290 -35.3% proc_root_link
>> -300 -36.5% ext2_new_block
>> -349 -61.7% .text.lock.base
>> -382 -48.0% proc_check_root
>> -412 -19.4% path_release
>> -454 -20.0% file_move
>> -462 -32.2% lookup_mnt
>> -515 -4.5% find_get_page
>> -547 -34.5% .text.lock.dcache
>> -689 -31.2% follow_mount
>> -940 -33.8% .text.lock.dec_and_lock
>> -1043 -51.6% .text.lock.file_table
>> -1115 -9.9% __d_lookup
>> -1226 -20.1% path_lookup
>> -1305 -61.5% grab_block
>> -2101 -29.8% atomic_dec_and_lock
>> -2554 -40.3% .text.lock.filemap
>>
>>
>
>


2004-04-09 22:57:12

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>
> Many thanks for the good news, Martin ;)
> Looks like I've done something very stupid, perhaps a mismerge.
> Not found it yet, I'll carry on looking tomorrow.

I applied Andrew's highly sophisticated proprietary semtrace technology.
The common ones are:

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014314f>] .text.lock.mmap+0x39/0x12a
[<c01421a7>] do_mmap_pgoff+0x4cf/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb

Which is the vma_link call from do_mmap_pgoff here:

/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
*/
addr = vma->vm_start;

if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr,
addr + len, vma->vm_flags, file, pgoff)) {
vma_link(mm, vma, prev, rb_link, rb_parent);
if (correct_wcount)
atomic_inc(&inode->i_writecount);

vma_link takes i_shared_sem.

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c01431dd>] .text.lock.mmap+0xc7/0x12a
[<c0142b4c>] do_munmap+0xbc/0x128
[<c0141f91>] do_mmap_pgoff+0x2b9/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb

That's the call to split_vma from do_munmap here:

/*
* If we need to split any vma, do it now to save pain later.
*
* Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
if (start > mpnt->vm_start) {
if (split_vma(mm, mpnt, start, 0))
return -ENOMEM;
prev = mpnt;
}

split_vma takes i_shared_sem, then takes page_table_lock inside it
(which probably isn't helping either ;-)).

if (mapping)
down(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014311b>] .text.lock.mmap+0x5/0x12a
[<c0142f79>] exit_mmap+0x191/0x1d0
[<c011ae84>] mmput+0x50/0x70
[<c011ec6d>] do_exit+0x1b9/0x330
[<c011eefa>] do_group_exit+0x9e/0xa0
[<c011ef0a>] sys_exit_group+0xe/0x14
[<c025c753>] syscall_call+0x7/0xb

That's remove_shared_vm_struct taking i_shared_sem

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014311b>] .text.lock.mmap+0x5/0x12a
[<c0142694>] unmap_vma+0x44/0x78
[<c01426dc>] unmap_vma_list+0x14/0x20
[<c0142ba5>] do_munmap+0x115/0x128
[<c0141f91>] do_mmap_pgoff+0x2b9/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb

That's remove_shared_vm_struct again, but called from unmap_vma this time

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c011c494>] .text.lock.fork+0x79/0x125
[<c011be5c>] copy_process+0x61c/0xa6c
[<c011c322>] do_fork+0x76/0x16f
[<c0105619>] sys_clone+0x29/0x30
[<c025c753>] syscall_call+0x7/0xb

That's dup_mmap taking i_shared_sem here:

/* insert tmp into the share list, just after mpnt */
down(&file->f_mapping->i_shared_sem);
__vma_prio_tree_add(tmp, mpnt);
up(&file->f_mapping->i_shared_sem);

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> > Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
> > -> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
> > race. That may be the reason for the slowdown, I am not sure.
>
> I don't think so .... I presume you're just holding the sem for longer
> during normal operations due to the more complex data structure.

I haven't done any benchmarks with prio_tree. I just tried kernel compile
which was not bad. I tried rmap-test.c and test-mmap3.c. The results were
not bad on UP. I didn't try them on SMP, so you may be right.

> This was just with vs without the prio tree patch, so Hugh's changes
> were in both sets ...

The change I mentioned above was added only in Hugh's prio_tree patch,
it was not there before. However, I was just guessing that the mremap
change could be a reason. It could very well be due to prio_tree complexity.
I don't have any data to prove or disprove either way.

Rajesh


> > On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> >
> >> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> >> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> >> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> >> > of anon_vma, and of course each tree has its own additional features.
> >>
> >> This slows down kernel compile a little, but worse, it slows down SDET
> >> by about 25% (on the 16x). I think you did something horrible to sem
> >> contention ... presumably i_shared_sem, which SDET was fighting with
> >> as it was anyway ;-(
> >>
> >> Diffprofile shows:
> >>
> >>
> >> 122626 15.7% total
> >> 44129 790.0% __down
> >> 20988 4.1% default_idle
> >> 12101 550.3% __wake_up
> >> 11723 489.1% finish_task_switch
> >> 6988 77.4% do_wp_page
> >> 3983 21.7% copy_page_range
> >> 2683 19.2% zap_pte_range
> >> 2325 54.3% do_anonymous_page
> >> 2293 73.1% copy_mm
> >> 1787 68.3% remove_shared_vm_struct
> >> 1768 101.6% pte_alloc_one
> >> 1564 40.0% do_no_page
> >> 1520 50.8% do_page_fault
> >> 1376 39.2% clear_page_tables
> >> 1282 63.4% __copy_user_intel
> >> 926 9.4% page_remove_rmap
> >> 878 13.1% __copy_to_user_ll
> >> 835 46.8% __block_prepare_write
> >> 788 35.8% copy_process
> >> 777 0.0% __vma_prio_tree_remove
> >> 761 48.8% buffered_rmqueue
> >> 740 48.6% free_hot_cold_page
> >> 674 128.4% vma_link
> >> 641 0.0% __vma_prio_tree_insert
> >> 612 941.5% sched_clock
> >> 585 0.0% prio_tree_insert
> >> 563 60.4% exit_notify
> >> 547 225.1% split_vma
> >> 539 6.4% release_pages
> >> 534 464.3% schedule
> >> 495 32.0% release_task
> >> 422 148.1% flush_signal_handlers
> >> 421 66.6% find_vma
> >> 420 79.5% set_page_dirty
> >> 409 60.1% fput
> >> 359 44.5% __copy_from_user_ll
> >> 319 47.6% do_mmap_pgoff
> >> 290 254.4% find_vma_prepare
> >> 270 167.7% rb_insert_color
> >> 254 61.7% pte_alloc_map
> >> 251 91.3% exit_mmap
> >> 229 23.2% __read_lock_failed
> >> 228 9.9% filemap_nopage
> >> ...
> >> -100 -29.3% group_reserve_blocks
> >> -107 -53.5% .text.lock.namespace
> >> -107 -18.4% render_sigset_t
> >> -126 -18.7% mmgrab
> >> -146 -10.9% generic_file_open
> >> -166 -9.5% ext2_new_inode
> >> -166 -38.1% d_path
> >> -166 -20.1% __find_get_block_slow
> >> -173 -20.7% proc_pid_status
> >> -182 -19.3% update_atime
> >> -185 -25.8% fd_install
> >> -202 -13.8% .text.lock.highmem
> >> -221 -14.5% __fput
> >> -225 -14.3% number
> >> -257 -14.2% proc_pid_stat
> >> -284 -21.6% file_kill
> >> -290 -35.3% proc_root_link
> >> -300 -36.5% ext2_new_block
> >> -349 -61.7% .text.lock.base
> >> -382 -48.0% proc_check_root
> >> -412 -19.4% path_release
> >> -454 -20.0% file_move
> >> -462 -32.2% lookup_mnt
> >> -515 -4.5% find_get_page
> >> -547 -34.5% .text.lock.dcache
> >> -689 -31.2% follow_mount
> >> -940 -33.8% .text.lock.dec_and_lock
> >> -1043 -51.6% .text.lock.file_table
> >> -1115 -9.9% __d_lookup
> >> -1226 -20.1% path_lookup
> >> -1305 -61.5% grab_block
> >> -2101 -29.8% atomic_dec_and_lock
> >> -2554 -40.3% .text.lock.filemap
> >>
> >>
> >
> >
>
>
>

2004-04-11 16:10:09

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> >> This slows down kernel compile a little, but worse, it slows down SDET
> >> by about 25% (on the 16x). I think you did something horrible to sem
> >> contention ... presumably i_shared_sem, which SDET was fighting with
> >> as it was anyway ;-(
> >>
> >> Diffprofile shows:
> >>
> >> 122626 15.7% total
> >> 44129 790.0% __down
> >> 20988 4.1% default_idle
>
> I applied Andrew's highly sophisticated proprietary semtrace technology.

Thanks a lot, Martin, this seems pretty important.

So, i_shared_sem, as you supposed.

Do you still have the two profiles input to diffprofile?
I wonder if they'd have clues to help us understand it better.

Any chance of you doing the same comparison between 2.6.5-aa5 and
2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
rejects in mm/mmap.c itself, let me know if I can help out on that.)

If -aa is okay, I hope so, then it's surely some stupidity from me.

We're not at all surprised that vma linking and unlinking should take
rather longer; but the rise in __down, __wake_up, finish_task_switch
is horrifying. Or is that how it usually looks, when a semaphore is
well contended - thundering herd?

Hugh

2004-04-11 18:08:33

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> I applied Andrew's highly sophisticated proprietary semtrace technology.
>
> Thanks a lot, Martin, this seems pretty important.
>
> So, i_shared_sem, as you supposed.
>
> Do you still have the two profiles input to diffprofile?
> I wonder if they'd have clues to help us understand it better.

Yup. Attached.

> Any chance of you doing the same comparison between 2.6.5-aa5
> 2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
> hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
> rejects in mm/mmap.c itself, let me know if I can help out on that.)
>
> If -aa is okay, I hope so, then it's surely some stupidity from me.

Good idea. Not sure how easy it'll be to back prio_tree out, but I can
surely do aa5, which would give us a good clue still. Might not be until
this time tomorrow though.

> We're not at all surprised that vma linking and unlinking should take
> rather longer; but the rise in __down, __wake_up, finish_task_switch
> is horrifying. Or is that how it usually looks, when a semaphore is
> well contended - thundering herd?

I think there's just a locking cliff you fall off after a certain level
of contention ... i_shared_sem has always been bad for SDET, to be fair.
But I hate making it worse ;-) I did more investigation of it a year or
so ago ... something does horrible things to it (which is why akpm turned
it into a sem in the first place) ... maybe holding it over process teardown
for eons or something. Bah, I want lockmeter for sems ;-)

Maybe I can dig out my old analysis ... I'll take a look. I suppose I could
always turn it back into a spinlock to look at it ;-) On the other hand, if
we can fix that problem, I think my "list of lists" was simpler than prio
tree (and is probably much more susceptible to RCU).

M.



Attachments:
mjb1 (16.82 kB)
mjb1-prio (17.45 kB)
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


This patch is an attempt at reducing the contention on i_shared_sem
by introducing a new semaphore, i_mmap_sem. The i_shared_sem now covers
the i_mmap_shared tree and the i_mmap_nonlinear list, whereas i_mmap_sem
covers the i_mmap tree. This may help to reduce the contention on
i_shared_sem if a file is mapped both private and shared. Kernel
compile time with and without this patch did not change much, though.

This patch is on top of 2.6.5-mjb1+anobjrmap9_prio. Compiled and
tested.

Martin! Are you interested in testing SDET with this patch ?



fs/hugetlbfs/inode.c | 6 +++++-
fs/inode.c | 1 +
include/linux/fs.h | 3 ++-
include/linux/mm.h | 24 +++++++++++++++++++++++-
kernel/fork.c | 4 ++--
mm/filemap.c | 6 +++---
mm/memory.c | 6 +++++-
mm/mmap.c | 51 +++++++++++++++------------------------------------
mm/mremap.c | 29 ++++++++++++-----------------
mm/rmap.c | 32 ++++++++++++++++++++++++--------
10 files changed, 92 insertions(+), 70 deletions(-)

diff -puN include/linux/fs.h~010_sem_contention include/linux/fs.h
--- mmlinux-2.6/include/linux/fs.h~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/fs.h 2004-04-11 22:07:40.000000000 -0400
@@ -333,7 +333,8 @@ struct address_space {
struct prio_tree_root i_mmap; /* tree of private mappings */
struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
- struct semaphore i_shared_sem; /* protect both above lists */
+ struct semaphore i_mmap_sem; /* protect i_mmap prio_tree */
+ struct semaphore i_shared_sem; /* protect shared and nonlinear */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN include/linux/mm.h~010_sem_contention include/linux/mm.h
--- mmlinux-2.6/include/linux/mm.h~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/mm.h 2004-04-11 22:08:13.000000000 -0400
@@ -232,7 +232,7 @@ static inline void __vma_prio_tree_add(s
* We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
* already present in an i_mmap{_shared} tree without modifying the tree. The
* following helper function should be used when such modifications are
- * necessary. We should hold the mapping's i_shared_sem.
+ * necessary. We should hold the mapping's i_shared_sem or i_mmap_sem.
*
* This function can be (micro)optimized for some special cases (maybe later).
*/
@@ -296,6 +296,28 @@ static inline struct vm_area_struct *__v
return NULL;
}

+/* Caller should hold mmap_sem */
+static inline void vma_mapping_lock(struct vm_area_struct *vma)
+{
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ down(&vma->vm_file->f_mapping->i_shared_sem);
+ else
+ down(&vma->vm_file->f_mapping->i_mmap_sem);
+ }
+}
+
+/* Caller should hold mmap_sem */
+static inline void vma_mapping_unlock(struct vm_area_struct *vma)
+{
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ up(&vma->vm_file->f_mapping->i_shared_sem);
+ else
+ up(&vma->vm_file->f_mapping->i_mmap_sem);
+ }
+}
+
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
diff -puN kernel/fork.c~010_sem_contention kernel/fork.c
--- mmlinux-2.6/kernel/fork.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/kernel/fork.c 2004-04-11 22:07:40.000000000 -0400
@@ -332,9 +332,9 @@ static inline int dup_mmap(struct mm_str
atomic_dec(&inode->i_writecount);

/* insert tmp into the share list, just after mpnt */
- down(&file->f_mapping->i_shared_sem);
+ vma_mapping_lock(tmp);
__vma_prio_tree_add(tmp, mpnt);
- up(&file->f_mapping->i_shared_sem);
+ vma_mapping_unlock(tmp);
}

/*
diff -puN mm/mmap.c~010_sem_contention mm/mmap.c
--- mmlinux-2.6/mm/mmap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mmap.c 2004-04-11 22:07:40.000000000 -0400
@@ -67,7 +67,7 @@ int mmap_use_hugepages = 0;
int mmap_hugepages_map_sz = 256;

/*
- * Requires inode->i_mapping->i_shared_sem
+ * Requires inode->i_mapping->i_shared_sem or i_mmap_sem
*/
static inline void
__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode,
@@ -96,10 +96,10 @@ static void remove_shared_vm_struct(stru

if (file) {
struct address_space *mapping = file->f_mapping;
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
__remove_shared_vm_struct(vma, file->f_dentry->d_inode,
mapping);
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);
}
}

@@ -298,18 +298,11 @@ static void vma_link(struct mm_struct *m
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
- struct address_space *mapping = NULL;
-
- if (vma->vm_file)
- mapping = vma->vm_file->f_mapping;
-
- if (mapping)
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
spin_lock(&mm->page_table_lock);
__vma_link(mm, vma, prev, rb_link, rb_parent);
spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);

mark_mm_hugetlb(mm, vma);
mm->map_count++;
@@ -318,8 +311,8 @@ static void vma_link(struct mm_struct *m

/*
* Insert vm structure into process list sorted by address and into the inode's
- * i_mmap ring. The caller should hold mm->page_table_lock and
- * ->f_mappping->i_shared_sem if vm_file is non-NULL.
+ * i_mmap ring. The caller should hold mm->page_table_lock and i_shared_sem or
+ * i_mmap_sem if vm_file is non-NULL.
*/
static void
__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
@@ -410,7 +403,6 @@ static struct vm_area_struct *vma_merge(
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
struct address_space *mapping = file ? file->f_mapping : NULL;
- struct semaphore *i_shared_sem;
struct prio_tree_root *root = NULL;

/*
@@ -420,8 +412,6 @@ static struct vm_area_struct *vma_merge(
if (vm_flags & VM_SPECIAL)
return NULL;

- i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
-
if (mapping) {
if (vm_flags & VM_SHARED) {
if (likely(!(vm_flags & VM_NONLINEAR)))
@@ -442,13 +432,8 @@ static struct vm_area_struct *vma_merge(
if (prev->vm_end == addr &&
can_vma_merge_after(prev, vm_flags, file, pgoff)) {
struct vm_area_struct *next;
- int need_up = 0;

- if (unlikely(file && prev->vm_next &&
- prev->vm_next->vm_file == file)) {
- down(i_shared_sem);
- need_up = 1;
- }
+ vma_mapping_lock(prev);
spin_lock(lock);

/*
@@ -463,8 +448,7 @@ static struct vm_area_struct *vma_merge(
next->vm_end, prev->vm_pgoff);
__remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
if (file)
fput(file);

@@ -475,8 +459,7 @@ static struct vm_area_struct *vma_merge(

__vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
return prev;
}

@@ -490,14 +473,12 @@ static struct vm_area_struct *vma_merge(
pgoff, (end - addr) >> PAGE_SHIFT))
return NULL;
if (end == prev->vm_start) {
- if (file)
- down(i_shared_sem);
+ vma_mapping_lock(prev);
spin_lock(lock);
__vma_modify(root, prev, addr, prev->vm_end,
prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
- if (file)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
return prev;
}
}
@@ -1361,8 +1342,7 @@ int split_vma(struct mm_struct * mm, str
root = &mapping->i_mmap;
}

- if (mapping)
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
spin_lock(&mm->page_table_lock);

if (new_below)
@@ -1374,8 +1354,7 @@ int split_vma(struct mm_struct * mm, str
__insert_vm_struct(mm, new);

spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);

return 0;
}
@@ -1609,7 +1588,7 @@ void exit_mmap(struct mm_struct *mm)

/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap ring. If vm_file is non-NULL
- * then i_shared_sem is taken here.
+ * then i_shared_sem or i_mmap_sem is taken here.
*/
void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
diff -puN mm/memory.c~010_sem_contention mm/memory.c
--- mmlinux-2.6/mm/memory.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/memory.c 2004-04-11 22:07:40.000000000 -0400
@@ -1133,11 +1133,15 @@ void invalidate_mmap_range(struct addres
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
- down(&mapping->i_shared_sem);
+ down(&mapping->i_mmap_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
+ up(&mapping->i_mmap_sem);
+ down(&mapping->i_shared_sem);
+ /* Protect against page fault -- not sure this is required */
+ atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
up(&mapping->i_shared_sem);
diff -puN mm/filemap.c~010_sem_contention mm/filemap.c
--- mmlinux-2.6/mm/filemap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/filemap.c 2004-04-11 22:07:40.000000000 -0400
@@ -55,17 +55,17 @@
/*
* Lock ordering:
*
- * ->i_shared_sem (vmtruncate)
+ * ->i_shared{_mmap}_sem (vmtruncate)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_list_lock
* ->swap_device_lock (exclusive_swap_page, others)
* ->mapping->page_lock
*
* ->i_sem
- * ->i_shared_sem (truncate->invalidate_mmap_range)
+ * ->i_shared{_mmap}_sem (truncate->invalidate_mmap_range)
*
* ->mmap_sem
- * ->i_shared_sem (various places)
+ * ->i_shared{_mmap}_sem (various places)
*
* ->mmap_sem
* ->lock_page (access_process_vm)
diff -puN mm/mremap.c~010_sem_contention mm/mremap.c
--- mmlinux-2.6/mm/mremap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mremap.c 2004-04-11 22:07:40.000000000 -0400
@@ -267,7 +267,6 @@ static unsigned long move_vma(struct vm_
unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
- struct address_space *mapping = NULL;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
@@ -287,16 +286,14 @@ static unsigned long move_vma(struct vm_
if (!new_vma)
return -ENOMEM;

- if (vma->vm_file) {
- /*
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, we must lock vmtruncate out,
- * since it might clean the dst vma before the src vma,
- * and we propagate stale pages into the dst afterward.
- */
- mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
- }
+ /*
+ * Subtle point from Rajesh Venkatasubramanian: before
+ * moving file-based ptes, we must lock vmtruncate out,
+ * since it might clean the dst vma before the src vma,
+ * and we propagate stale pages into the dst afterward.
+ */
+ vma_mapping_lock(vma);
+
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
/*
@@ -310,8 +307,8 @@ static unsigned long move_vma(struct vm_
old_addr = new_addr;
new_addr = -ENOMEM;
}
- if (mapping)
- up(&mapping->i_shared_sem);
+
+ vma_mapping_unlock(vma);

/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -476,16 +473,14 @@ unsigned long do_mremap(unsigned long ad
}
else
root = &mapping->i_mmap;
- down(&mapping->i_shared_sem);
}

+ vma_mapping_lock(vma);
spin_lock(&vma->vm_mm->page_table_lock);
__vma_modify(root, vma, vma->vm_start,
addr + new_len, vma->vm_pgoff);
spin_unlock(&vma->vm_mm->page_table_lock);
-
- if(mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);

current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
diff -puN mm/rmap.c~010_sem_contention mm/rmap.c
--- mmlinux-2.6/mm/rmap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/rmap.c 2004-04-11 22:07:40.000000000 -0400
@@ -267,8 +267,8 @@ out:
*
* This function is only called from page_referenced for object-based pages.
*
- * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * assume a reference count of 0, so try_to_unmap will then have a go.
+ * The semaphores ->i_mmap_sem and ->i_shared_sem are tried. If they can't be
+ * gotten, assume a reference count of 0, so try_to_unmap will then have a go.
*/
static inline int page_referenced_obj(struct page *page, int *mapcount)
{
@@ -276,12 +276,14 @@ static inline int page_referenced_obj(st
unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
+ struct semaphore *semaphore;
unsigned long address;
int referenced = 0;

- if (down_trylock(&mapping->i_shared_sem))
+ if (down_trylock(&mapping->i_mmap_sem))
return 0;

+ semaphore = &mapping->i_mmap_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
@@ -301,6 +303,12 @@ static inline int page_referenced_obj(st
&iter, pgoff, pgoff);
}

+ up(&mapping->i_mmap_sem);
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return 0;
+
+ semaphore = &mapping->i_shared_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
@@ -322,7 +330,7 @@ static inline int page_referenced_obj(st
if (list_empty(&mapping->i_mmap_nonlinear))
WARN_ON(*mapcount > 0);
out:
- up(&mapping->i_shared_sem);
+ up(semaphore);
return referenced;
}

@@ -696,8 +704,8 @@ out:
*
* This function is only called from try_to_unmap for object-based pages.
*
- * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * return a temporary error.
+ * The semaphores ->i_mmap_sem and ->i_shared_sem are tried. If they can't be
+ * gotten, return a temporary error.
*/
static inline int try_to_unmap_obj(struct page *page, int *mapcount)
{
@@ -705,15 +713,17 @@ static inline int try_to_unmap_obj(struc
unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
+ struct semaphore *semaphore;
unsigned long address;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;

- if (down_trylock(&mapping->i_shared_sem))
+ if (down_trylock(&mapping->i_mmap_sem))
return ret;

+ semaphore = &mapping->i_mmap_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
@@ -728,6 +738,12 @@ static inline int try_to_unmap_obj(struc
&iter, pgoff, pgoff);
}

+ up(&mapping->i_mmap_sem);
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return ret;
+
+ semaphore = &mapping->i_shared_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
@@ -813,7 +829,7 @@ static inline int try_to_unmap_obj(struc
relock:
rmap_lock(page);
out:
- up(&mapping->i_shared_sem);
+ up(semaphore);
return ret;
}

diff -puN fs/inode.c~010_sem_contention fs/inode.c
--- mmlinux-2.6/fs/inode.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/fs/inode.c 2004-04-11 22:07:40.000000000 -0400
@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode
sema_init(&inode->i_sem, 1);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_mmap_sem);
init_MUTEX(&inode->i_data.i_shared_sem);
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
diff -puN fs/hugetlbfs/inode.c~010_sem_contention fs/hugetlbfs/inode.c
--- mmlinux-2.6/fs/hugetlbfs/inode.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/fs/hugetlbfs/inode.c 2004-04-11 22:07:40.000000000 -0400
@@ -325,11 +325,15 @@ static int hugetlb_vmtruncate(struct ino
pgoff = offset >> HPAGE_SHIFT;

inode->i_size = offset;
- down(&mapping->i_shared_sem);
+ down(&mapping->i_mmap_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ up(&mapping->i_mmap_sem);
+ down(&mapping->i_shared_sem);
+ /* Protect against page fault -- not sure this is required */
+ atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
up(&mapping->i_shared_sem);

_

2004-04-12 05:24:33

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> This patch is an attempt at reducing the contention on i_shared_sem
> by introducing a new semaphore i_mmap_sem. The i_shared_sem covers
> i_mmap_shared tree and i_mmap_nonlinear list now, whereas i_mmap_sem
> covers i_mmap tree. This may help to reduce the contention on
> i_shared_sem if a file is mapped both private and shared. Kernel
> compile time with and without this patch did not change much, though.
>
> This patch is on top of 2.6.5-mjb1+anobjrmap9_prio. Compiled and
> tested.
>
> Martin! Are you interested in testing SDET with this patch ?

Runs exactly the same as prio ... thanks for having a crack at it though.
I guess that sharing combination isn't that common ;-(

M.

2004-04-12 15:47:41

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> Any chance of you doing the same comparison between 2.6.5-aa5 and
>> 2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
>> hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
>> rejects in mm/mmap.c itself, let me know if I can help out on that.)
>>
>> If -aa is okay, I hope so, then it's surely some stupidity from me.
>
> Good idea. Not sure how easy it'll be to back prio_tree out, but I can
> surely do aa5, which would give us a good clue still. Might not be until
> this time tomorrow though.

Hmm, it's halfway between the two. There does seem to be less sem
contention, though the profile ticks in __down aren't really an accurate
measure. I'll try to think of some more accurate way to instrument sems
(maybe I can do rdtsc before and after taking the sem, and then hack
the profiling code to stash the delta, indexed by the caller's
address). Meanwhile, maybe you can decipher something from the attached
and appended ...
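
A rough sketch of that rdtsc idea, purely for illustration: down() and
get_cycles() are real interfaces, but sem_wait_record() is a made-up hook
into the profiling code, and the wrapper would have to be substituted at the
call sites being measured.

#include <linux/timex.h>	/* get_cycles(), cycles_t */
#include <asm/semaphore.h>

extern void sem_wait_record(void *caller, cycles_t delta);	/* hypothetical */

static inline void down_timed(struct semaphore *sem)
{
	cycles_t t0 = get_cycles();

	down(sem);
	sem_wait_record(__builtin_return_address(0), get_cycles() - t0);
}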

M.

SDET 128 (see disclaimer)
Throughput Std. Dev
2.6.5-mjb1 100.0% 1.2%
2.6.5-mjb1-prio 73.6% 0.0%
2.6.5-aa5 86.4% 1.9%

Full profile attached (to match yesterday's).

diffprofile from mjb1+prio to -aa5
11084 102.0% find_get_page
4976 101.8% clear_page_tables
4482 690.6% schedule
2322 1745.9% pgd_alloc
2036 41.2% atomic_dec_and_lock
1816 0.0% page_add_rmap
1623 16.0% __d_lookup
1428 0.0% __set_page_dirty_buffers
1275 0.0% anon_vma_unlink
1234 25.3% path_lookup
1101 25.0% remove_shared_vm_struct
990 53.7% .text.lock.dec_and_lock
921 0.0% find_get_pages_tag
883 0.0% anon_vma_link
791 52.2% follow_mount
783 250.2% unmap_vmas
764 0.0% strnlen_user
588 32.3% file_move
572 36.8% proc_pid_stat
563 57.9% lookup_mnt
543 55.4% .text.lock.file_table
498 48.0% .text.lock.dcache
488 69.0% flush_signal_handlers
467 215.2% .text.lock.base
460 27.9% kmem_cache_free
...
-217 -100.0% __do_softirq
-220 -100.0% page_update_anon_rmap
-228 -57.6% flush_tlb_page
-230 -69.9% sys_close
-233 -100.0% direct_strncpy_from_user
-233 -7.8% copy_process
-235 -21.1% kmap_atomic
-254 -37.5% sched_clock
-257 -59.6% rb_insert_color
-318 -47.7% pte_alloc_map
-330 -99.1% find_get_pages
-332 -31.5% find_vma
-340 -43.8% __vma_prio_tree_remove
-356 -29.7% vma_link
-369 -100.0% direct_strnlen_user
-401 -100.0% page_add_anon_rmap
-470 -8.6% do_no_page
-610 -8.1% __copy_to_user_ll
-619 -100.0% radix_tree_lookup
-923 -97.4% set_page_dirty
-1133 -12.6% release_pages
-1406 -53.7% __block_prepare_write
-1553 -14.5% page_remove_rmap
-1555 -100.0% page_add_obj_rmap
-1753 -38.9% do_page_fault
-2269 -68.7% __copy_user_intel
-2959 -44.8% do_anonymous_page
-3789 -100.0% .text.lock.filemap
-7671 -53.6% __wake_up
-8214 -36.8% copy_page_range
-9948 -62.1% do_wp_page
-14120 -100.0% finish_task_switch
-27763 -55.8% __down
-64017 -12.0% default_idle
-107386 -11.9% total


Attachments:
2.6.5-aa5 (17.19 kB)

2004-04-12 18:43:34

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Mon, 12 Apr 2004, Martin J. Bligh wrote:
> >> Any chance of you doing the same comparison between 2.6.5-aa5 and
> >> 2.6.5-aa5 minus prio-tree?
>
> Hmm. it's halfway between the two. There does seem to be less sem
> contention, though the profile ticks in __down isn't really an accurate
> measure.

Thanks a lot, Martin (despite my silence, I really am listening and
pondering intermittently on this). So, -aa5 shows high __down count
too; it may not be any kind of accurate measure, but it's surely not good.

Mainly I'm concentrating on getting ready the next few patches
of the objrmap set for Andrew (not to be sent for a day or two).
Unless we see a plausible way forward on your SDET numbers, I
think it casts this project in doubt - but even so I do need
to focus or I'll just mess them up.

Seems as if prio tree has too high a cost there, yet we believe we
need it to handle the corner cases of objrmap.

What I want to investigate, when I'm far enough done, is the effect
of restoring i_shared_sem to the i_shared_lock it was before 2.5.57.
My fantasy is that your SDET would behave much more stably without
that as a semaphore, that prio tree just pushes it over your cliff.

It is easier to insert and remove vmas in the list than the tree,
and you can get away with leaving them in place quite often with
the list.

(Expect me to shout excitedly "Hey, the __down count has gone right
down, that proves I'm right!")

i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
and drop the lock if we need resched? Resuming from marker after.

But, sadly, I doubt that can be done with the prio tree: Rajesh?

Hugh
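
A minimal sketch of that marker scheme for the plain-list case, purely
illustrative and not from any posted patch: it assumes the pre-prio_tree
layout (vma->shared is a list_head chained on mapping->i_mmap), the old
i_shared_lock spinlock, and the convention that a marker can be recognised
by vm_mm == NULL (real vmas always have a non-NULL vm_mm).

#include <linux/mm.h>
#include <linux/sched.h>	/* need_resched(), cond_resched() */
#include <linux/string.h>	/* memset() */

static void walk_i_mmap_with_resched(struct address_space *mapping,
				     void (*work)(struct vm_area_struct *))
{
	struct vm_area_struct marker;
	struct list_head *pos;

	memset(&marker, 0, sizeof(marker));	/* marker.vm_mm == NULL */

	spin_lock(&mapping->i_shared_lock);
	pos = mapping->i_mmap.next;
	while (pos != &mapping->i_mmap) {
		struct vm_area_struct *vma =
			list_entry(pos, struct vm_area_struct, shared);

		if (vma->vm_mm)			/* skip other walkers' markers */
			work(vma);

		if (need_resched()) {
			/* park the marker after the current entry, drop the lock */
			list_add(&marker.shared, pos);
			spin_unlock(&mapping->i_shared_lock);
			cond_resched();
			spin_lock(&mapping->i_shared_lock);
			pos = marker.shared.next;	/* resume past the marker */
			list_del(&marker.shared);
		} else {
			pos = pos->next;
		}
	}
	spin_unlock(&mapping->i_shared_lock);
}

The marker is just a stack-local dummy vma; because it is skipped by the
vm_mm test, concurrent walkers can leave markers in the list without
confusing each other.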

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> Unless we see a plausible way forward on your SDET numbers, I
> think it casts this project in doubt - but even so I do need

We can try a few fancy locking tricks. But, we don't know whether
such tricks will help.

> i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
> in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
> i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
> and drop the lock if we need resched? Resuming from marker after.
>
> But, sadly, I doubt that can be done with the prio tree: Rajesh?

Yeap. With prio_tree it is tricky. We already have the marker for
prio_tree, i.e., prio_tree_iter. But, when you drop a lock new tree
nodes may be added to the prio_tree, and the marker does not provide
any consistent meaning after the node additions.

Rajesh

2004-04-12 19:02:14

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>> >> Any chance of you doing the same comparison between 2.6.5-aa5 and
>> >> 2.6.5-aa5 minus prio-tree?
>>
>> Hmm. it's halfway between the two. There does seem to be less sem
>> contention, though the profile ticks in __down isn't really an accurate
>> measure.
>
> Thanks a lot, Martin (despite my silence, I really am listening and
> pondering intermittently on this). So, -aa5 shows high __down count
> too, may not be any kind of accurate measure, but it's surely not good.
>
> Mainly I'm concentrating on getting ready the next few patches
> of the objrmap set for Andrew (not to be sent for a day or two).
> Unless we see a plausible way forward on your SDET numbers, I
> think it casts this project in doubt - but even so I do need
> to focus or I'll just mess them up.
>
> Seems as if prio tree has too high a cost there, yet we believe we
> need it to handle the corner cases of objrmap.

I'm still not sure those corner cases actually occur now that we have
Oracle using remap_file_pages. But Andrew mentioned some uml thing
recently - not sure if that workload affects this or not.

> What I want to investigate, when I'm far enough done, is the effect
> of restoring i_shared_sem to the i_shared_lock it was before 2.5.57.
> My fantasy is that your SDET would behave much more stably without
> that as a semaphore, that prio tree just pushes it over your cliff.

Yeah, to be honest, maybe SDET is kind of bolloxed anyway by i_shared_sem.
It needs a better fix. This probably shouldn't hold up objrmap, but it
would be nice to have a think about it ;-)

> It is easier to insert and remove vmas in the list than the tree,
> and you can get away with leaving them in place quite often with
> the list.
>
> (Expect me to shout excitedly "Hey, the __down count has gone right
> down, that proves I'm right!")
>
> i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
> in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
> i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
> and drop the lock if we need resched? Resuming from marker after.
>
> But, sadly, I doubt that can be done with the prio tree: Rajesh?

If it were just a list, maybe RCU would be appropriate. It might be
rather write-heavy though ? I think I played with an rwsem instead
of a sem in the past too (though be careful if you try this, as for
no good reason the return codes are inverted ;-()

M.


2004-04-12 19:10:48

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>
> If it were just a list, maybe RCU would be appropriate. It might be
> rather write-heavy though ? I think I played with an rwsem instead
> of a sem in the past too (though be careful if you try this, as for
> no good reason the return codes are inverted ;-()

Yes, I think all the common paths have to write, in case the
uncommon paths (truncation and swapout) want to read: the wrong
way round for any kind of read-write optimization, isn't it?

Hugh

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


On Mon, 12 Apr 2004, Hugh Dickins wrote:
> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
> >
> > If it were just a list, maybe RCU would be appropriate. It might be
> > rather write-heavy though ? I think I played with an rwsem instead
> > of a sem in the past too (though be careful if you try this, as for
> > no good reason the return codes are inverted ;-()
>
> Yes, I think all the common paths have to write, in case the
> uncommon paths (truncation and swapout) want to read: the wrong
> way round for any kind of read-write optimization, isn't it?

In common workloads e.g., add libc mapping using __vma_prio_tree_insert,
mostly you do not add new nodes to the tree. Instead, you just add to
a vm_set list. I am currently considering using rwsem to optimize
such cases. Similarly __vma_prio_tree_remove can also be optimized
in some common cases. I don't know whether it will help. Let us see...

Rajesh



2004-04-12 21:03:11

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> On Mon, 12 Apr 2004, Hugh Dickins wrote:
>> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>> >
>> > If it were just a list, maybe RCU would be appropriate. It might be
>> > rather write-heavy though ? I think I played with an rwsem instead
>> > of a sem in the past too (though be careful if you try this, as for
>> > no good reason the return codes are inverted ;-()
>>
>> Yes, I think all the common paths have to write, in case the
>> uncommon paths (truncation and swapout) want to read: the wrong
>> way round for any kind of read-write optimization, isn't it?

But isn't objrmap a big read case? ;-)

> In common workloads e.g., add libc mapping using __vma_prio_tree_insert,
> mostly you do not add new nodes to the tree. Instead, you just add to
> a vm_set list. I am currently considering using rwsem to optimize
> such cases. Similarly __vma_prio_tree_remove can also be optimized
> in some common cases. I don't know whether it will help. Let us see...

Sounds interesting ... so basically you're breaking out the locking of
the tree itself separately?

M.

PS. In the diffprofiles, I observed that Andrea had killed one of the large
remaining lock entries (.text.lock.filemap). Turns out he'd turned the
locking in find_get_page from "spin_lock(&mapping->page_lock)" into
"spin_lock_irq(&mapping->tree_lock)", and I'm using readprofile, which
doesn't profile with irqs off, so it's not really disappeared, just hidden.
Not sure which sub-patch that comes from, and it turned out to be a bit of
a dead end, but whilst I'm there, I thought I'd point out this was contended,
and show the diffprofile with and without spinline for aa5:

22210 246777.8% find_trylock_page
2538 36.4% atomic_dec_and_lock
1249 146.6% grab_block
1042 99.6% kmap_high
882 29400.0% find_get_pages
868 69.1% file_kill
744 30.9% file_move
499 236.5% proc_pid_readlink
433 82.8% d_instantiate
389 110.2% kunmap_high
319 52.4% ext2_new_block
303 27.2% d_alloc
220 44.9% prune_dcache
206 3.1% __wake_up
195 26.4% new_inode
194 71.6% d_delete
161 33.5% d_path
146 53.9% group_reserve_blocks
124 11.4% __mark_inode_dirty
117 13.9% __find_get_block_slow
116 45.7% __insert_inode_hash
113 8.3% page_address
106 5.0% proc_pid_stat
...
-216 -100.0% .text.lock.namespace
-244 -1.1% __down
-352 -100.0% .text.lock.inode
-684 -100.0% .text.lock.base
-887 -96.3% find_get_pages_tag
-1269 -100.0% .text.lock.highmem
-1523 -100.0% .text.lock.file_table
-1535 -100.0% .text.lock.dcache
-1549 -0.2% total
-2834 -100.0% .text.lock.dec_and_lock
-2915 -0.6% default_idle
-21908 -99.8% find_get_page

(SDET 128 on the 16-way NUMA-Q).
(this basically shows who was taking the locks we see in profiles).
Still not quite sure why inlining the spinlocks did this, to be honest:

22210 246777.8% find_trylock_page
-21908 -99.8% find_get_page

as neither seems to call the other. Humpf.


2004-04-12 21:10:43

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

"Martin J. Bligh" <[email protected]> wrote:
>
> Turns out he'd turned the
> locking in find_get_page from "spin_lock(&mapping->page_lock)" into
> "spin_lock_irq(&mapping->tree_lock)",

That's from the use-radix-tree-walks-for-writeback code.

Use oprofile - it's NMI-based.

> and I'm using readprofile, which
> doesn't profile with irqs off, so it's not really disappeared, just hidden.
> Not sure which sub-patch that comes from, and it turned out to be a bit of
> a dead end, but whilst I'm there, I thought I'd point out this was contended,
> and show the diffprofile with and without spinline for aa5:
>
> 22210 246777.8% find_trylock_page
> 2538 36.4% atomic_dec_and_lock

profiler brokenness, surely. Almost nothing calls find_trylock_page(),
unless Andrea has done something peculiar. Use oprofile.

2004-04-12 21:32:34

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

--On Monday, April 12, 2004 14:12:44 -0700 Andrew Morton <[email protected]> wrote:

> "Martin J. Bligh" <[email protected]> wrote:
>>
>> Turns out he'd turned the
>> locking in find_get_page from "spin_lock(&mapping->page_lock)" into
>> "spin_lock_irq(&mapping->tree_lock)",
>
> That's from the use-radix-tree-walks-for-writeback code.
>
> Use oprofile - it's NMI-based.
>
>> and I'm using readprofile, which
>> doesn't profile with irqs off, so it's not really disappeared, just hidden.
>> Not sure which sub-patch that comes from, and it turned out to be a bit of
>> a dead end, but whilst I'm there, I thought I'd point out this was contended,
>> and show the diffprofile with and without spinline for aa5:
>>
>> 22210 246777.8% find_trylock_page
>> 2538 36.4% atomic_dec_and_lock
>
> profiler brokenness, surely. Almost nothing calls find_trylock_page(),
> unless Andrea has done something peculiar. Use oprofile.

Well, he did do this:

@@ -413,11 +412,11 @@ struct page *find_trylock_page(struct ad
{
struct page *page;

- spin_lock(&mapping->page_lock);
+ spin_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page && TestSetPageLocked(page))
page = NULL;
- spin_unlock(&mapping->page_lock);
+ spin_unlock_irq(&mapping->tree_lock);
return page;
}

Which would stop it appearing in readprofile. But why spinlock inlining
should affect that one way or the other is beyond me. I'll see about
using oprofile, but it's not a trivial conversion (it's all scripted).
There are no other occurrences of that in his patchset. But you're right,
only xfs and free_swap_and_cache seem to use it, and I'm not swapping.

M.

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


This patch is another attempt at reducing the contention on i_shared_sem.
The patch converts i_shared_sem from normal semaphore to read-write
semaphore. The locking rules used are:

1) A prio_tree cannot be modified without holding write lock.
2) However, vmas can be added and removed from a vm_set list
by just holding the read lock and a bit lock (vm_set_lock)
in the corresponding prio_tree node.
3) All objrmap functions just hold read lock now. So when we
walk a vm_set list we have to hold the corresponding
vm_set_lock.
4) Since truncate uses write lock (provides exclusion) we don't
have to take vm_set_locks.

Martin! When you get time to test your SDET with this patch, please
let me know whether this patch helps you at all. The patch applies
on top of 2.6.5-mjb1+anobjrmap9_prio_tree.


fs/hugetlbfs/inode.c | 4 -
fs/inode.c | 2
include/linux/fs.h | 2
include/linux/mm.h | 127 ++++++++++++++++++++++++++++++++++++++++++++--
include/linux/prio_tree.h | 3 +
kernel/fork.c | 34 +++++++++++-
mm/fremap.c | 4 -
mm/memory.c | 4 -
mm/mmap.c | 120 ++++++++++++++++++++++++++++++++-----------
mm/mremap.c | 8 +-
mm/prio_tree.c | 117 ++++++++++++++++++++++++++++--------------
mm/rmap.c | 46 ++++++++++------
12 files changed, 365 insertions(+), 106 deletions(-)

diff -puN fs/hugetlbfs/inode.c~110_sem_contention fs/hugetlbfs/inode.c
--- mmlinux-2.6/fs/hugetlbfs/inode.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/fs/hugetlbfs/inode.c 2004-04-14 15:49:01.000000000 -0400
@@ -325,14 +325,14 @@ static int hugetlb_vmtruncate(struct ino
pgoff = offset >> HPAGE_SHIFT;

inode->i_size = offset;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
truncate_hugepages(mapping, offset);
return 0;
}
diff -puN fs/inode.c~110_sem_contention fs/inode.c
--- mmlinux-2.6/fs/inode.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/fs/inode.c 2004-04-14 15:49:01.000000000 -0400
@@ -185,7 +185,7 @@ void inode_init_once(struct inode *inode
sema_init(&inode->i_sem, 1);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.page_lock);
- init_MUTEX(&inode->i_data.i_shared_sem);
+ init_rwsem(&inode->i_data.i_shared_sem);
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
diff -puN include/linux/fs.h~110_sem_contention include/linux/fs.h
--- mmlinux-2.6/include/linux/fs.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/fs.h 2004-04-14 15:49:01.000000000 -0400
@@ -333,7 +333,7 @@ struct address_space {
struct prio_tree_root i_mmap; /* tree of private mappings */
struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
- struct semaphore i_shared_sem; /* protect both above lists */
+ struct rw_semaphore i_shared_sem; /* protect both above lists */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN include/linux/mm.h~110_sem_contention include/linux/mm.h
--- mmlinux-2.6/include/linux/mm.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/mm.h 2004-04-14 15:49:01.000000000 -0400
@@ -87,6 +87,10 @@ struct vm_area_struct {
/*
* shared.vm_set : list of vmas that map exactly the same set of pages
* vm_set_head : head of the vm_set list
+ *
+ * Both shared.vm_set.list and vm_set_head are protected by VM_SET_LOCK
+ * bit of the corresponding tree node's vm_flags when accessed under
+ * down_read(i_shared_sem)
*/
struct vm_area_struct *vm_set_head;

@@ -133,6 +137,8 @@ struct vm_area_struct {
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */

+#define VM_SET_LOCK 24 /* Lock bit for vm_set list, head */
+
/* It makes sense to apply VM_ACCOUNT to this vma. */
#define VM_MAYACCT(vma) (!!((vma)->vm_flags & VM_HUGETLB))

@@ -156,6 +162,13 @@ struct vm_area_struct {
* The following macros are used for implementing prio_tree for i_mmap{_shared}
*/

+#define vm_set_lock(vma) bit_spin_lock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+#define vm_set_trylock(vma) bit_spin_trylock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+#define vm_set_unlock(vma) bit_spin_unlock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+
#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
/* avoid overflow */
@@ -202,7 +215,8 @@ static inline int vma_shared_empty(struc

/*
* Helps to add a new vma that maps the same (identical) set of pages as the
- * old vma to an i_mmap tree.
+ * old vma to an i_mmap tree. No new tree node is added by this function.
+ * The new vma is added to an already existing tree node's vm_set list.
*/
static inline void __vma_prio_tree_add(struct vm_area_struct *vma,
struct vm_area_struct *old)
@@ -229,6 +243,37 @@ static inline void __vma_prio_tree_add(s
}

/*
+ * Delete a vm_set list node from an i_mmap tree. Note that this function
+ * should not be called with a tree node, i.e., shared.both.parent != NULL.
+ */
+static inline void __vma_prio_tree_del(struct vm_area_struct *vma)
+{
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->shared.both.parent);
+
+ if (vma->vm_set_head) {
+ struct vm_area_struct *tree_node, *new_head;
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->vm_set_head->vm_set_head != vma);
+ tree_node = vma->vm_set_head;
+ if (!list_empty(&vma->shared.vm_set.list)) {
+ new_head = list_entry(
+ vma->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&vma->shared.vm_set.list);
+ tree_node->vm_set_head = new_head;
+ new_head->vm_set_head = tree_node;
+ }
+ else
+ tree_node->vm_set_head = NULL;
+ } else
+ list_del_init(&vma->shared.vm_set.list);
+
+ INIT_VMA_SHARED(vma);
+}
+
+/*
* We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
* already present in an i_mmap{_shared} tree without modifying the tree. The
* following helper function should be used when such modifications are
@@ -250,6 +295,25 @@ static inline void __vma_modify(struct p
}

/*
+ * Find a vma with given radix_index and heap_index in the prio_tree. Return
+ * the vma pointer if found, NULL otherwise.
+ */
+static inline struct vm_area_struct *__vma_prio_tree_find(
+ struct prio_tree_root *root, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ struct prio_tree_node *ptr;
+
+ ptr = prio_tree_find(root, radix_index, heap_index);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+/*
* Helper functions to enumerate vmas that map a given file page or a set of
* contiguous file pages. The functions return vmas that at least map a single
* page in the given range of contiguous file pages.
@@ -269,11 +333,19 @@ static inline struct vm_area_struct *__v
return NULL;
}

-static inline struct vm_area_struct *__vma_prio_tree_next(
- struct vm_area_struct *vma, struct prio_tree_root *root,
- struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+static inline struct vm_area_struct *__vma_prio_tree_first_lock(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long begin, unsigned long end)
+{
+ struct vm_area_struct *vma;
+ vma = __vma_prio_tree_first(root, iter, begin, end);
+ if (vma)
+ vm_set_lock(vma);
+ return vma;
+}
+
+static inline struct vm_area_struct *__vm_set_next(struct vm_area_struct *vma)
{
- struct prio_tree_node *ptr;
struct vm_area_struct *next;

if (vma->shared.both.parent) {
@@ -286,6 +358,19 @@ static inline struct vm_area_struct *__v
if (!(next->vm_set_head))
return next;
}
+ return NULL;
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ next = __vm_set_next(vma);
+ if (next)
+ return next;

ptr = prio_tree_next(root, iter, begin, end);

@@ -296,6 +381,38 @@ static inline struct vm_area_struct *__v
return NULL;
}

+static inline void __vma_prio_tree_iter_unlock(struct prio_tree_iter *iter)
+{
+ struct vm_area_struct *vma;
+ vma = prio_tree_entry(iter->cur, struct vm_area_struct,
+ shared.prio_tree_node);
+ vm_set_unlock(vma);
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next_lock(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ next = __vm_set_next(vma);
+ if (next)
+ return next;
+
+ __vma_prio_tree_iter_unlock(iter);
+ ptr = prio_tree_next(root, iter, begin, end);
+
+ if (ptr) {
+ next = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ vm_set_lock(next);
+ return next;
+ } else
+ return NULL;
+}
+
+
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
diff -puN include/linux/prio_tree.h~110_sem_contention include/linux/prio_tree.h
--- mmlinux-2.6/include/linux/prio_tree.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/prio_tree.h 2004-04-14 15:49:01.000000000 -0400
@@ -67,6 +67,9 @@ static inline int prio_tree_right_empty(
extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *,
struct prio_tree_node *);

+extern struct prio_tree_node *prio_tree_find(struct prio_tree_root *,
+ unsigned long, unsigned long);
+
extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);

extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *,
diff -puN kernel/fork.c~110_sem_contention kernel/fork.c
--- mmlinux-2.6/kernel/fork.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/kernel/fork.c 2004-04-14 15:49:01.000000000 -0400
@@ -326,15 +326,43 @@ static inline int dup_mmap(struct mm_str
file = tmp->vm_file;
INIT_VMA_SHARED(tmp);
if (file) {
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_dentry->d_inode;
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);

/* insert tmp into the share list, just after mpnt */
- down(&file->f_mapping->i_shared_sem);
- __vma_prio_tree_add(tmp, mpnt);
- up(&file->f_mapping->i_shared_sem);
+ if (down_write_trylock(&mapping->i_shared_sem)) {
+ __vma_prio_tree_add(tmp, mpnt);
+ up_write(&mapping->i_shared_sem);
+ }
+ else {
+ if (unlikely(mpnt->vm_flags & VM_NONLINEAR)) {
+ down_write(&mapping->i_shared_sem);
+ list_add(&tmp->shared.vm_set.list,
+ &mpnt->shared.vm_set.list);
+ up_write(&mapping->i_shared_sem);
+ }
+ else {
+ struct vm_area_struct *tree_node;
+ struct prio_tree_root *root;
+ if (mpnt->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;
+
+ down_read(&mapping->i_shared_sem);
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(mpnt),
+ HEAP_INDEX(mpnt));
+ BUG_ON(!tree_node);
+ vm_set_lock(tree_node);
+ __vma_prio_tree_add(tmp, mpnt);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ }
+ }
}

/*
diff -puN mm/fremap.c~110_sem_contention mm/fremap.c
--- mmlinux-2.6/mm/fremap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/fremap.c 2004-04-14 15:49:01.000000000 -0400
@@ -203,13 +203,13 @@ asmlinkage long sys_remap_file_pages(uns
linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT);
if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) {
mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
vma->vm_flags |= VM_NONLINEAR;
__vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
INIT_VMA_SHARED_LIST(vma);
list_add_tail(&vma->shared.vm_set.list,
&mapping->i_mmap_nonlinear);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
}

/* ->populate can take a long time, so downgrade the lock. */
diff -puN mm/memory.c~110_sem_contention mm/memory.c
--- mmlinux-2.6/mm/memory.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/memory.c 2004-04-14 15:49:01.000000000 -0400
@@ -1133,14 +1133,14 @@ void invalidate_mmap_range(struct addres
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
}
EXPORT_SYMBOL_GPL(invalidate_mmap_range);

diff -puN mm/mmap.c~110_sem_contention mm/mmap.c
--- mmlinux-2.6/mm/mmap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mmap.c 2004-04-14 15:49:01.000000000 -0400
@@ -96,10 +96,43 @@ static void remove_shared_vm_struct(stru

if (file) {
struct address_space *mapping = file->f_mapping;
- down(&mapping->i_shared_sem);
- __remove_shared_vm_struct(vma, file->f_dentry->d_inode,
- mapping);
- up(&mapping->i_shared_sem);
+ struct inode *inode = file->f_dentry->d_inode;
+
+ if (down_write_trylock(&mapping->i_shared_sem)) {
+ __remove_shared_vm_struct(vma, inode, mapping);
+ up_write(&mapping->i_shared_sem);
+ return;
+ }
+
+ if (likely(!(vma->vm_flags & VM_NONLINEAR) &&
+ !vma->shared.both.parent)) {
+ struct prio_tree_root *root;
+ struct vm_area_struct *tree_node;
+ if (vma->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;
+
+ down_read(&mapping->i_shared_sem);
+ if (unlikely(vma->shared.both.parent)) {
+ up_read(&mapping->i_shared_sem);
+ goto get_write;
+ }
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(vma), HEAP_INDEX(vma));
+ BUG_ON(!tree_node);
+ vm_set_lock(tree_node);
+ if (vma->vm_flags & VM_DENYWRITE)
+ atomic_inc(&inode->i_writecount);
+ __vma_prio_tree_del(vma);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ return;
+ }
+get_write:
+ down_write(&mapping->i_shared_sem);
+ __remove_shared_vm_struct(vma, inode, mapping);
+ up_write(&mapping->i_shared_sem);
}
}

@@ -291,26 +324,58 @@ __vma_link(struct mm_struct *mm, struct
{
__vma_link_list(mm, vma, prev, rb_parent);
__vma_link_rb(mm, vma, rb_link, rb_parent);
- __vma_link_file(vma);
}

static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
+ struct prio_tree_root *root;
+ struct vm_area_struct *tree_node;
struct address_space *mapping = NULL;

if (vma->vm_file)
mapping = vma->vm_file->f_mapping;

- if (mapping)
- down(&mapping->i_shared_sem);
- spin_lock(&mm->page_table_lock);
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ if (mapping) {
+ if (unlikely(vma->vm_flags & VM_NONLINEAR))
+ goto get_write;
+ if (vma->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;

+ down_read(&mapping->i_shared_sem);
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(vma), HEAP_INDEX(vma));
+ if (tree_node) {
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ if (vma->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ spin_unlock(&mm->page_table_lock);
+ vm_set_lock(tree_node);
+ __vma_prio_tree_add(vma, tree_node);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ }
+ else {
+ up_read(&mapping->i_shared_sem);
+get_write:
+ down_write(&mapping->i_shared_sem);
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link_file(vma);
+ spin_unlock(&mm->page_table_lock);
+ up_write(&mapping->i_shared_sem);
+ }
+ }
+ else {
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ spin_unlock(&mm->page_table_lock);
+ }
mark_mm_hugetlb(mm, vma);
mm->map_count++;
validate_mm(mm);
@@ -331,6 +396,7 @@ __insert_vm_struct(struct mm_struct * mm
if (__vma && __vma->vm_start < vma->vm_end)
BUG();
__vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link_file(vma);
mark_mm_hugetlb(mm, vma);
mm->map_count++;
validate_mm(mm);
@@ -410,7 +476,7 @@ static struct vm_area_struct *vma_merge(
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
struct address_space *mapping = file ? file->f_mapping : NULL;
- struct semaphore *i_shared_sem;
+ struct rw_semaphore *i_shared_sem;
struct prio_tree_root *root = NULL;

/*
@@ -442,13 +508,9 @@ static struct vm_area_struct *vma_merge(
if (prev->vm_end == addr &&
can_vma_merge_after(prev, vm_flags, file, pgoff)) {
struct vm_area_struct *next;
- int need_up = 0;

- if (unlikely(file && prev->vm_next &&
- prev->vm_next->vm_file == file)) {
- down(i_shared_sem);
- need_up = 1;
- }
+ if (file)
+ down_write(i_shared_sem);
spin_lock(lock);

/*
@@ -463,20 +525,18 @@ static struct vm_area_struct *vma_merge(
next->vm_end, prev->vm_pgoff);
__remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
- if (file)
+ if (file) {
+ up_write(i_shared_sem);
fput(file);
-
+ }
mm->map_count--;
kmem_cache_free(vm_area_cachep, next);
return prev;
}
-
__vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ if (file)
+ up_write(i_shared_sem);
return prev;
}

@@ -491,13 +551,13 @@ static struct vm_area_struct *vma_merge(
return NULL;
if (end == prev->vm_start) {
if (file)
- down(i_shared_sem);
+ down_write(i_shared_sem);
spin_lock(lock);
__vma_modify(root, prev, addr, prev->vm_end,
prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
if (file)
- up(i_shared_sem);
+ up_write(i_shared_sem);
return prev;
}
}
@@ -1362,7 +1422,7 @@ int split_vma(struct mm_struct * mm, str
}

if (mapping)
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);

if (new_below)
@@ -1375,7 +1435,7 @@ int split_vma(struct mm_struct * mm, str

spin_unlock(&mm->page_table_lock);
if (mapping)
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);

return 0;
}
diff -puN mm/mremap.c~110_sem_contention mm/mremap.c
--- mmlinux-2.6/mm/mremap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mremap.c 2004-04-14 15:49:01.000000000 -0400
@@ -295,7 +295,7 @@ static unsigned long move_vma(struct vm_
* and we propagate stale pages into the dst afterward.
*/
mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
+ down_read(&mapping->i_shared_sem);
}
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
@@ -311,7 +311,7 @@ static unsigned long move_vma(struct vm_
new_addr = -ENOMEM;
}
if (mapping)
- up(&mapping->i_shared_sem);
+ up_read(&mapping->i_shared_sem);

/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -476,7 +476,7 @@ unsigned long do_mremap(unsigned long ad
}
else
root = &mapping->i_mmap;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
}

spin_lock(&vma->vm_mm->page_table_lock);
@@ -485,7 +485,7 @@ unsigned long do_mremap(unsigned long ad
spin_unlock(&vma->vm_mm->page_table_lock);

if(mapping)
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);

current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
diff -puN mm/prio_tree.c~110_sem_contention mm/prio_tree.c
--- mmlinux-2.6/mm/prio_tree.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/prio_tree.c 2004-04-14 15:49:01.000000000 -0400
@@ -279,6 +279,66 @@ void prio_tree_remove(struct prio_tree_r
}

/*
+ * Find a prio_tree node with the given radix_index and heap_index. This
+ * algorithm takes O(log n) time. At most 64 (less than 32 in common case)
+ * nodes are visited in a 32 bit machine.
+ */
+struct prio_tree_node *prio_tree_find(struct prio_tree_root *root,
+ unsigned long radix_index, unsigned long heap_index)
+{
+ struct prio_tree_node *cur;
+ unsigned long r_index, h_index, index, mask;
+ int size_flag = 0;
+
+ if (prio_tree_empty(root) ||
+ heap_index > prio_tree_maxindex(root->index_bits))
+ return NULL;
+
+ cur = root->prio_tree_node;
+ mask = 1UL << (root->index_bits - 1);
+
+ while (mask) {
+ GET_INDEX(cur, r_index, h_index);
+
+ if (r_index == radix_index && h_index == heap_index)
+ return cur;
+
+ if (h_index < heap_index || (h_index == heap_index &&
+ r_index > radix_index))
+ return NULL;
+
+ if (size_flag)
+ index = heap_index - radix_index;
+ else
+ index = radix_index;
+
+ if (index & mask) {
+ if (prio_tree_right_empty(cur))
+ return NULL;
+ else
+ cur = cur->right;
+ }
+ else {
+ if (prio_tree_left_empty(cur))
+ return NULL;
+ else
+ cur = cur->left;
+ }
+
+ mask >>= 1;
+
+ if (!mask) {
+ mask = 1UL << (root->index_bits - 1);
+ size_flag = 1;
+ }
+ }
+ /* Should not reach here */
+ BUG();
+ return NULL;
+}
+
+
+/*
* Following functions help to enumerate all prio_tree_nodes in the tree that
* overlap with the input interval X [radix_index, heap_index]. The enumeration
* takes O(log n + m) time where 'log n' is the height of the tree (which is
@@ -529,55 +589,34 @@ void __vma_prio_tree_insert(struct prio_
void __vma_prio_tree_remove(struct prio_tree_root *root,
struct vm_area_struct *vma)
{
- struct vm_area_struct *node, *head, *new_head;
+ struct vm_area_struct *head, *new_head;

- if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) {
- list_del_init(&vma->shared.vm_set.list);
- INIT_VMA_SHARED(vma);
+ if (!vma->shared.both.parent) {
+ __vma_prio_tree_del(vma);
return;
}

if (vma->vm_set_head) {
/* Leave this BUG_ON till prio_tree patch stabilizes */
BUG_ON(vma->vm_set_head->vm_set_head != vma);
- if (vma->shared.both.parent) {
- head = vma->vm_set_head;
- if (!list_empty(&head->shared.vm_set.list)) {
- new_head = list_entry(
- head->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&head->shared.vm_set.list);
- }
- else
- new_head = NULL;
+ head = vma->vm_set_head;
+ if (!list_empty(&head->shared.vm_set.list)) {
+ new_head = list_entry(head->shared.vm_set.list.next,
+ struct vm_area_struct, shared.vm_set.list);
+ list_del_init(&head->shared.vm_set.list);
+ }
+ else
+ new_head = NULL;

- prio_tree_replace(root, &vma->shared.prio_tree_node,
- &head->shared.prio_tree_node);
- head->vm_set_head = new_head;
- if (new_head)
- new_head->vm_set_head = head;
+ prio_tree_replace(root, &vma->shared.prio_tree_node,
+ &head->shared.prio_tree_node);
+ head->vm_set_head = new_head;
+ if (new_head)
+ new_head->vm_set_head = head;

- }
- else {
- node = vma->vm_set_head;
- if (!list_empty(&vma->shared.vm_set.list)) {
- new_head = list_entry(
- vma->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&vma->shared.vm_set.list);
- node->vm_set_head = new_head;
- new_head->vm_set_head = node;
- }
- else
- node->vm_set_head = NULL;
- }
- INIT_VMA_SHARED(vma);
- return;
- }
+ } else
+ prio_tree_remove(root, &vma->shared.prio_tree_node);

- prio_tree_remove(root, &vma->shared.prio_tree_node);
INIT_VMA_SHARED(vma);
}

diff -puN mm/rmap.c~110_sem_contention mm/rmap.c
--- mmlinux-2.6/mm/rmap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/rmap.c 2004-04-14 15:49:01.000000000 -0400
@@ -279,10 +279,10 @@ static inline int page_referenced_obj(st
unsigned long address;
int referenced = 0;

- if (down_trylock(&mapping->i_shared_sem))
+ if (!down_read_trylock(&mapping->i_shared_sem))
return 0;

- vma = __vma_prio_tree_first(&mapping->i_mmap,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
@@ -297,11 +297,11 @@ static inline int page_referenced_obj(st
if (!*mapcount)
goto out;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap,
&iter, pgoff, pgoff);
}

- vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
@@ -315,14 +315,17 @@ static inline int page_referenced_obj(st
if (!*mapcount)
goto out;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap_shared,
&iter, pgoff, pgoff);
}
-
+
if (list_empty(&mapping->i_mmap_nonlinear))
- WARN_ON(*mapcount > 0);
+ WARN_ON(*mapcount > 0);
+ up_read(&mapping->i_shared_sem);
+ return referenced;
out:
- up(&mapping->i_shared_sem);
+ __vma_prio_tree_iter_unlock(&iter);
+ up_read(&mapping->i_shared_sem);
return referenced;
}

@@ -711,10 +714,10 @@ static inline int try_to_unmap_obj(struc
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;

- if (down_trylock(&mapping->i_shared_sem))
+ if (!down_read_trylock(&mapping->i_shared_sem))
return ret;

- vma = __vma_prio_tree_first(&mapping->i_mmap,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_mm->rss) {
@@ -722,13 +725,13 @@ static inline int try_to_unmap_obj(struc
ret = try_to_unmap_one(
page, vma->vm_mm, address, mapcount, vma);
if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ goto out_read;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap,
&iter, pgoff, pgoff);
}

- vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_mm->rss) {
@@ -736,14 +739,18 @@ static inline int try_to_unmap_obj(struc
ret = try_to_unmap_one(
page, vma->vm_mm, address, mapcount, vma);
if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ goto out_read;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap_shared,
&iter, pgoff, pgoff);
}

if (list_empty(&mapping->i_mmap_nonlinear))
- goto out;
+ goto nolock;
+
+ up_read(&mapping->i_shared_sem);
+ if (!down_write_trylock(&mapping->i_shared_sem))
+ return ret;

list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
@@ -813,7 +820,12 @@ static inline int try_to_unmap_obj(struc
relock:
rmap_lock(page);
out:
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
+ return ret;
+out_read:
+ __vma_prio_tree_iter_unlock(&iter);
+nolock:
+ up_read(&mapping->i_shared_sem);
return ret;
}


_

2004-04-15 00:05:34

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Wed, Apr 14, 2004 at 04:18:38PM -0400, Rajesh Venkatasubramanian wrote:
>
> This patch is another attempt at reducing the contention on i_shared_sem.
> The patch converts i_shared_sem from normal semaphore to read-write
> semaphore. The locking rules used are:
>
> 1) A prio_tree cannot be modified without holding write lock.
> 2) However, vmas can be added and removed from a vm_set list
> by just holding the read lock and a bit lock (vm_set_lock)
> in the corresponding prio_tree node.

no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
patch seems very broken to me, it should randomly corrupt memory in
vma->vm_flags while racing against mprotect etc.. or am I missing
something?

> 3) All objrmap functions just hold read lock now. So when we
> walk a vm_set list we have to hold the corresponding
> vm_set_lock.
> 4) Since truncate uses write lock (provides exclusion) we don't
> have to take vm_set_locks.
>
> Martin! When you get time to test your SDET with this patch, please
> let me know whether this patch helps you at all. The patch applies
> on top of 2.6.5-mjb1+anobjrmap9_prio_tree.

I considered converting it to a rwsem too; details are in the email
I posted while providing the rwspinlock solution to the parisc cache
flushing code.

As I wrote there, I wasn't convinced in the common case this is going to
gain anything significant (the only thing that sleeps while the
semaphore is held is truncate, and truncate during paging on the same
inode isn't an extremely common case, especially for the big apps), and
it makes it a bit more complicated, but giving it a try will be
interesting. I was mostly interested about having the objrmap code very
rarely failing the trylock during paging (that semaphore is by far the
biggest scalability hit during paging of shm, but the cacheline bouncing
won't be avoided by the rwsem). To make the paging scale better
(something SDET cannot measure) I don't need a safe vm_set_lock, I
believe simply making it a rwsem is the way to go just to make the
paging potentially scale a bit better. I rated implementing the locking
abstraction to fixup the basic parisc race as a bit higher prio, after
that it should be easy to have it implementing a rwsem for all archs w/o
cache flushing, the abstraction will have to expose a read/write
functionality for the rwlock. I'm not convinced your double locking is
going to boost anything even if it would be safe, I'd just take it in
write mode when the tree is being modified, with the only object of
avoiding the paging to block (and potentially to avoid blocking against
big truncates too).

2004-04-15 00:11:44

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> Martin! When you get time to test your SDET with this patch, please
>> let me know whether this patch helps you at all. The patch applies
>> on top of 2.6.5-mjb1+anobjrmap9_prio_tree.
>
> I considered converting it to a rwsem too; details are in the email
> I posted while providing the rwspinlock solution to the parisc cache
> flushing code.

I will try it, but I'm not convinced it'll help. I profiled the takers
of i_shared_sem, and I think they're all writers (and I tried rwsem on
the simple linked list before with no benefit).

M.

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> > 2) However, vmas can be added and removed from a vm_set list
> > by just holding the read lock and a bit lock (vm_set_lock)
> > in the corresponding prio_tree node.
>
> no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
> patch seems very broken to me, it should randomly corrupt memory in
> vma->vm_flags while racing against mprotect etc.. or am I missing
> something?

I don't know why bit_spin_lock with vma->vm_flags should be a problem
if it is used without mmap_sem. Can you explain ?

Anyway, in my patch, the only places that use vm_set_lock without
mmap_sem are __vma_prio_tree_first_lock and __vma_prio_tree_next_lock.
If it is really racy to use bit_spin_lock on vm_flags without mmap_sem
(I am not sure and I am not convinced that it is racy), then we can
revert these changes and take down_write on the page out path.

Well. In that case, we can use rwsem as you mentioned below: take
down_write on all modifications and take down_read on pageout. Here, you
allow multiple parallel page_referenced and try_to_unmap on the same
inode, however with only one modification at a time.

Whereas my solution will allow multiple modifications at the same
time (if possible) with only one pageout routine at a time. I chose
this solution because Martin's SDET took a big hit in the common cases of
adding and removing vmas from the i_mmap{_shared} data structure.

Thanks,
Rajesh

>
> > 3) All objrmap functions just hold read lock now. So when we
> > walk a vm_set list we have to hold the corresponding
> > vm_set_lock.
> > 4) Since truncate uses write lock (provides exclusion) we don't
> > have to take vm_set_locks.
> >
> > Martin! When you get time to test your SDET with this patch, please
> > let me know whether this patch helps you at all. The patch applies
> > on top of 2.6.5-mjb1+anobjrmap9_prio_tree.
>
> I considered converting it to a rwsem too; details are in the email
> I posted while providing the rwspinlock solution to the parisc cache
> flushing code.
>
> As I wrote there, I wasn't convinced in the common case this is going to
> gain anything significant (the only thing that sleeps while the
> semaphore is held is truncate, and truncate during paging on the same
> inode isn't an extremely common case, especially for the big apps), and
> it makes it a bit more complicated, but giving it a try will be
> interesting. I was mostly interested about having the objrmap code very
> rarely failing the trylock during paging (that semaphore is by far the
> biggest scalability hit during paging of shm, but the cacheline bouncing
> won't be avoided by the rwsem). To make the paging scale better
> (something SDET cannot measure) I don't need a safe vm_set_lock, I
> believe simply making it a rwsem is the way to go just to make the
> paging potentially scale a bit better. I rated implementing the locking
> abstraction to fixup the basic parisc race as a bit higher prio, after
> that it should be easy to have it implementing a rwsem for all archs w/o
> cache flushing, the abstraction will have to expose a read/write
> functionality for the rwlock. I'm not convinced your double locking is
> going to boost anything even if it would be safe, I'd just take it in
> write mode when the tree is being modified, with the only object of
> avoiding the paging to block (and potentially to avoid blocking against
> big truncates too).
>

2004-04-15 06:23:33

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> Wherease my solution will allow multiple modifications at the same
> time (if possible) with only one pageout routine at a time. I chose
> this solution because Martin's SDET took big hit in common cases of
> adding and removing vmas from the i_mmap{_shared} data structure.

FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
into a spinlock. I'll try doing the same on top of prio-tree next.

M.

2004-04-15 10:26:16

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Wed, 14 Apr 2004, Martin J. Bligh wrote:
>
> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> into a spinlock. I'll try doing the same on top of prio-tree next.

Good news, though not a surprise.

Any ideas how we might handle latency from vmtruncate (and
try_to_unmap) if using prio_tree with i_shared_lock spinlock?

Hugh

2004-04-15 12:52:35

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, Apr 15, 2004 at 11:26:09AM +0100, Hugh Dickins wrote:
> On Wed, 14 Apr 2004, Martin J. Bligh wrote:
> >
> > FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> > into a spinlock. I'll try doing the same on top of prio-tree next.
>
> Good news, though not a surprise.
>
> Any ideas how we might handle latency from vmtruncate (and
> try_to_unmap) if using prio_tree with i_shared_lock spinlock?

we'd need to break the loop after need_resched returns 1 (and then the
second time through we'd just screw the latency and go ahead). I also wanted
to make it a spinlock again like in 2.4; the semaphore probably generates
overscheduling. OTOH the semaphore saved some cpu in slightly different
workloads with big truncates (plus it made the cond_resched trivial w/o
requiring a loop break), and I agree with Andrew about that: Martin isn't
benchmarking the other side, the one that made Andrew change it to a
semaphore.
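
A rough sketch of that break-once scheme with a spinlock - struct trunc_item
and its "done" flag are hypothetical stand-ins for the per-vma truncate
work, not code from any posted patch: drop the lock and restart the walk
the first time need_resched() fires, and on the second pass just finish
regardless of latency.

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/sched.h>

struct trunc_item {
	struct list_head list;
	int done;			/* this vma has already been handled */
};

static void truncate_all(spinlock_t *lock, struct list_head *head)
{
	struct trunc_item *item;
	int restarted = 0;

restart:
	spin_lock(lock);
	list_for_each_entry(item, head, list) {
		if (item->done)
			continue;	/* restarting is cheap: finished work is skipped */
		item->done = 1;		/* stands in for unmapping one vma */

		if (!restarted && need_resched()) {
			spin_unlock(lock);
			cond_resched();
			restarted = 1;	/* second time through: ignore latency */
			goto restart;
		}
	}
	spin_unlock(lock);
}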

2004-04-15 13:00:33

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Wed, Apr 14, 2004 at 11:40:52PM -0400, Rajesh Venkatasubramanian wrote:
>
> > > 2) However, vmas can be added and removed from a vm_set list
> > > by just holding the read lock and a bit lock (vm_set_lock)
> > > in the corresponding prio_tree node.
> >
> > no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
> > patch seems very broken to me, it should randomly corrupt memory in
> > vma->vm_flags while racing against mprotect etc.. or am I missing
> > something?
>
> I don't know why bit_spin_lock with vma->vm_flags should be a problem
> if it is used without mmap_sem. Can you explain ?

you seem not to know all rules about the atomic operations in smp, you
cannot just set_bit on one side and use non-atomic operations on the
other side, and expect the set_bit not to invalidate the non-atomic
operations.

The effect of the mprotect may be deleted by your new concurrent
set_bit and stuff like that.
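
For anyone who wants to see the hazard outside the kernel, here is a
standalone userspace demo in plain C with pthreads (nothing here is kernel
code; build with gcc -O2 -pthread). One thread does plain read-modify-writes
on a flags word, the way mprotect rewrites vm_flags under mmap_sem, while
the other atomically sets and clears a high bit in the same word, the way
the proposed vm_set_lock would. On an SMP machine the plain RMW regularly
loads the word just before the atomic set and stores it back just after,
wiping the bit out, so the "lost" count is typically nonzero:

#include <pthread.h>
#include <stdio.h>

#define LOCKBIT	(1UL << 24)
#define ITERS	(10 * 1000 * 1000)

static volatile unsigned long flags;

static void *plain_rmw(void *arg)
{
	/* stands in for mprotect: non-atomic update of the low bits */
	for (int i = 0; i < ITERS; i++)
		flags = (flags & ~0x7UL) | (unsigned long)(i & 0x7);
	return NULL;
}

static void *atomic_bit(void *arg)
{
	long lost = 0;

	/* stands in for bit_spin_lock/unlock on vm_flags */
	for (int i = 0; i < ITERS; i++) {
		__sync_fetch_and_or(&flags, LOCKBIT);
		if (!(flags & LOCKBIT))
			lost++;		/* a stale plain store wiped our bit */
		__sync_fetch_and_and(&flags, ~LOCKBIT);
	}
	printf("lock bit lost %ld times\n", lost);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, plain_rmw, NULL);
	pthread_create(&b, NULL, atomic_bit, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}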

> If it is really racy to use bit_spin_lock on vm_flags without mmap_sem

it __definitely__ is racy.

> Well. In that case, we can use rwsem as you mentioned below: take
> down_write on all modifications and take down_read on pageout. Here, you

exactly, this also avoids the more complex (and racy, but the racy would
be easy to fix by adding another 4 atomic bytes to the vma) double
locking that you introduced.

> allow multiple parallel page_referenced and try_to_unmap on the same
> inode, however with only one modification at a time.

exactly.

> Whereas my solution will allow multiple modifications at the same
> time (if possible) with only one pageout routine at a time. I chose
> this solution because Martin's SDET took a big hit in the common cases of
> adding and removing vmas from the i_mmap{_shared} data structure.

you can still fix the SMP race condition by trivially adding 4 bytes to
the vma (i.e. a vma->vm_flags_atomic), but I'd be surprised if your
double locking actually improved things. Martin is running on a very
parallel old-NUMA box where cacheline bouncing across nodes passes through
a fibre channel IIRC, and the cacheline bouncing that the semaphore
generates is huge; it's not necessarily huge contention, it's just the
bouncing that hurts, and the down_read won't help at all with the
cacheline thrashing, it'll still bounce like before. You may gain
something minor, but I doubt it'd be huge.

I'd suggest Martin give the racy code a try; it's just good enough
for a practical experiment (the race shouldn't easily trigger, so it
probably passes one run of SDET safely).

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> > I don't know why bit_spin_lock with vma->vm_flags should be a problem
> > if it is used without mmap_sem. Can you explain ?
>
> you seem not to know all rules about the atomic operations in smp, you
> cannot just set_bit on one side and use non-atomic operations on the
> other side, and expect the set_bit not to invalidate the non-atomic
> operations.
>
> The effect of the mprotect may be deleted by your new concurrent
> set_bit and stuff like that.

Thank you very much for that. Stupid me. I didn't read the code in
page->flags properly. Thanks again.

Rajesh


2004-04-15 15:41:14

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
>> into a spinlock. I'll try doing the same on top of prio-tree next.
>
> Good news, though not a surprise.
>
> Any ideas how we might handle latency from vmtruncate (and
> try_to_unmap) if using prio_tree with i_shared_lock spinlock?

I've been thinking about that. My rough plan is to go wild, naked and lockless.
If we arrange things in the correct order, new entries onto the list would
pick up the truncated image of the file (so they'd be OK). Entries removed
from the list don't matter anyway. We just need to make sure that everything
that was on the list when we start does get truncated.

Basically there are two sets of operations ... ones that map and unmap
the file object (address_space) and ones that alter it - we should be
able to proceed with inserts and deletes whilst truncating, though we
probably need to protect against the alterations. The two op types could
go under separate locking.

But I need to think on it some more - it would not surprise me to come to the
conclusion that I'm full of shit ;-) The opinions of others would be
very welcome ...

M.

2004-04-15 16:55:45

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, 15 Apr 2004, Martin J. Bligh wrote:
> >
> > Any ideas how we might handle latency from vmtruncate (and
> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>
> I've been thinking about that. My rough plan is to go wild, naked and lockless.
> If we arrange things in the correct order, new entries onto the list would

It's quite easy if there's a list - though I'm not that eager to go wild,
naked and lockless with you! But what if there's a prio_tree?

Hugh

2004-04-15 17:03:38

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> > Any ideas how we might handle latency from vmtruncate (and
>> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>>
>> I've been thinking about that. My rough plan is to go wild, naked and lockless.
>> If we arrange things in the correct order, new entries onto the list would
>
> It's quite easy if there's a list - though I'm not that eager to go wild,
> naked and lockless with you! But what if there's a prio_tree?

I still think my list-of-lists patch fixes the original problem, and is
simpler ... I'll try to get it updated, and sent out.

M.

2004-04-15 17:52:54

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, 15 Apr 2004, Martin J. Bligh wrote:
>
> I still think my list-of-lists patch fixes the original problem, and is
> simpler ... I'll try to get it updated, and sent out.

Please do, I never saw it before.

Though I have to admit I'm sceptical: prio_tree appears to be well
designed for the issue in question, list-of-lists sounds, well,
no offence, but a bit of a hack.

But we may well have overlooked the overhead of prio_tree's
complexity relative to list, and need to reconsider options.

Hugh

2004-04-15 18:49:07

by Dave McCracken

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


--On Thursday, April 15, 2004 18:50:42 +0100 Hugh Dickins
<[email protected]> wrote:

> Though I have to admit I'm sceptical: prio_tree appears to be well
> designed for the issue in question, list-of-lists sounds, well,
> no offence, but a bit of a hack.

It is a bit of a hack, but the theory behind it is fairly simple. It came
out of my early efforts to sort the list. Martin and I produced a theory
that many vmas have identical start and end addresses due to fork and/or
fixed address mappings. If this theory is true, list-of-lists will create a
much shorter top-level list of unique start-end pairs for searching. We'd
only need to walk the second-level list when we get a match to the search.
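
For reference, the data structure looked roughly like this - a sketch
reconstructed from the old list-of-lists patch, so the field names (start,
end, ranges, vmas, and vma->shared) are the patch's own while the comments
are only a summary:

#include <linux/list.h>

struct address_range {
	unsigned long start;		/* first file pgoff this range maps */
	unsigned long end;		/* last file pgoff this range maps */
	struct list_head ranges;	/* links the range on mapping->i_mmap,
					 * kept sorted by start */
	struct list_head vmas;		/* all vmas mapping exactly [start, end],
					 * linked through vma->shared */
};

An objrmap walk then scans only the top-level range list and descends into
a range's vma list only when the page's index falls inside [start, end].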

It never got any serious exposure or testing. It came out just as
everyone's attention shifted away from objrmap so no one really looked at
it.

Dave McCracken

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> It has similar problems, IIRC with increasing i_shared_sem contention.

Agreed.

> But I think it solves the same issues as prio_tree,

Agreed.

> is simpler,

Agreed.

> is easier to fix up to do clever locking with.

I haven't thought about it fully, so I am not sure. But, it is likely
that locking is easier with list-of-lists.

[snip]

> diff -urpN -X /home/fletch/.diff.exclude 820-numa_large_pages/mm/mmap.c 830-list-of-lists/mm/mmap.c
> --- 820-numa_large_pages/mm/mmap.c Wed Jun 18 21:49:20 2003
> +++ 830-list-of-lists/mm/mmap.c Wed Jun 18 23:29:38 2003
> @@ -306,6 +306,56 @@ static void __vma_link_rb(struct mm_stru
> rb_insert_color(&vma->vm_rb, &mm->mm_rb);
> }
>
> +static void vma_add (struct vm_area_struct *vma,
> + struct list_head *range_list)
> +{
> + struct address_range *range;
> + struct list_head *prev, *next;
> + unsigned long start = vma->vm_pgoff;
> + unsigned long end = vma->vm_pgoff +
> + (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1);
> +
> + /* First, look for an existing range that matches ours */
> + prev = range_list;
> + list_for_each(next, range_list) {
> + range = list_entry(next, struct address_range, ranges);
> + if (range->start > start)
> + break; /* this list is sorted by start */
> + if ((range->start == start) && (range->end == end)) {
> + goto found;
> + }
> + prev = next;
> + }

Hmm.. We do a linear O(N) search for each vma added. If the range_list
has 1000 vmas, then it is really bad. Running Ingo's test-mmap3.c or
Andrew's rmap-test.c (check 3rd test Andrew did - single process,
10,000 different vmas - with different range->start and range->end)
will be slow.

The prio_tree patch optimizes these cases with O(log N) insert algorithm.

[snip]

> +static void vma_del (struct vm_area_struct *vma)
> +{
[snip]
> + next = vma->shared.next; /* stash the range list we're on */
> + list_del(&vma->shared); /* remove us from the list of vmas */
> + if (list_empty(next)) { /* we were the last vma for range */
> + range = list_entry(next, struct address_range, vmas);
> + list_del(&range->ranges);
> + kfree(range);
> + }
> +}

Agree that vma_del is much simpler.

> page_referenced_obj(struct page *page)
> {
[snip]
> + list_for_each_entry(range, &mapping->i_mmap, ranges) {
> + if (range->start > index)
> + break; /* Sorted by start address => we are done */
> + if (range->end < index)
> + continue;

Again O(N) search...

> + list_for_each_entry(vma, &range->vmas, shared)
> + referenced += page_referenced_obj_one(vma, page);
> + }


> @@ -512,7 +532,9 @@ static int
> try_to_unmap_obj(struct page *page)
> {
[snip]
> + list_for_each_entry(range, &mapping->i_mmap, ranges) {
> + if (range->start > index)
> + break; /* Sorted by start address => we are done */
> + if (range->end < index)
> + continue;

Here also an O(N) search when each vma maps a unique set of file pages...

Thanks for posting the code. Your old postings (almost a year ago)
regarding list-of-lists inspired me to develop prio_tree. Thanks.

Rajesh

2004-04-15 22:33:22

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, Apr 15, 2004 at 08:40:50AM -0700, Martin J. Bligh wrote:
> >> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> >> into a spinlock. I'll try doing the same on top of prio-tree next.
> >
> > Good news, though not a surprise.
> >
> > Any ideas how we might handle latency from vmtruncate (and
> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>
> I've been thinking about that. My rough plan is to go wild, naked and lockless.
> If we arrange things in the correct order, new entries onto the list would
> pick up the truncated image of the file (so they'd be OK). Entries removed
> from the list don't matter anyway. We just need to make sure that everything
> that was on the list when we start does get truncated.

entries removed must be freed with RCU, and that means vmas freed with
RCU, which means mm and pgd freed with RCU, and the whole VM will collapse
on you when you attempt that. I mean it goes all the way up to the
whole MM, not just the shared list of vmas.

2004-04-15 22:40:06

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, Apr 15, 2004 at 10:14:51AM -0700, Martin J. Bligh wrote:
> I still think my list-of-lists patch fixes the original problem, and is
> simpler ... I'll try to get it updated, and sent out.

it's a lot worse than the prio-tree IMHO: when a new range is generated
you have to loop over all the vmas etc... it's O(N) stuff for certain ops,
while prio-tree is O(log(N)) for all of them.

If your objective is to be able to use RCU (and implementing an RCU
prio-tree is going to be extremely complicated) you can attempt a
prio-skip-list: that would be a skip-list (which still provides O(log(N))
but uses lists everywhere), so that you can more easily create an
RCU-prio-skip-list - though I haven't even thought about whether the
range-lookup can be implemented reasonably easily on top of a skip-list
to create the prio-skip-list.

but even if we could create the RCU-prio-skip-list (that would solve all
the complexity issues just like the prio-tree, and it would allow lockless
lookups too [unlike prio-tree]) you'd still have to deal with the mess of
freeing vmas with RCU, which would force everything above the
vma to be freed with RCU too - mm, pgds etc. That would require quite
some changes, at the very least to be able to garbage collect the mm and pgd
from the vma free operations. I doubt it's worth it: for the fast path you
cannot go lockless anyway, the lockless walk is only for the readonly
operations, and the readonly ones are the unlikely ones (namely only
truncate and paging). So it's overdesign.