2004-04-04 12:35:53

by Hugh Dickins

Subject: [PATCH] anobjrmap 9 priority mjb tree

This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
of anon_vma, and of course each tree has its own additional features.
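
For anyone skimming the diff below: the conversion repeated at each flush,
truncate and rmap site follows one pattern, sketched here. This is only an
illustration, not part of the patch; the helper name count_vmas_mapping_page
is invented, and the caller is assumed to hold mapping->i_shared_sem just as
the real call sites do. It uses the __vma_prio_tree_first/__vma_prio_tree_next
helpers added to include/linux/mm.h to do a [pgoff, pgoff] stabbing query,
instead of walking the whole i_mmap_shared list and skipping vmas that do not
cover the page.

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Illustrative only: count the vmas that map one file page. */
static int count_vmas_mapping_page(struct address_space *mapping,
				   struct page *page)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int count = 0;

	/* caller holds mapping->i_shared_sem */
	vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
					&iter, pgoff, pgoff);
	while (vma) {
		/* vma->vm_pgoff <= pgoff <= vma's last file page here */
		count++;
		vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
					&iter, pgoff, pgoff);
	}
	return count;
}

Nonlinear vmas are not on either tree; they sit on the new i_mmap_nonlinear
list and still have to be walked in full.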

arch/arm/mm/fault-armv.c | 80 ++---
arch/mips/mm/cache.c | 9
arch/parisc/kernel/cache.c | 86 ++---
arch/parisc/kernel/sys_parisc.c | 14
arch/s390/kernel/compat_exec.c | 2
arch/sparc64/mm/init.c | 8
arch/x86_64/ia32/ia32_binfmt.c | 2
fs/exec.c | 2
fs/hugetlbfs/inode.c | 14
fs/inode.c | 5
fs/locks.c | 8
fs/proc/task_mmu.c | 2
fs/xfs/linux/xfs_vnode.h | 5
include/asm-arm/cacheflush.h | 8
include/asm-parisc/cacheflush.h | 10
include/asm-sh/pgalloc.h | 5
include/linux/fs.h | 6
include/linux/mm.h | 167 +++++++++++
include/linux/prio_tree.h | 78 +++++
init/main.c | 2
kernel/fork.c | 4
kernel/kexec.c | 2
mm/Makefile | 3
mm/filemap.c | 3
mm/fremap.c | 14
mm/memory.c | 15 -
mm/mmap.c | 100 +++---
mm/mremap.c | 42 ++
mm/page_io.c | 4
mm/prio_tree.c | 577 ++++++++++++++++++++++++++++++++++++++++
mm/rmap.c | 164 ++++++-----
mm/shmem.c | 3
mm/vmscan.c | 6
33 files changed, 1172 insertions(+), 278 deletions(-)

--- 2.6.5-rc3-mjb2/arch/arm/mm/fault-armv.c 2004-04-02 21:01:43.725406016 +0100
+++ anobjrmap9/arch/arm/mm/fault-armv.c 2004-04-04 13:05:41.369476056 +0100
@@ -16,6 +16,7 @@
#include <linux/bitops.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
+#include <linux/pagemap.h>

#include <asm/cacheflush.h>
#include <asm/io.h>
@@ -186,47 +187,47 @@ no_pmd:

void __flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
struct mm_struct *mm = current->active_mm;
- struct list_head *l;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;

__cpuc_flush_dcache_page(page_address(page));

- if (!page_mapping(page))
+ if (!mapping)
return;

/*
* With a VIVT cache, we need to also write back
* and invalidate any user data.
*/
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
*/
- if (mpnt->vm_mm != mm)
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
+ if (mpnt->vm_mm == mm) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
}

static void
make_coherent(struct vm_area_struct *vma, unsigned long addr, struct page *page, int dirty)
{
- struct list_head *l;
+ struct address_space *mapping = page->mapping;
struct mm_struct *mm = vma->vm_mm;
- unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;
int aliases = 0;

/*
@@ -234,36 +235,21 @@ make_coherent(struct vm_area_struct *vma
* space, then we need to handle them specially to maintain
* cache coherency.
*/
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
+ pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
- * Note that we intentionally don't mask out the VMA
+ * Note that we intentionally mask out the VMA
* that we are fixing up.
*/
- if (mpnt->vm_mm != mm || mpnt == vma)
- continue;
-
- /*
- * If the page isn't in this VMA, we can also ignore it.
- */
- if (pgoff < mpnt->vm_pgoff)
- continue;
-
- off = pgoff - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- off = mpnt->vm_start + (off << PAGE_SHIFT);
-
- /*
- * Ok, it is within mpnt. Fix it up.
- */
- aliases += adjust_pte(mpnt, off);
+ if (mpnt->vm_mm == mm && mpnt != vma) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
if (aliases)
adjust_pte(vma, addr);
--- 2.6.5-rc3-mjb2/arch/mips/mm/cache.c 2004-04-02 21:01:44.347311472 +0100
+++ anobjrmap9/arch/mips/mm/cache.c 2004-04-04 13:05:41.370475904 +0100
@@ -55,13 +55,14 @@ asmlinkage int sys_cacheflush(void *addr

void flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
unsigned long addr;

- if (page_mapping(page) &&
- list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (mapping &&
+ prio_tree_empty(&mapping->i_mmap) &&
+ prio_tree_empty(&mapping->i_mmap_shared) &&
+ list_empty(&mapping->i_mmap_nonlinear)) {
SetPageDcacheDirty(page);
-
return;
}

--- 2.6.5-rc3-mjb2/arch/parisc/kernel/cache.c 2004-04-02 21:01:44.356310104 +0100
+++ anobjrmap9/arch/parisc/kernel/cache.c 2004-04-04 13:05:41.371475752 +0100
@@ -17,6 +17,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/seq_file.h>
+#include <linux/pagemap.h>

#include <asm/pdc.h>
#include <asm/cache.h>
@@ -229,67 +230,60 @@ void disable_sr_hashing(void)

void __flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
struct mm_struct *mm = current->active_mm;
- struct list_head *l;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;

flush_kernel_dcache_page(page_address(page));

- if (!page_mapping(page))
+ if (!mapping)
return;
- /* check shared list first if it's not empty...it's usually
- * the shortest */
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;

- mpnt = list_entry(l, struct vm_area_struct, shared);
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

+ /* check shared list first if it's not empty...it's usually
+ * the shortest */
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
*/
- if (mpnt->vm_mm != mm)
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
-
- /* All user shared mappings should be equivalently mapped,
- * so once we've flushed one we should be ok
- */
- return;
+ if (mpnt->vm_mm == mm) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+
+ /* All user shared mappings should be equivalently
+ * mapped, so once we've flushed one we should be ok
+ */
+ return;
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}

/* then check private mapping list for read only shared mappings
* which are flagged by VM_MAYSHARE */
- list_for_each(l, &page->mapping->i_mmap) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
-
- if (mpnt->vm_mm != mm || !(mpnt->vm_flags & VM_MAYSHARE))
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
-
- /* All user shared mappings should be equivalently mapped,
- * so once we've flushed one we should be ok
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
+ /*
+ * If this VMA is not in our MM, we can ignore it.
*/
- break;
+ if (mpnt->vm_mm == mm && (mpnt->vm_flags & VM_MAYSHARE)) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+
+ /* All user shared mappings should be equivalently
+ * mapped, so once we've flushed one we should be ok
+ */
+ return;
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}
}
EXPORT_SYMBOL(__flush_dcache_page);
--- 2.6.5-rc3-mjb2/arch/parisc/kernel/sys_parisc.c 2004-03-30 13:04:00.000000000 +0100
+++ anobjrmap9/arch/parisc/kernel/sys_parisc.c 2004-04-04 13:05:41.372475600 +0100
@@ -68,17 +68,8 @@ static unsigned long get_unshared_area(u
* existing mapping and use the same offset. New scheme is to use the
* address of the kernel data structure as the seed for the offset.
* We'll see how that works...
- */
-#if 0
-static int get_offset(struct address_space *mapping)
-{
- struct vm_area_struct *vma = list_entry(mapping->i_mmap_shared.next,
- struct vm_area_struct, shared);
- return (vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT)) &
- (SHMLBA - 1);
-}
-#else
-/* The mapping is cacheline aligned, so there's no information in the bottom
+ *
+ * The mapping is cacheline aligned, so there's no information in the bottom
* few bits of the address. We're looking for 10 bits (4MB / 4k), so let's
* drop the bottom 8 bits and use bits 8-17.
*/
@@ -87,7 +78,6 @@ static int get_offset(struct address_spa
int offset = (unsigned long) mapping << (PAGE_SHIFT - 8);
return offset & 0x3FF000;
}
-#endif

static unsigned long get_shared_area(struct address_space *mapping,
unsigned long addr, unsigned long len, unsigned long pgoff)
--- 2.6.5-rc3-mjb2/arch/s390/kernel/compat_exec.c 2003-07-10 21:16:28.000000000 +0100
+++ anobjrmap9/arch/s390/kernel/compat_exec.c 2004-04-04 13:05:41.372475600 +0100
@@ -71,7 +71,7 @@ int setup_arg_pages32(struct linux_binpr
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/arch/sparc64/mm/init.c 2004-04-02 21:01:44.535282896 +0100
+++ anobjrmap9/arch/sparc64/mm/init.c 2004-04-04 13:05:41.375475144 +0100
@@ -224,12 +224,14 @@ void update_mmu_cache(struct vm_area_str

void flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
int dirty = test_bit(PG_dcache_dirty, &page->flags);
int dirty_cpu = dcache_dirty_cpu(page);

- if (page_mapping(page) &&
- list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (mapping &&
+ prio_tree_empty(&mapping->i_mmap) &&
+ prio_tree_empty(&mapping->i_mmap_shared) &&
+ list_empty(&mapping->i_mmap_nonlinear)) {
if (dirty) {
if (dirty_cpu == smp_processor_id())
return;
--- 2.6.5-rc3-mjb2/arch/x86_64/ia32/ia32_binfmt.c 2004-03-30 13:04:03.000000000 +0100
+++ anobjrmap9/arch/x86_64/ia32/ia32_binfmt.c 2004-04-04 13:05:41.375475144 +0100
@@ -360,7 +360,7 @@ int setup_arg_pages(struct linux_binprm
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/fs/exec.c 2004-04-02 21:01:45.785092896 +0100
+++ anobjrmap9/fs/exec.c 2004-04-04 13:05:41.377474840 +0100
@@ -423,7 +423,7 @@ int setup_arg_pages(struct linux_binprm
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/fs/hugetlbfs/inode.c 2004-04-02 21:01:45.794091528 +0100
+++ anobjrmap9/fs/hugetlbfs/inode.c 2004-04-04 13:05:41.378474688 +0100
@@ -270,11 +270,13 @@ static void hugetlbfs_drop_inode(struct
* vma->vm_pgoff is in PAGE_SIZE units.
*/
static void
-hugetlb_vmtruncate_list(struct list_head *list, unsigned long h_pgoff)
+hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
{
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;

- list_for_each_entry(vma, list, shared) {
+ vma = __vma_prio_tree_first(root, &iter, h_pgoff, ULONG_MAX);
+ while (vma) {
unsigned long h_vm_pgoff;
unsigned long v_length;
unsigned long h_length;
@@ -306,6 +308,8 @@ hugetlb_vmtruncate_list(struct list_head
zap_hugepage_range(vma,
vma->vm_start + v_offset,
v_length - v_offset);
+
+ vma = __vma_prio_tree_next(vma, root, &iter, h_pgoff, ULONG_MAX);
}
}

@@ -325,9 +329,11 @@ static int hugetlb_vmtruncate(struct ino

inode->i_size = offset;
down(&mapping->i_shared_sem);
- if (!list_empty(&mapping->i_mmap))
+ /* Protect against page fault */
+ atomic_inc(&mapping->truncate_count);
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
- if (!list_empty(&mapping->i_mmap_shared))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
up(&mapping->i_shared_sem);
truncate_hugepages(mapping, offset);
--- 2.6.5-rc3-mjb2/fs/inode.c 2004-03-30 13:04:15.000000000 +0100
+++ anobjrmap9/fs/inode.c 2004-04-04 13:05:41.380474384 +0100
@@ -189,8 +189,9 @@ void inode_init_once(struct inode *inode
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
- INIT_LIST_HEAD(&inode->i_data.i_mmap);
- INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
+ INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap_shared);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
spin_lock_init(&inode->i_lock);
i_size_ordered_init(inode);
}
--- 2.6.5-rc3-mjb2/fs/locks.c 2004-03-11 01:56:12.000000000 +0000
+++ anobjrmap9/fs/locks.c 2004-04-04 13:05:41.382474080 +0100
@@ -1455,8 +1455,8 @@ int fcntl_setlk(struct file *filp, unsig
if (IS_MANDLOCK(inode) &&
(inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
struct address_space *mapping = filp->f_mapping;
-
- if (!list_empty(&mapping->i_mmap_shared)) {
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear)) {
error = -EAGAIN;
goto out;
}
@@ -1593,8 +1593,8 @@ int fcntl_setlk64(struct file *filp, uns
if (IS_MANDLOCK(inode) &&
(inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
struct address_space *mapping = filp->f_mapping;
-
- if (!list_empty(&mapping->i_mmap_shared)) {
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear)) {
error = -EAGAIN;
goto out;
}
--- 2.6.5-rc3-mjb2/fs/proc/task_mmu.c 2004-04-02 21:01:45.864080888 +0100
+++ anobjrmap9/fs/proc/task_mmu.c 2004-04-04 13:05:41.383473928 +0100
@@ -65,7 +65,7 @@ int task_statm(struct mm_struct *mm, int
*shared += pages;
continue;
}
- if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared))
+ if (vma->vm_flags & VM_SHARED || !vma_shared_empty(vma))
*shared += pages;
if (vma->vm_flags & VM_EXECUTABLE)
*text += pages;
--- 2.6.5-rc3-mjb2/fs/xfs/linux/xfs_vnode.h 2004-02-04 02:45:43.000000000 +0000
+++ anobjrmap9/fs/xfs/linux/xfs_vnode.h 2004-04-04 13:05:41.384473776 +0100
@@ -597,8 +597,9 @@ static __inline__ void vn_flagclr(struct
* Some useful predicates.
*/
#define VN_MAPPED(vp) \
- (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
- (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
+ (!prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
+ !prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)) || \
+ !list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_nonlinear)))
#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages)
#define VN_DIRTY(vp) (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED)
--- 2.6.5-rc3-mjb2/include/asm-arm/cacheflush.h 2004-04-02 21:01:45.998060520 +0100
+++ anobjrmap9/include/asm-arm/cacheflush.h 2004-04-04 13:05:41.385473624 +0100
@@ -292,8 +292,12 @@ flush_cache_page(struct vm_area_struct *
* about to change to user space. This is the same method as used on SPARC64.
* See update_mmu_cache for the user space part.
*/
-#define mapping_mapped(map) (!list_empty(&(map)->i_mmap) || \
- !list_empty(&(map)->i_mmap_shared))
+static inline int mapping_mapped(struct address_space *mapping)
+{
+ return !prio_tree_empty(&mapping->i_mmap) ||
+ !prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear);
+}

extern void __flush_dcache_page(struct page *);

--- 2.6.5-rc3-mjb2/include/asm-parisc/cacheflush.h 2004-04-02 21:01:46.794939376 +0100
+++ anobjrmap9/include/asm-parisc/cacheflush.h 2004-04-04 13:05:41.385473624 +0100
@@ -65,12 +65,18 @@ flush_user_icache_range(unsigned long st
#endif
}

+static inline int mapping_mapped(struct address_space *mapping)
+{
+ return !prio_tree_empty(&mapping->i_mmap) ||
+ !prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear);
+}
+
extern void __flush_dcache_page(struct page *page);

static inline void flush_dcache_page(struct page *page)
{
- if (page_mapping(page) && list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (page_mapping(page) && !mapping_mapped(page->mapping)) {
set_bit(PG_dcache_dirty, &page->flags);
} else {
__flush_dcache_page(page);
--- 2.6.5-rc3-mjb2/include/asm-sh/pgalloc.h 2004-04-02 21:01:46.963913688 +0100
+++ anobjrmap9/include/asm-sh/pgalloc.h 2004-04-04 13:05:41.386473472 +0100
@@ -101,8 +101,9 @@ static inline pte_t ptep_get_and_clear(p
unsigned long pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
- if (!page_mapping(page)
- || list_empty(&page->mapping->i_mmap_shared))
+ if (!page_mapping(page) ||
+ (prio_tree_empty(&page->mapping->i_mmap_shared) &&
+ list_empty(&page->mapping->i_mmap_nonlinear)))
__clear_bit(PG_mapped, &page->flags);
}
}
--- 2.6.5-rc3-mjb2/include/linux/fs.h 2004-03-30 13:04:18.000000000 +0100
+++ anobjrmap9/include/linux/fs.h 2004-04-04 13:05:41.388473168 +0100
@@ -18,6 +18,7 @@
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/radix-tree.h>
+#include <linux/prio_tree.h>
#include <linux/kobject.h>
#include <asm/atomic.h>

@@ -329,8 +330,9 @@ struct address_space {
struct list_head io_pages; /* being prepared for I/O */
unsigned long nrpages; /* number of total pages */
struct address_space_operations *a_ops; /* methods */
- struct list_head i_mmap; /* list of private mappings */
- struct list_head i_mmap_shared; /* list of shared mappings */
+ struct prio_tree_root i_mmap; /* tree of private mappings */
+ struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
+ struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
struct semaphore i_shared_sem; /* protect both above lists */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
--- 2.6.5-rc3-mjb2/include/linux/mm.h 2004-04-02 21:01:47.316860032 +0100
+++ anobjrmap9/include/linux/mm.h 2004-04-04 13:05:41.390472864 +0100
@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
+#include <linux/prio_tree.h>
#include <linux/fs.h>

#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
@@ -68,7 +69,26 @@ struct vm_area_struct {
* one of the address_space->i_mmap{,shared} lists,
* for shm areas, the list of attaches, otherwise unused.
*/
- struct list_head shared;
+ union {
+ struct {
+ struct list_head list;
+ void *parent;
+ } vm_set;
+
+ struct prio_tree_node prio_tree_node;
+
+ struct {
+ void *first;
+ void *second;
+ void *parent;
+ } both;
+ } shared;
+
+ /*
+ * shared.vm_set : list of vmas that map exactly the same set of pages
+ * vm_set_head : head of the vm_set list
+ */
+ struct vm_area_struct *vm_set_head;

/* Function pointers to deal with this struct. */
struct vm_operations_struct * vm_ops;
@@ -130,6 +150,150 @@ struct vm_area_struct {
#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ)

/*
+ * The following macros are used for implementing prio_tree for i_mmap{,_shared}
+ */
+
+#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
+#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
+/* avoid overflow */
+#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
+
+#define GET_INDEX_VMA(vma, radix, heap) \
+do { \
+ radix = RADIX_INDEX(vma); \
+ heap = HEAP_INDEX(vma); \
+} while (0)
+
+#define GET_INDEX(node, radix, heap) \
+do { \
+ struct vm_area_struct *__tmp = \
+ prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\
+ GET_INDEX_VMA(__tmp, radix, heap); \
+} while (0)
+
+#define INIT_VMA_SHARED_LIST(vma) \
+do { \
+ INIT_LIST_HEAD(&(vma)->shared.vm_set.list); \
+ (vma)->shared.vm_set.parent = NULL; \
+ (vma)->vm_set_head = NULL; \
+} while (0)
+
+#define INIT_VMA_SHARED(vma) \
+do { \
+ (vma)->shared.both.first = NULL; \
+ (vma)->shared.both.second = NULL; \
+ (vma)->shared.both.parent = NULL; \
+ (vma)->vm_set_head = NULL; \
+} while (0)
+
+extern void __vma_prio_tree_insert(struct prio_tree_root *,
+ struct vm_area_struct *);
+
+extern void __vma_prio_tree_remove(struct prio_tree_root *,
+ struct vm_area_struct *);
+
+static inline int vma_shared_empty(struct vm_area_struct *vma)
+{
+ return vma->shared.both.first == NULL;
+}
+
+/*
+ * Helps to add a new vma that maps the same (identical) set of pages as the
+ * old vma to an i_mmap tree.
+ */
+static inline void __vma_prio_tree_add(struct vm_area_struct *vma,
+ struct vm_area_struct *old)
+{
+ INIT_VMA_SHARED_LIST(vma);
+
+ /* Leave these BUG_ONs till prio_tree patch stabilizes */
+ BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
+ BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
+
+ if (old->shared.both.parent) {
+ if (old->vm_set_head) {
+ list_add_tail(&vma->shared.vm_set.list,
+ &old->vm_set_head->shared.vm_set.list);
+ return;
+ }
+ else {
+ old->vm_set_head = vma;
+ vma->vm_set_head = old;
+ }
+ }
+ else
+ list_add(&vma->shared.vm_set.list, &old->shared.vm_set.list);
+}
+
+/*
+ * We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
+ * already present in an i_mmap{_shared} tree without modifying the tree. The
+ * following helper function should be used when such modifications are
+ * necessary. We should hold the mapping's i_shared_sem.
+ *
+ * This function can be (micro)optimized for some special cases (maybe later).
+ */
+static inline void __vma_modify(struct prio_tree_root *root,
+ struct vm_area_struct *vma, unsigned long start, unsigned long end,
+ unsigned long pgoff)
+{
+ if (root)
+ __vma_prio_tree_remove(root, vma);
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ if (root)
+ __vma_prio_tree_insert(root, vma);
+}
+
+/*
+ * Helper functions to enumerate vmas that map a given file page or a set of
+ * contiguous file pages. The functions return vmas that at least map a single
+ * page in the given range of contiguous file pages.
+ */
+static inline struct vm_area_struct *__vma_prio_tree_first(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+
+ ptr = prio_tree_first(root, iter, begin, end);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ if (vma->shared.both.parent) {
+ if (vma->vm_set_head)
+ return vma->vm_set_head;
+ }
+ else {
+ next = list_entry(vma->shared.vm_set.list.next,
+ struct vm_area_struct, shared.vm_set.list);
+ if (!(next->vm_set_head))
+ return next;
+ }
+
+ ptr = prio_tree_next(root, iter, begin, end);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
*/
@@ -520,7 +684,6 @@ extern void __vma_link_rb(struct mm_stru
struct rb_node **, struct rb_node *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
unsigned long addr, unsigned long len, unsigned long pgoff);
-extern void vma_relink_file(struct vm_area_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
--- 2.6.5-rc3-mjb2/include/linux/prio_tree.h 1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap9/include/linux/prio_tree.h 2004-04-04 13:05:41.391472712 +0100
@@ -0,0 +1,78 @@
+#ifndef _LINUX_PRIO_TREE_H
+#define _LINUX_PRIO_TREE_H
+
+struct prio_tree_node {
+ struct prio_tree_node *left;
+ struct prio_tree_node *right;
+ struct prio_tree_node *parent;
+};
+
+struct prio_tree_root {
+ struct prio_tree_node *prio_tree_node;
+ unsigned int index_bits;
+};
+
+struct prio_tree_iter {
+ struct prio_tree_node *cur;
+ unsigned long mask;
+ unsigned long value;
+ int size_level;
+};
+
+#define PRIO_TREE_ROOT (struct prio_tree_root) {NULL, 1}
+
+#define PRIO_TREE_ROOT_INIT {NULL, 1}
+
+#define INIT_PRIO_TREE_ROOT(ptr) \
+do { \
+ (ptr)->prio_tree_node = NULL; \
+ (ptr)->index_bits = 1; \
+} while (0)
+
+#define PRIO_TREE_NODE_INIT(name) {&(name), &(name), &(name)}
+
+#define PRIO_TREE_NODE(name) \
+ struct prio_tree_node name = PRIO_TREE_NODE_INIT(name)
+
+#define INIT_PRIO_TREE_NODE(ptr) \
+do { \
+ (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \
+} while (0)
+
+#define prio_tree_entry(ptr, type, member) \
+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+#define PRIO_TREE_ITER (struct prio_tree_iter) {NULL, 0UL, 0UL, 0}
+
+static inline int prio_tree_empty(const struct prio_tree_root *root)
+{
+ return root->prio_tree_node == NULL;
+}
+
+static inline int prio_tree_root(const struct prio_tree_node *node)
+{
+ return node->parent == node;
+}
+
+static inline int prio_tree_left_empty(const struct prio_tree_node *node)
+{
+ return node->left == node;
+}
+
+static inline int prio_tree_right_empty(const struct prio_tree_node *node)
+{
+ return node->right == node;
+}
+
+extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *,
+ struct prio_tree_node *);
+
+extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);
+
+extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *,
+ struct prio_tree_iter *, unsigned long, unsigned long);
+
+extern struct prio_tree_node *prio_tree_next(struct prio_tree_root *,
+ struct prio_tree_iter *, unsigned long, unsigned long);
+
+#endif
--- 2.6.5-rc3-mjb2/init/main.c 2004-04-02 21:01:47.345855624 +0100
+++ anobjrmap9/init/main.c 2004-04-04 13:05:41.392472560 +0100
@@ -85,6 +85,7 @@ extern void buffer_init(void);
extern void pidhash_init(void);
extern void pidmap_init(void);
extern void radix_tree_init(void);
+extern void prio_tree_init(void);
extern void free_initmem(void);
extern void populate_rootfs(void);
extern void driver_init(void);
@@ -466,6 +467,7 @@ asmlinkage void __init start_kernel(void
calibrate_delay();
pidmap_init();
pgtable_cache_init();
+ prio_tree_init();
#ifdef CONFIG_X86
if (efi_enabled)
efi_enter_virtual_mode();
--- 2.6.5-rc3-mjb2/kernel/fork.c 2004-04-02 21:01:47.350854864 +0100
+++ anobjrmap9/kernel/fork.c 2004-04-04 13:05:41.394472256 +0100
@@ -314,7 +314,7 @@ static inline int dup_mmap(struct mm_str
tmp->vm_mm = mm;
tmp->vm_next = NULL;
file = tmp->vm_file;
- INIT_LIST_HEAD(&tmp->shared);
+ INIT_VMA_SHARED(tmp);
if (file) {
struct inode *inode = file->f_dentry->d_inode;
get_file(file);
@@ -323,7 +323,7 @@ static inline int dup_mmap(struct mm_str

/* insert tmp into the share list, just after mpnt */
down(&file->f_mapping->i_shared_sem);
- list_add(&tmp->shared, &mpnt->shared);
+ __vma_prio_tree_add(tmp, mpnt);
up(&file->f_mapping->i_shared_sem);
}

--- 2.6.5-rc3-mjb2/kernel/kexec.c 2004-04-02 21:01:47.354854256 +0100
+++ anobjrmap9/kernel/kexec.c 2004-04-04 13:05:41.395472104 +0100
@@ -191,7 +191,7 @@ static int identity_map_pages(struct pag
vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
vma->vm_file = NULL;
vma->vm_private_data = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);
insert_vm_struct(mm, vma);

error = remap_page_range(vma, vma->vm_start, vma->vm_start,
--- 2.6.5-rc3-mjb2/mm/Makefile 2004-04-02 21:01:47.392848480 +0100
+++ anobjrmap9/mm/Makefile 2004-04-04 13:05:41.395472104 +0100
@@ -9,7 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o

obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o readahead.o \
- slab.o swap.o truncate.o vmscan.o $(mmu-y)
+ slab.o swap.o truncate.o vmscan.o prio_tree.o \
+ $(mmu-y)

obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_X86_4G) += usercopy.o
--- 2.6.5-rc3-mjb2/mm/filemap.c 2004-04-02 21:01:47.395848024 +0100
+++ anobjrmap9/mm/filemap.c 2004-04-04 13:05:41.397471800 +0100
@@ -632,7 +632,8 @@ page_ok:
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear))
flush_dcache_page(page);

/*
--- 2.6.5-rc3-mjb2/mm/fremap.c 2004-04-02 21:01:47.397847720 +0100
+++ anobjrmap9/mm/fremap.c 2004-04-04 13:05:41.398471648 +0100
@@ -157,6 +157,8 @@ asmlinkage long sys_remap_file_pages(uns
unsigned long __prot, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct address_space *mapping;
+ unsigned long linear_pgoff;
unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
@@ -197,8 +199,18 @@ asmlinkage long sys_remap_file_pages(uns
end <= vma->vm_end) {

/* Must set VM_NONLINEAR before any pages are populated. */
- if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
+ linear_pgoff = vma->vm_pgoff;
+ linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT);
+ if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) {
+ mapping = vma->vm_file->f_mapping;
+ down(&mapping->i_shared_sem);
vma->vm_flags |= VM_NONLINEAR;
+ __vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
+ INIT_VMA_SHARED_LIST(vma);
+ list_add_tail(&vma->shared.vm_set.list,
+ &mapping->i_mmap_nonlinear);
+ up(&mapping->i_shared_sem);
+ }

/* ->populate can take a long time, so downgrade the lock. */
downgrade_write(&mm->mmap_sem);
--- 2.6.5-rc3-mjb2/mm/memory.c 2004-04-02 21:01:47.402846960 +0100
+++ anobjrmap9/mm/memory.c 2004-04-04 13:05:41.400471344 +0100
@@ -1077,11 +1077,11 @@ no_new_page:
* An hlen of zero blows away the entire portion file after hba.
*/
static void
-invalidate_mmap_range_list(struct list_head *head,
+invalidate_mmap_range_list(struct prio_tree_root *root,
unsigned long const hba,
unsigned long const hlen)
{
- struct list_head *curr;
+ struct prio_tree_iter iter;
unsigned long hea; /* last page of hole. */
unsigned long vba;
unsigned long vea; /* last page of corresponding uva hole. */
@@ -1092,17 +1092,16 @@ invalidate_mmap_range_list(struct list_h
hea = hba + hlen - 1; /* avoid overflow. */
if (hea < hba)
hea = ULONG_MAX;
- list_for_each(curr, head) {
- vp = list_entry(curr, struct vm_area_struct, shared);
+ vp = __vma_prio_tree_first(root, &iter, hba, hea);
+ while(vp) {
vba = vp->vm_pgoff;
vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1;
- if (hea < vba || vea < hba)
- continue; /* Mapping disjoint from hole. */
zba = (hba <= vba) ? vba : hba;
zea = (vea <= hea) ? vea : hea;
zap_page_range(vp,
((zba - vba) << PAGE_SHIFT) + vp->vm_start,
(zea - zba + 1) << PAGE_SHIFT);
+ vp = __vma_prio_tree_next(vp, root, &iter, hba, hea);
}
}

@@ -1137,9 +1136,9 @@ void invalidate_mmap_range(struct addres
down(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
- if (unlikely(!list_empty(&mapping->i_mmap)))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
- if (unlikely(!list_empty(&mapping->i_mmap_shared)))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
up(&mapping->i_shared_sem);
}
--- 2.6.5-rc3-mjb2/mm/mmap.c 2004-04-02 21:01:47.406846352 +0100
+++ anobjrmap9/mm/mmap.c 2004-04-04 13:05:41.403470888 +0100
@@ -68,12 +68,20 @@ int mmap_hugepages_map_sz = 256;
* Requires inode->i_mapping->i_shared_sem
*/
static inline void
-__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
+__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode,
+ struct address_space * mapping)
{
if (inode) {
if (vma->vm_flags & VM_DENYWRITE)
atomic_inc(&inode->i_writecount);
- list_del_init(&vma->shared);
+ if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+ list_del_init(&vma->shared.vm_set.list);
+ INIT_VMA_SHARED(vma);
+ }
+ else if (vma->vm_flags & VM_SHARED)
+ __vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
+ else
+ __vma_prio_tree_remove(&mapping->i_mmap, vma);
}
}

@@ -87,7 +95,8 @@ static void remove_shared_vm_struct(stru
if (file) {
struct address_space *mapping = file->f_mapping;
down(&mapping->i_shared_sem);
- __remove_shared_vm_struct(vma, file->f_dentry->d_inode);
+ __remove_shared_vm_struct(vma, file->f_dentry->d_inode,
+ mapping);
up(&mapping->i_shared_sem);
}
}
@@ -261,10 +270,15 @@ static inline void __vma_link_file(struc
if (vma->vm_flags & VM_DENYWRITE)
atomic_dec(&file->f_dentry->d_inode->i_writecount);

- if (vma->vm_flags & VM_SHARED)
- list_add_tail(&vma->shared, &mapping->i_mmap_shared);
+ if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+ INIT_VMA_SHARED_LIST(vma);
+ list_add_tail(&vma->shared.vm_set.list,
+ &mapping->i_mmap_nonlinear);
+ }
+ else if (vma->vm_flags & VM_SHARED)
+ __vma_prio_tree_insert(&mapping->i_mmap_shared, vma);
else
- list_add_tail(&vma->shared, &mapping->i_mmap);
+ __vma_prio_tree_insert(&mapping->i_mmap, vma);
}
}

@@ -393,7 +407,9 @@ static struct vm_area_struct *vma_merge(
{
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
+ struct address_space *mapping = file ? file->f_mapping : NULL;
struct semaphore *i_shared_sem;
+ struct prio_tree_root *root = NULL;

/*
* We later require that vma->vm_flags == vm_flags, so this tests
@@ -404,6 +420,15 @@ static struct vm_area_struct *vma_merge(

i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;

+ if (mapping) {
+ if (vm_flags & VM_SHARED) {
+ if (likely(!(vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ }
+
if (!prev) {
prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
goto merge_next;
@@ -423,18 +448,18 @@ static struct vm_area_struct *vma_merge(
need_up = 1;
}
spin_lock(lock);
- prev->vm_end = end;

/*
* OK, it did. Can we now merge in the successor as well?
*/
next = prev->vm_next;
- if (next && prev->vm_end == next->vm_start &&
+ if (next && end == next->vm_start &&
can_vma_merge_before(next, vm_flags, file,
pgoff, (end - addr) >> PAGE_SHIFT)) {
- prev->vm_end = next->vm_end;
__vma_unlink(mm, next, prev);
- __remove_shared_vm_struct(next, inode);
+ __vma_modify(root, prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff);
+ __remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
if (need_up)
up(i_shared_sem);
@@ -445,6 +470,8 @@ static struct vm_area_struct *vma_merge(
kmem_cache_free(vm_area_cachep, next);
return prev;
}
+
+ __vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
if (need_up)
up(i_shared_sem);
@@ -464,8 +491,8 @@ static struct vm_area_struct *vma_merge(
if (file)
down(i_shared_sem);
spin_lock(lock);
- prev->vm_start = addr;
- prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
+ __vma_modify(root, prev, addr, prev->vm_end,
+ prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
if (file)
up(i_shared_sem);
@@ -698,7 +725,7 @@ munmap_back:
vma->vm_file = NULL;
vma->vm_private_data = NULL;
vma->vm_next = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);

if (file) {
error = -EINVAL;
@@ -1289,6 +1316,7 @@ int split_vma(struct mm_struct * mm, str
{
struct vm_area_struct *new;
struct address_space *mapping = NULL;
+ struct prio_tree_root *root = NULL;

if (mm->map_count >= MAX_MAP_COUNT)
return -ENOMEM;
@@ -1300,7 +1328,7 @@ int split_vma(struct mm_struct * mm, str
/* most fields are the same, copy all, and then fixup */
*new = *vma;

- INIT_LIST_HEAD(&new->shared);
+ INIT_VMA_SHARED(new);

if (new_below)
new->vm_end = addr;
@@ -1315,18 +1343,25 @@ int split_vma(struct mm_struct * mm, str
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);

- if (vma->vm_file)
+ if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
+ if (vma->vm_flags & VM_SHARED) {
+ if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ }

if (mapping)
down(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);

- if (new_below) {
- vma->vm_start = addr;
- vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
- } else
- vma->vm_end = addr;
+ if (new_below)
+ __vma_modify(root, vma, addr, vma->vm_end,
+ vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT));
+ else
+ __vma_modify(root, vma, vma->vm_start, addr, vma->vm_pgoff);

__insert_vm_struct(mm, new);

@@ -1499,7 +1534,7 @@ unsigned long do_brk(unsigned long addr,
vma->vm_pgoff = 0;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);

vma_link(mm, vma, prev, rb_link, rb_parent);

@@ -1597,7 +1632,7 @@ struct vm_area_struct *copy_vma(struct v
new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (new_vma) {
*new_vma = *vma;
- INIT_LIST_HEAD(&new_vma->shared);
+ INIT_VMA_SHARED(new_vma);
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
@@ -1610,24 +1645,3 @@ struct vm_area_struct *copy_vma(struct v
}
return new_vma;
}
-
-/*
- * Position vma after prev in shared file list:
- * for mremap move error recovery racing against vmtruncate.
- */
-void vma_relink_file(struct vm_area_struct *vma, struct vm_area_struct *prev)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct address_space *mapping;
-
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- if (mapping) {
- down(&mapping->i_shared_sem);
- spin_lock(&mm->page_table_lock);
- list_move(&vma->shared, &prev->shared);
- spin_unlock(&mm->page_table_lock);
- up(&mapping->i_shared_sem);
- }
- }
-}
--- 2.6.5-rc3-mjb2/mm/mremap.c 2004-04-02 21:01:47.409845896 +0100
+++ anobjrmap9/mm/mremap.c 2004-04-04 13:05:41.405470584 +0100
@@ -237,6 +237,7 @@ static int move_page_tables(struct vm_ar
* only a few pages.. This also makes error recovery easier.
*/
while (offset < len) {
+ cond_resched();
ret = move_one_page(vma, old_addr+offset, new_addr+offset);
if (!ret) {
offset += PAGE_SIZE;
@@ -266,6 +267,7 @@ static unsigned long move_vma(struct vm_
unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping = NULL;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
@@ -285,26 +287,31 @@ static unsigned long move_vma(struct vm_
if (!new_vma)
return -ENOMEM;

+ if (vma->vm_file) {
+ /*
+ * Subtle point from Rajesh Venkatasubramanian: before
+ * moving file-based ptes, we must lock vmtruncate out,
+ * since it might clean the dst vma before the src vma,
+ * and we propagate stale pages into the dst afterward.
+ */
+ mapping = vma->vm_file->f_mapping;
+ down(&mapping->i_shared_sem);
+ }
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
- *
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, move new_vma before old vma
- * in the i_mmap or i_mmap_shared list, so when racing
- * against vmtruncate we cannot propagate pages to be
- * truncated back from new_vma into just cleaned old.
*/
- vma_relink_file(vma, new_vma);
move_page_tables(new_vma, old_addr, new_addr, moved_len);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
}
+ if (mapping)
+ up(&mapping->i_shared_sem);

/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -351,6 +358,8 @@ unsigned long do_mremap(unsigned long ad
unsigned long flags, unsigned long new_addr)
{
struct vm_area_struct *vma;
+ struct address_space *mapping = NULL;
+ struct prio_tree_root *root = NULL;
unsigned long ret = -EINVAL;
unsigned long charged = 0;

@@ -458,9 +467,26 @@ unsigned long do_mremap(unsigned long ad
/* can we just expand the current mapping? */
if (max_addr - addr >= new_len) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
+
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ if (vma->vm_flags & VM_SHARED) {
+ if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ down(&mapping->i_shared_sem);
+ }
+
spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_end = addr + new_len;
+ __vma_modify(root, vma, vma->vm_start,
+ addr + new_len, vma->vm_pgoff);
spin_unlock(&vma->vm_mm->page_table_lock);
+
+ if (mapping)
+ up(&mapping->i_shared_sem);
+
current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
current->mm->locked_vm += pages;
--- 2.6.5-rc3-mjb2/mm/page_io.c 2004-04-02 21:01:47.416844832 +0100
+++ anobjrmap9/mm/page_io.c 2004-04-04 13:05:41.405470584 +0100
@@ -135,12 +135,14 @@ out:
int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
{
int ret;
+ unsigned long save_private;
struct writeback_control swap_wbc = {
.sync_mode = WB_SYNC_ALL,
};

lock_page(page);
SetPageSwapCache(page);
+ save_private = page->private;
page->private = entry.val;

if (rw == READ) {
@@ -150,7 +152,9 @@ int rw_swap_page_sync(int rw, swp_entry_
ret = swap_writepage(page, &swap_wbc);
wait_on_page_writeback(page);
}
+
ClearPageSwapCache(page);
+ page->private = save_private;
if (ret == 0 && (!PageUptodate(page) || PageError(page)))
ret = -EIO;
return ret;
--- 2.6.5-rc3-mjb2/mm/prio_tree.c 1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap9/mm/prio_tree.c 2004-04-04 13:05:41.408470128 +0100
@@ -0,0 +1,577 @@
+/*
+ * mm/prio_tree.c - priority search tree for mapping->i_mmap{,_shared}
+ *
+ * Copyright (C) 2004, Rajesh Venkatasubramanian <[email protected]>
+ *
+ * Based on the radix priority search tree proposed by Edward M. McCreight
+ * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
+ *
+ * 02Feb2004 Initial version
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/prio_tree.h>
+
+/*
+ * A clever mix of heap and radix trees forms a radix priority search tree (PST)
+ * which is useful for storing intervals, e.g., we can consider a vma as a closed
+ * interval of file pages [offset_begin, offset_end], and store all vmas that
+ * map a file in a PST. Then, using the PST, we can answer a stabbing query,
+ * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
+ * given input interval X (a set of consecutive file pages), in "O(log n + m)"
+ * time where 'log n' is the height of the PST, and 'm' is the number of stored
+ * intervals (vmas) that overlap (map) with the input interval X (the set of
+ * consecutive file pages).
+ *
+ * In our implementation, we store closed intervals of the form [radix_index,
+ * heap_index]. We always assume that radix_index <= heap_index. McCreight's PST
+ * is designed for storing intervals with unique radix indices, i.e., each
+ * interval has a different radix_index. However, this limitation can be easily
+ * overcome by using the size, i.e., heap_index - radix_index, as part of the
+ * index, so we index the tree using [(radix_index,size), heap_index].
+ *
+ * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
+ * machine, the maximum height of a PST can be 64. We can use a balanced version
+ * of the priority search tree to optimize the tree height, but the balanced
+ * tree proposed by McCreight is too complex and memory-hungry for our purpose.
+ */
+
+static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
+
+/*
+ * Maximum heap_index that can be stored in a PST with index_bits bits
+ */
+static inline unsigned long prio_tree_maxindex(unsigned int bits)
+{
+ return index_bits_to_maxindex[bits - 1];
+}
+
+/*
+ * Extend a priority search tree so that it can store a node with heap_index
+ * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
+ * However, this function is used rarely and the common case performance is
+ * not bad.
+ */
+static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
+ struct prio_tree_node *node, unsigned long max_heap_index)
+{
+ struct prio_tree_node *first = NULL, *prev, *last = NULL;
+
+ if (max_heap_index > prio_tree_maxindex(root->index_bits))
+ root->index_bits++;
+
+ while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
+ root->index_bits++;
+
+ if (prio_tree_empty(root))
+ continue;
+
+ if (first == NULL) {
+ first = root->prio_tree_node;
+ prio_tree_remove(root, root->prio_tree_node);
+ INIT_PRIO_TREE_NODE(first);
+ last = first;
+ }
+ else {
+ prev = last;
+ last = root->prio_tree_node;
+ prio_tree_remove(root, root->prio_tree_node);
+ INIT_PRIO_TREE_NODE(last);
+ prev->left = last;
+ last->parent = prev;
+ }
+ }
+
+ INIT_PRIO_TREE_NODE(node);
+
+ if (first) {
+ node->left = first;
+ first->parent = node;
+ }
+ else
+ last = node;
+
+ if (!prio_tree_empty(root)) {
+ last->left = root->prio_tree_node;
+ last->left->parent = last;
+ }
+
+ root->prio_tree_node = node;
+ return node;
+}
+
+/*
+ * Replace a prio_tree_node with a new node and return the old node
+ */
+static inline struct prio_tree_node *prio_tree_replace(
+ struct prio_tree_root *root, struct prio_tree_node *old,
+ struct prio_tree_node *node)
+{
+ INIT_PRIO_TREE_NODE(node);
+
+ if (prio_tree_root(old)) {
+ BUG_ON(root->prio_tree_node != old);
+ /*
+ * We can reduce root->index_bits here. However, it is complex
+ * and does not help much to improve performance (IMO).
+ */
+ node->parent = node;
+ root->prio_tree_node = node;
+ }
+ else {
+ node->parent = old->parent;
+ if (old->parent->left == old)
+ old->parent->left = node;
+ else {
+ BUG_ON(old->parent->right != old);
+ old->parent->right = node;
+ }
+ }
+
+ if (!prio_tree_left_empty(old)) {
+ node->left = old->left;
+ old->left->parent = node;
+ }
+
+ if (!prio_tree_right_empty(old)) {
+ node->right = old->right;
+ old->right->parent = node;
+ }
+
+ return old;
+}
+
+#undef swap
+#define swap(x,y,z) do {z = x; x = y; y = z; } while (0)
+
+/*
+ * Insert a prio_tree_node @node into a radix priority search tree @root. The
+ * algorithm typically takes O(log n) time where 'log n' is the number of bits
+ * required to represent the maximum heap_index. In the worst case, the algo
+ * can take O((log n)^2) - check prio_tree_expand.
+ *
+ * If a prior node with same radix_index and heap_index is already found in
+ * the tree, then returns the address of the prior node. Otherwise, inserts
+ * @node into the tree and returns @node.
+ */
+
+struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
+ struct prio_tree_node *node)
+{
+ struct prio_tree_node *cur, *res = node;
+ unsigned long radix_index, heap_index;
+ unsigned long r_index, h_index, index, mask;
+ int size_flag = 0;
+
+ GET_INDEX(node, radix_index, heap_index);
+
+ if (prio_tree_empty(root) ||
+ heap_index > prio_tree_maxindex(root->index_bits))
+ return prio_tree_expand(root, node, heap_index);
+
+ cur = root->prio_tree_node;
+ mask = 1UL << (root->index_bits - 1);
+
+ while (mask) {
+ GET_INDEX(cur, r_index, h_index);
+
+ if (r_index == radix_index && h_index == heap_index)
+ return cur;
+
+ if (h_index < heap_index || (h_index == heap_index &&
+ r_index > radix_index))
+ {
+ struct prio_tree_node *tmp = node;
+ node = prio_tree_replace(root, cur, node);
+ cur = tmp;
+ swap(r_index, radix_index, index);
+ swap(h_index, heap_index, index);
+ }
+
+ if (size_flag)
+ index = heap_index - radix_index;
+ else
+ index = radix_index;
+
+ if (index & mask) {
+ if (prio_tree_right_empty(cur)) {
+ INIT_PRIO_TREE_NODE(node);
+ cur->right = node;
+ node->parent = cur;
+ return res;
+ }
+ else
+ cur = cur->right;
+ }
+ else {
+ if (prio_tree_left_empty(cur)) {
+ INIT_PRIO_TREE_NODE(node);
+ cur->left = node;
+ node->parent = cur;
+ return res;
+ }
+ else
+ cur = cur->left;
+ }
+
+ mask >>= 1;
+
+ if (!mask) {
+ mask = 1UL << (root->index_bits - 1);
+ size_flag = 1;
+ }
+ }
+ /* Should not reach here */
+ BUG();
+ return NULL;
+}
+
+/*
+ * Remove a prio_tree_node @node from a radix priority search tree @root. The
+ * algorithm takes O(log n) time where 'log n' is the number of bits required
+ * to represent the maximum heap_index.
+ */
+
+void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node)
+{
+ struct prio_tree_node *cur;
+ unsigned long r_index, h_index_right, h_index_left;
+
+ cur = node;
+
+ while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
+ if (!prio_tree_left_empty(cur))
+ GET_INDEX(cur->left, r_index, h_index_left);
+ else {
+ cur = cur->right;
+ continue;
+ }
+
+ if (!prio_tree_right_empty(cur))
+ GET_INDEX(cur->right, r_index, h_index_right);
+ else {
+ cur = cur->left;
+ continue;
+ }
+
+ /* both h_index_left and h_index_right cannot be 0 */
+ if (h_index_left >= h_index_right)
+ cur = cur->left;
+ else
+ cur = cur->right;
+ }
+
+ if (prio_tree_root(cur)) {
+ BUG_ON(root->prio_tree_node != cur);
+ *root = PRIO_TREE_ROOT;
+ return;
+ }
+
+ if (cur->parent->right == cur)
+ cur->parent->right = cur->parent;
+ else {
+ BUG_ON(cur->parent->left != cur);
+ cur->parent->left = cur->parent;
+ }
+
+ while (cur != node)
+ cur = prio_tree_replace(root, cur->parent, cur);
+
+ return;
+}
+
+/*
+ * Following functions help to enumerate all prio_tree_nodes in the tree that
+ * overlap with the input interval X [radix_index, heap_index]. The enumeration
+ * takes O(log n + m) time where 'log n' is the height of the tree (which is
+ * proportional to # of bits required to represent the maximum heap_index) and
+ * 'm' is the number of prio_tree_nodes that overlap the interval X.
+ */
+
+static inline struct prio_tree_node *__prio_tree_left(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long radix_index, unsigned long heap_index,
+ unsigned long *r_index, unsigned long *h_index)
+{
+ if (prio_tree_left_empty(iter->cur))
+ return NULL;
+
+ GET_INDEX(iter->cur->left, *r_index, *h_index);
+
+ if (radix_index <= *h_index) {
+ iter->cur = iter->cur->left;
+ iter->mask >>= 1;
+ if (iter->mask) {
+ if (iter->size_level)
+ iter->size_level++;
+ }
+ else {
+ iter->size_level = 1;
+ iter->mask = 1UL << (root->index_bits - 1);
+ }
+ return iter->cur;
+ }
+
+ return NULL;
+}
+
+
+static inline struct prio_tree_node *__prio_tree_right(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long radix_index, unsigned long heap_index,
+ unsigned long *r_index, unsigned long *h_index)
+{
+ unsigned long value;
+
+ if (prio_tree_right_empty(iter->cur))
+ return NULL;
+
+ if (iter->size_level)
+ value = iter->value;
+ else
+ value = iter->value | iter->mask;
+
+ if (heap_index < value)
+ return NULL;
+
+ GET_INDEX(iter->cur->right, *r_index, *h_index);
+
+ if (radix_index <= *h_index) {
+ iter->cur = iter->cur->right;
+ iter->mask >>= 1;
+ iter->value = value;
+ if (iter->mask) {
+ if (iter->size_level)
+ iter->size_level++;
+ }
+ else {
+ iter->size_level = 1;
+ iter->mask = 1UL << (root->index_bits - 1);
+ }
+ return iter->cur;
+ }
+
+ return NULL;
+}
+
+static inline struct prio_tree_node *__prio_tree_parent(
+ struct prio_tree_iter *iter)
+{
+ iter->cur = iter->cur->parent;
+ iter->mask <<= 1;
+ if (iter->size_level) {
+ if (iter->size_level == 1)
+ iter->mask = 1UL;
+ iter->size_level--;
+ }
+ else if (iter->value & iter->mask)
+ iter->value ^= iter->mask;
+ return iter->cur;
+}
+
+static inline int overlap(unsigned long radix_index, unsigned long heap_index,
+ unsigned long r_index, unsigned long h_index)
+{
+ if (heap_index < r_index || radix_index > h_index)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * prio_tree_first:
+ *
+ * Get the first prio_tree_node that overlaps with the interval [radix_index,
+ * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
+ * traversal of the tree.
+ */
+struct prio_tree_node *prio_tree_first(struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ unsigned long r_index, h_index;
+
+ *iter = PRIO_TREE_ITER;
+
+ if (prio_tree_empty(root))
+ return NULL;
+
+ GET_INDEX(root->prio_tree_node, r_index, h_index);
+
+ if (radix_index > h_index)
+ return NULL;
+
+ iter->mask = 1UL << (root->index_bits - 1);
+ iter->cur = root->prio_tree_node;
+
+ while (1) {
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ if (__prio_tree_left(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ continue;
+
+ if (__prio_tree_right(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ continue;
+
+ break;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(prio_tree_first);
+
+/* Get the next prio_tree_node that overlaps with the input interval in iter */
+struct prio_tree_node *prio_tree_next(struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ unsigned long r_index, h_index;
+
+repeat:
+ while (__prio_tree_left(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ while (!__prio_tree_right(root, iter, radix_index, heap_index,
+ &r_index, &h_index)) {
+ while (!prio_tree_root(iter->cur) &&
+ iter->cur->parent->right == iter->cur)
+ __prio_tree_parent(iter);
+
+ if (prio_tree_root(iter->cur))
+ return NULL;
+
+ __prio_tree_parent(iter);
+ }
+
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ goto repeat;
+}
+EXPORT_SYMBOL(prio_tree_next);
+
+/*
+ * Radix priority search tree for address_space->i_mmap{,_shared}
+ *
+ * For each vma that maps a unique set of file pages, i.e., a unique
+ * [radix_index, heap_index] value, we have a corresponding priority search
+ * tree node. If
+ * multiple vmas have identical [radix_index, heap_index] value, then one of
+ * them is used as a tree node and others are stored in a vm_set list. The tree
+ * node points to the first vma (head) of the list using vm_set_head.
+ *
+ *      prio_tree_root
+ *           |
+ *           A       vm_set_head
+ *          / \      /
+ *         L   R -> H-I-J-K-M-N-O-P-Q-S
+ *         ^   ^    <-- vm_set.list -->
+ *       tree nodes
+ *
+ * We need some way to identify whether a vma is a tree node, head of a vm_set
+ * list, or just a member of a vm_set list. We cannot use vm_flags to store
+ * such information. The reason is, in the above figure, it is possible that
+ * vm_flags' of R and H are covered by the different mmap_sems. When R is
+ * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
+ * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
+ * That's why some trick involving shared.both.parent is used for identifying
+ * tree nodes and list head nodes. We can possibly use the least significant
+ * bit of the vm_set_head field to mark tree and list head nodes. I was worried
+ * about the alignment of vm_area_struct in various architectures.
+ *
+ * vma radix priority search tree node rules:
+ *
+ * vma->shared.both.parent != NULL ==> a tree node
+ *
+ * vma->shared.both.parent == NULL
+ * vma->vm_set_head != NULL ==> list head of vmas that map same pages
+ * vma->vm_set_head == NULL ==> a list node
+ */
+
+void __vma_prio_tree_insert(struct prio_tree_root *root,
+ struct vm_area_struct *vma)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *old;
+
+ ptr = prio_tree_insert(root, &vma->shared.prio_tree_node);
+
+ if (ptr == &vma->shared.prio_tree_node) {
+ vma->vm_set_head = NULL;
+ return;
+ }
+
+ old = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+
+ __vma_prio_tree_add(vma, old);
+}
+
+void __vma_prio_tree_remove(struct prio_tree_root *root,
+ struct vm_area_struct *vma)
+{
+ struct vm_area_struct *node, *head, *new_head;
+
+ if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) {
+ list_del_init(&vma->shared.vm_set.list);
+ INIT_VMA_SHARED(vma);
+ return;
+ }
+
+ if (vma->vm_set_head) {
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->vm_set_head->vm_set_head != vma);
+ if (vma->shared.both.parent) {
+ head = vma->vm_set_head;
+ if (!list_empty(&head->shared.vm_set.list)) {
+ new_head = list_entry(
+ head->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&head->shared.vm_set.list);
+ }
+ else
+ new_head = NULL;
+
+ prio_tree_replace(root, &vma->shared.prio_tree_node,
+ &head->shared.prio_tree_node);
+ head->vm_set_head = new_head;
+ if (new_head)
+ new_head->vm_set_head = head;
+
+ }
+ else {
+ node = vma->vm_set_head;
+ if (!list_empty(&vma->shared.vm_set.list)) {
+ new_head = list_entry(
+ vma->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&vma->shared.vm_set.list);
+ node->vm_set_head = new_head;
+ new_head->vm_set_head = node;
+ }
+ else
+ node->vm_set_head = NULL;
+ }
+ INIT_VMA_SHARED(vma);
+ return;
+ }
+
+ prio_tree_remove(root, &vma->shared.prio_tree_node);
+ INIT_VMA_SHARED(vma);
+}
+
+void __init prio_tree_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
+ index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
+ index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
+}
--- 2.6.5-rc3-mjb2/mm/rmap.c 2004-04-02 21:01:47.423843768 +0100
+++ anobjrmap9/mm/rmap.c 2004-04-04 13:05:41.411469672 +0100
@@ -154,19 +154,16 @@ static inline void clear_page_anon(struc
**/

/*
- * At what user virtual address is page expected in file-backed vma?
+ * At what user virtual address is pgoff expected in file-backed vma?
*/
-#define NOADDR (~0UL) /* impossible user virtual address */
static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct vm_area_struct *vma, unsigned long pgoff)
{
- unsigned long pgoff;
unsigned long address;

- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- return (address >= vma->vm_start && address < vma->vm_end)?
- address: NOADDR;
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ return address;
}

/**
@@ -182,8 +179,14 @@ static int page_referenced_one(struct pa
pte_t *pte;
int referenced = 0;

- if (!spin_trylock(&mm->page_table_lock))
+ if (!spin_trylock(&mm->page_table_lock)) {
+ /*
+ * For debug we're currently warning if not all found,
+ * but in this case that's expected: suppress warning.
+ */
+ *mapcount = -1;
return 0;
+ }

pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -246,6 +249,8 @@ static inline int page_referenced_anon(s
if (!*mapcount)
goto out;
}
+
+ WARN_ON(*mapcount > 0);
out:
spin_unlock(&anonhd->lock);
return referenced;
@@ -268,45 +273,54 @@ out:
static inline int page_referenced_obj(struct page *page, int *mapcount)
{
struct address_space *mapping = page->mapping;
+ unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
unsigned long address;
int referenced = 0;

if (down_trylock(&mapping->i_shared_sem))
return 0;

- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) ==
- (VM_LOCKED|VM_MAYSHARE)) {
+ vma = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
+ == (VM_LOCKED|VM_MAYSHARE)) {
referenced++;
goto out;
}
- referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
- goto out;
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}

- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (!vma->vm_mm->rss || (vma->vm_flags & VM_NONLINEAR))
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
+ vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (vma) {
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
referenced++;
goto out;
}
- referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
- goto out;
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ WARN_ON(*mapcount > 0);
out:
up(&mapping->i_shared_sem);
return referenced;
@@ -688,7 +702,9 @@ out:
static inline int try_to_unmap_obj(struct page *page, int *mapcount)
{
struct address_space *mapping = page->mapping;
+ unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
unsigned long address;
int ret = SWAP_AGAIN;
unsigned long cursor;
@@ -698,47 +714,50 @@ static inline int try_to_unmap_obj(struc
if (down_trylock(&mapping->i_shared_sem))
return ret;

- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ vma = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}

- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
- /*
- * Defer unmapping nonlinear to the next loop,
- * but take notes while we're here e.g. don't
- * want to loop again when no nonlinear vmas.
- */
- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
- continue;
- cursor = (unsigned long) vma->vm_private_data;
- if (cursor > max_nl_cursor)
- max_nl_cursor = cursor;
- cursor = vma->vm_end - vma->vm_start;
- if (cursor > max_nl_size)
- max_nl_size = cursor;
- continue;
+ vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
}
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}

- if (max_nl_size == 0) /* no nonlinear vmas of this file */
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto out;
+
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+ continue;
+ cursor = (unsigned long) vma->vm_private_data;
+ if (cursor > max_nl_cursor)
+ max_nl_cursor = cursor;
+ cursor = vma->vm_end - vma->vm_start;
+ if (cursor > max_nl_size)
+ max_nl_size = cursor;
+ }
+
+ if (max_nl_size == 0)
goto out;

/*
@@ -755,9 +774,9 @@ static inline int try_to_unmap_obj(struc
max_nl_cursor = CLUSTER_SIZE;

do {
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (VM_NONLINEAR != (vma->vm_flags &
- (VM_NONLINEAR|VM_LOCKED|VM_RESERVED)))
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
continue;
cursor = (unsigned long) vma->vm_private_data;
while (vma->vm_mm->rss &&
@@ -771,6 +790,7 @@ static inline int try_to_unmap_obj(struc
vma->vm_private_data = (void *) cursor;
if (*mapcount <= 0)
goto relock;
+ cond_resched();
}
if (ret != SWAP_FAIL)
vma->vm_private_data =
@@ -785,9 +805,9 @@ static inline int try_to_unmap_obj(struc
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if ((vma->vm_flags & (VM_NONLINEAR|VM_RESERVED)) ==
- VM_NONLINEAR)
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (!(vma->vm_flags & VM_RESERVED))
vma->vm_private_data = 0;
}
relock:
--- 2.6.5-rc3-mjb2/mm/shmem.c 2004-04-02 21:01:47.427843160 +0100
+++ anobjrmap9/mm/shmem.c 2004-04-04 13:05:41.413469368 +0100
@@ -1351,7 +1351,8 @@ static void do_shmem_file_read(struct fi
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear))
flush_dcache_page(page);
/*
* Mark the page accessed if we read the beginning.
--- 2.6.5-rc3-mjb2/mm/vmscan.c 2004-04-02 21:01:47.443840728 +0100
+++ anobjrmap9/mm/vmscan.c 2004-04-04 13:05:41.415469064 +0100
@@ -191,9 +191,11 @@ static inline int page_mapping_inuse(str
return 0;

/* File is mmap'd by somebody. */
- if (!list_empty(&mapping->i_mmap))
+ if (!prio_tree_empty(&mapping->i_mmap))
return 1;
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared))
+ return 1;
+ if (!list_empty(&mapping->i_mmap_nonlinear))
return 1;

return 0;
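
The classification rules in the mm/prio_tree.c comment above reduce to a
three-way test. Purely as an illustration (not part of the patch -- the enum
and helper name are invented; the fields are the ones the patch adds to
vm_area_struct):

#include <linux/mm.h>	/* struct vm_area_struct, as modified by this patch */

enum vma_set_role {
	VMA_TREE_NODE,		/* linked into the prio_tree itself */
	VMA_SET_LIST_HEAD,	/* heads the list of vmas mapping the same pages */
	VMA_SET_LIST_NODE	/* plain member of a vm_set list */
};

static inline enum vma_set_role vma_set_role(struct vm_area_struct *vma)
{
	if (vma->shared.both.parent != NULL)
		return VMA_TREE_NODE;
	if (vma->vm_set_head != NULL)
		return VMA_SET_LIST_HEAD;
	return VMA_SET_LIST_NODE;
}

Note that a tree node may itself have a non-NULL vm_set_head (pointing at the
head of its vm_set list), which is why the parent test must come first.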


2004-04-09 20:39:34

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> of anon_vma, and of course each tree has its own additional features.

This slows down kernel compile a little, but worse, it slows down SDET
by about 25% (on the 16x). I think you did something horrible to sem
contention ... presumably i_shared_sem, which SDET was fighting with
as it was anyway ;-(

Diffprofile shows:


122626 15.7% total
44129 790.0% __down
20988 4.1% default_idle
12101 550.3% __wake_up
11723 489.1% finish_task_switch
6988 77.4% do_wp_page
3983 21.7% copy_page_range
2683 19.2% zap_pte_range
2325 54.3% do_anonymous_page
2293 73.1% copy_mm
1787 68.3% remove_shared_vm_struct
1768 101.6% pte_alloc_one
1564 40.0% do_no_page
1520 50.8% do_page_fault
1376 39.2% clear_page_tables
1282 63.4% __copy_user_intel
926 9.4% page_remove_rmap
878 13.1% __copy_to_user_ll
835 46.8% __block_prepare_write
788 35.8% copy_process
777 0.0% __vma_prio_tree_remove
761 48.8% buffered_rmqueue
740 48.6% free_hot_cold_page
674 128.4% vma_link
641 0.0% __vma_prio_tree_insert
612 941.5% sched_clock
585 0.0% prio_tree_insert
563 60.4% exit_notify
547 225.1% split_vma
539 6.4% release_pages
534 464.3% schedule
495 32.0% release_task
422 148.1% flush_signal_handlers
421 66.6% find_vma
420 79.5% set_page_dirty
409 60.1% fput
359 44.5% __copy_from_user_ll
319 47.6% do_mmap_pgoff
290 254.4% find_vma_prepare
270 167.7% rb_insert_color
254 61.7% pte_alloc_map
251 91.3% exit_mmap
229 23.2% __read_lock_failed
228 9.9% filemap_nopage
...
-100 -29.3% group_reserve_blocks
-107 -53.5% .text.lock.namespace
-107 -18.4% render_sigset_t
-126 -18.7% mmgrab
-146 -10.9% generic_file_open
-166 -9.5% ext2_new_inode
-166 -38.1% d_path
-166 -20.1% __find_get_block_slow
-173 -20.7% proc_pid_status
-182 -19.3% update_atime
-185 -25.8% fd_install
-202 -13.8% .text.lock.highmem
-221 -14.5% __fput
-225 -14.3% number
-257 -14.2% proc_pid_stat
-284 -21.6% file_kill
-290 -35.3% proc_root_link
-300 -36.5% ext2_new_block
-349 -61.7% .text.lock.base
-382 -48.0% proc_check_root
-412 -19.4% path_release
-454 -20.0% file_move
-462 -32.2% lookup_mnt
-515 -4.5% find_get_page
-547 -34.5% .text.lock.dcache
-689 -31.2% follow_mount
-940 -33.8% .text.lock.dec_and_lock
-1043 -51.6% .text.lock.file_table
-1115 -9.9% __d_lookup
-1226 -20.1% path_lookup
-1305 -61.5% grab_block
-2101 -29.8% atomic_dec_and_lock
-2554 -40.3% .text.lock.filemap

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
-> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
race. That may be the reason for the slowdown, I am not sure.

Rajesh

On Fri, 9 Apr 2004, Martin J. Bligh wrote:

> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> > of anon_vma, and of course each tree has its own additional features.
>
> This slows down kernel compile a little, but worse, it slows down SDET
> by about 25% (on the 16x). I think you did something horrible to sem
> contention ... presumably i_shared_sem, which SDET was fighting with
> as it was anyway ;-(
>
> Diffprofile shows:
>
>
> 122626 15.7% total
> 44129 790.0% __down
> 20988 4.1% default_idle
> 12101 550.3% __wake_up
> 11723 489.1% finish_task_switch
> 6988 77.4% do_wp_page
> 3983 21.7% copy_page_range
> 2683 19.2% zap_pte_range
> 2325 54.3% do_anonymous_page
> 2293 73.1% copy_mm
> 1787 68.3% remove_shared_vm_struct
> 1768 101.6% pte_alloc_one
> 1564 40.0% do_no_page
> 1520 50.8% do_page_fault
> 1376 39.2% clear_page_tables
> 1282 63.4% __copy_user_intel
> 926 9.4% page_remove_rmap
> 878 13.1% __copy_to_user_ll
> 835 46.8% __block_prepare_write
> 788 35.8% copy_process
> 777 0.0% __vma_prio_tree_remove
> 761 48.8% buffered_rmqueue
> 740 48.6% free_hot_cold_page
> 674 128.4% vma_link
> 641 0.0% __vma_prio_tree_insert
> 612 941.5% sched_clock
> 585 0.0% prio_tree_insert
> 563 60.4% exit_notify
> 547 225.1% split_vma
> 539 6.4% release_pages
> 534 464.3% schedule
> 495 32.0% release_task
> 422 148.1% flush_signal_handlers
> 421 66.6% find_vma
> 420 79.5% set_page_dirty
> 409 60.1% fput
> 359 44.5% __copy_from_user_ll
> 319 47.6% do_mmap_pgoff
> 290 254.4% find_vma_prepare
> 270 167.7% rb_insert_color
> 254 61.7% pte_alloc_map
> 251 91.3% exit_mmap
> 229 23.2% __read_lock_failed
> 228 9.9% filemap_nopage
> ...
> -100 -29.3% group_reserve_blocks
> -107 -53.5% .text.lock.namespace
> -107 -18.4% render_sigset_t
> -126 -18.7% mmgrab
> -146 -10.9% generic_file_open
> -166 -9.5% ext2_new_inode
> -166 -38.1% d_path
> -166 -20.1% __find_get_block_slow
> -173 -20.7% proc_pid_status
> -182 -19.3% update_atime
> -185 -25.8% fd_install
> -202 -13.8% .text.lock.highmem
> -221 -14.5% __fput
> -225 -14.3% number
> -257 -14.2% proc_pid_stat
> -284 -21.6% file_kill
> -290 -35.3% proc_root_link
> -300 -36.5% ext2_new_block
> -349 -61.7% .text.lock.base
> -382 -48.0% proc_check_root
> -412 -19.4% path_release
> -454 -20.0% file_move
> -462 -32.2% lookup_mnt
> -515 -4.5% find_get_page
> -547 -34.5% .text.lock.dcache
> -689 -31.2% follow_mount
> -940 -33.8% .text.lock.dec_and_lock
> -1043 -51.6% .text.lock.file_table
> -1115 -9.9% __d_lookup
> -1226 -20.1% path_lookup
> -1305 -61.5% grab_block
> -2101 -29.8% atomic_dec_and_lock
> -2554 -40.3% .text.lock.filemap
>
>

2004-04-09 21:51:12

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> > of anon_vma, and of course each tree has its own additional features.
>
> This slows down kernel compile a little, but worse, it slows down SDET
> by about 25% (on the 16x). I think you did something horrible to sem
> contention ... presumably i_shared_sem, which SDET was fighting with
> as it was anyway ;-(
>
> Diffprofile shows:
>
> 122626 15.7% total
> 44129 790.0% __down
> 20988 4.1% default_idle

Many thanks for the good news, Martin ;)
Looks like I've done something very stupid, perhaps a mismerge.
Not found it yet, I'll carry on looking tomorrow.

Hugh

2004-04-09 22:17:29

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

--Hugh Dickins <[email protected]> wrote (on Friday, April 09, 2004 22:51:03 +0100):

> On Fri, 9 Apr 2004, Martin J. Bligh wrote:
>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>
> Many thanks for the good news, Martin ;)
> Looks like I've done something very stupid, perhaps a mismerge.
> Not found it yet, I'll carry on looking tomorrow.

I don't think so ... I ran across a similar problem when I tried to convert
the vma list to a list-of-lists for what you're trying to use prio_tree
for, I think. The sorry fact is that that thing is heavily contended,
and doing complex manipulations under it makes it worse ;-(

I vaguely considered being a smarty-pants and doing RCU list-of-lists, but
never got around to it. I'll shove in some backtraces and check out the
sem activity to be sure.

M.

2004-04-09 22:17:26

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
> -> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
> race. That may be the reason for the slowdown, I am not sure.

I don't think so .... I presume you're just holding the sem for longer
during normal operations due to the more complex data structure.
This was just with vs without the prio tree patch, so Hugh's changes
were in both sets ...

M.

> Rajesh
>
> On Fri, 9 Apr 2004, Martin J. Bligh wrote:
>
>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>> 12101 550.3% __wake_up
>> 11723 489.1% finish_task_switch
>> 6988 77.4% do_wp_page
>> 3983 21.7% copy_page_range
>> 2683 19.2% zap_pte_range
>> 2325 54.3% do_anonymous_page
>> 2293 73.1% copy_mm
>> 1787 68.3% remove_shared_vm_struct
>> 1768 101.6% pte_alloc_one
>> 1564 40.0% do_no_page
>> 1520 50.8% do_page_fault
>> 1376 39.2% clear_page_tables
>> 1282 63.4% __copy_user_intel
>> 926 9.4% page_remove_rmap
>> 878 13.1% __copy_to_user_ll
>> 835 46.8% __block_prepare_write
>> 788 35.8% copy_process
>> 777 0.0% __vma_prio_tree_remove
>> 761 48.8% buffered_rmqueue
>> 740 48.6% free_hot_cold_page
>> 674 128.4% vma_link
>> 641 0.0% __vma_prio_tree_insert
>> 612 941.5% sched_clock
>> 585 0.0% prio_tree_insert
>> 563 60.4% exit_notify
>> 547 225.1% split_vma
>> 539 6.4% release_pages
>> 534 464.3% schedule
>> 495 32.0% release_task
>> 422 148.1% flush_signal_handlers
>> 421 66.6% find_vma
>> 420 79.5% set_page_dirty
>> 409 60.1% fput
>> 359 44.5% __copy_from_user_ll
>> 319 47.6% do_mmap_pgoff
>> 290 254.4% find_vma_prepare
>> 270 167.7% rb_insert_color
>> 254 61.7% pte_alloc_map
>> 251 91.3% exit_mmap
>> 229 23.2% __read_lock_failed
>> 228 9.9% filemap_nopage
>> ...
>> -100 -29.3% group_reserve_blocks
>> -107 -53.5% .text.lock.namespace
>> -107 -18.4% render_sigset_t
>> -126 -18.7% mmgrab
>> -146 -10.9% generic_file_open
>> -166 -9.5% ext2_new_inode
>> -166 -38.1% d_path
>> -166 -20.1% __find_get_block_slow
>> -173 -20.7% proc_pid_status
>> -182 -19.3% update_atime
>> -185 -25.8% fd_install
>> -202 -13.8% .text.lock.highmem
>> -221 -14.5% __fput
>> -225 -14.3% number
>> -257 -14.2% proc_pid_stat
>> -284 -21.6% file_kill
>> -290 -35.3% proc_root_link
>> -300 -36.5% ext2_new_block
>> -349 -61.7% .text.lock.base
>> -382 -48.0% proc_check_root
>> -412 -19.4% path_release
>> -454 -20.0% file_move
>> -462 -32.2% lookup_mnt
>> -515 -4.5% find_get_page
>> -547 -34.5% .text.lock.dcache
>> -689 -31.2% follow_mount
>> -940 -33.8% .text.lock.dec_and_lock
>> -1043 -51.6% .text.lock.file_table
>> -1115 -9.9% __d_lookup
>> -1226 -20.1% path_lookup
>> -1305 -61.5% grab_block
>> -2101 -29.8% atomic_dec_and_lock
>> -2554 -40.3% .text.lock.filemap
>>
>>
>
>


2004-04-09 22:57:12

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>
> Many thanks for the good news, Martin ;)
> Looks like I've done something very stupid, perhaps a mismerge.
> Not found it yet, I'll carry on looking tomorrow.

I applied Andrew's highly sophisticated proprietary semtrace technology.
The common ones are:

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014314f>] .text.lock.mmap+0x39/0x12a
[<c01421a7>] do_mmap_pgoff+0x4cf/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb

Which is the vma_link call from do_mmap_pgoff here:

/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
*/
addr = vma->vm_start;

if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr,
addr + len, vma->vm_flags, file, pgoff)) {
vma_link(mm, vma, prev, rb_link, rb_parent);
if (correct_wcount)
atomic_inc(&inode->i_writecount);

vma_link takes i_shared_sem.

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c01431dd>] .text.lock.mmap+0xc7/0x12a
[<c0142b4c>] do_munmap+0xbc/0x128
[<c0141f91>] do_mmap_pgoff+0x2b9/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb

That's the call to split_vma from do_munmap here:

/*
* If we need to split any vma, do it now to save pain later.
*
* Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
if (start > mpnt->vm_start) {
if (split_vma(mm, mpnt, start, 0))
return -ENOMEM;
prev = mpnt;
}

split_vma takes i_shared_sem, then takes page_table_lock inside it
(which probably isn't helping either ;-)).

if (mapping)
down(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014311b>] .text.lock.mmap+0x5/0x12a
[<c0142f79>] exit_mmap+0x191/0x1d0
[<c011ae84>] mmput+0x50/0x70
[<c011ec6d>] do_exit+0x1b9/0x330
[<c011eefa>] do_group_exit+0x9e/0xa0
[<c011ef0a>] sys_exit_group+0xe/0x14
[<c025c753>] syscall_call+0x7/0xb

That's remove_shared_vm_struct taking i_shared_sem

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014311b>] .text.lock.mmap+0x5/0x12a
[<c0142694>] unmap_vma+0x44/0x78
[<c01426dc>] unmap_vma_list+0x14/0x20
[<c0142ba5>] do_munmap+0x115/0x128
[<c0141f91>] do_mmap_pgoff+0x2b9/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb

That's remove_shared_vm_struct again, but called from unmap_vma this time

Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c011c494>] .text.lock.fork+0x79/0x125
[<c011be5c>] copy_process+0x61c/0xa6c
[<c011c322>] do_fork+0x76/0x16f
[<c0105619>] sys_clone+0x29/0x30
[<c025c753>] syscall_call+0x7/0xb

That's dup_mmap taking i_shared_sem here:

/* insert tmp into the share list, just after mpnt */
down(&file->f_mapping->i_shared_sem);
__vma_prio_tree_add(tmp, mpnt);
up(&file->f_mapping->i_shared_sem);

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> > Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
> > -> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
> > race. That may be the reason for the slowdown, I am not sure.
>
> I don't think so .... I presume you're just holding the sem for longer
> during normal operations due to the more complex data structure.

I haven't done any benchmarks with prio_tree. I just tried kernel compile
which was not bad. I tried rmap-test.c and test-mmap3.c. The results were
not bad on UP. I didn't try them on SMP, so you may be right.

> This was just with vs without the prio tree patch, so Hugh's changes
> were in both sets ...

The change I mentioned above was added only in Hugh's prio_tree patch,
it was not there before. However, I was just guessing that the mremap
change could be a reason. It could very well be due to prio_tree complexity.
I don't have any data to prove or disprove either way.

Rajesh


> > On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> >
> >> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> >> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> >> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> >> > of anon_vma, and of course each tree has its own additional features.
> >>
> >> This slows down kernel compile a little, but worse, it slows down SDET
> >> by about 25% (on the 16x). I think you did something horrible to sem
> >> contention ... presumably i_shared_sem, which SDET was fighting with
> >> as it was anyway ;-(
> >>
> >> Diffprofile shows:
> >>
> >>
> >> 122626 15.7% total
> >> 44129 790.0% __down
> >> 20988 4.1% default_idle
> >> 12101 550.3% __wake_up
> >> 11723 489.1% finish_task_switch
> >> 6988 77.4% do_wp_page
> >> 3983 21.7% copy_page_range
> >> 2683 19.2% zap_pte_range
> >> 2325 54.3% do_anonymous_page
> >> 2293 73.1% copy_mm
> >> 1787 68.3% remove_shared_vm_struct
> >> 1768 101.6% pte_alloc_one
> >> 1564 40.0% do_no_page
> >> 1520 50.8% do_page_fault
> >> 1376 39.2% clear_page_tables
> >> 1282 63.4% __copy_user_intel
> >> 926 9.4% page_remove_rmap
> >> 878 13.1% __copy_to_user_ll
> >> 835 46.8% __block_prepare_write
> >> 788 35.8% copy_process
> >> 777 0.0% __vma_prio_tree_remove
> >> 761 48.8% buffered_rmqueue
> >> 740 48.6% free_hot_cold_page
> >> 674 128.4% vma_link
> >> 641 0.0% __vma_prio_tree_insert
> >> 612 941.5% sched_clock
> >> 585 0.0% prio_tree_insert
> >> 563 60.4% exit_notify
> >> 547 225.1% split_vma
> >> 539 6.4% release_pages
> >> 534 464.3% schedule
> >> 495 32.0% release_task
> >> 422 148.1% flush_signal_handlers
> >> 421 66.6% find_vma
> >> 420 79.5% set_page_dirty
> >> 409 60.1% fput
> >> 359 44.5% __copy_from_user_ll
> >> 319 47.6% do_mmap_pgoff
> >> 290 254.4% find_vma_prepare
> >> 270 167.7% rb_insert_color
> >> 254 61.7% pte_alloc_map
> >> 251 91.3% exit_mmap
> >> 229 23.2% __read_lock_failed
> >> 228 9.9% filemap_nopage
> >> ...
> >> -100 -29.3% group_reserve_blocks
> >> -107 -53.5% .text.lock.namespace
> >> -107 -18.4% render_sigset_t
> >> -126 -18.7% mmgrab
> >> -146 -10.9% generic_file_open
> >> -166 -9.5% ext2_new_inode
> >> -166 -38.1% d_path
> >> -166 -20.1% __find_get_block_slow
> >> -173 -20.7% proc_pid_status
> >> -182 -19.3% update_atime
> >> -185 -25.8% fd_install
> >> -202 -13.8% .text.lock.highmem
> >> -221 -14.5% __fput
> >> -225 -14.3% number
> >> -257 -14.2% proc_pid_stat
> >> -284 -21.6% file_kill
> >> -290 -35.3% proc_root_link
> >> -300 -36.5% ext2_new_block
> >> -349 -61.7% .text.lock.base
> >> -382 -48.0% proc_check_root
> >> -412 -19.4% path_release
> >> -454 -20.0% file_move
> >> -462 -32.2% lookup_mnt
> >> -515 -4.5% find_get_page
> >> -547 -34.5% .text.lock.dcache
> >> -689 -31.2% follow_mount
> >> -940 -33.8% .text.lock.dec_and_lock
> >> -1043 -51.6% .text.lock.file_table
> >> -1115 -9.9% __d_lookup
> >> -1226 -20.1% path_lookup
> >> -1305 -61.5% grab_block
> >> -2101 -29.8% atomic_dec_and_lock
> >> -2554 -40.3% .text.lock.filemap
> >>
> >>
> >
> >
>
>
>

2004-04-11 16:10:09

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> >> This slows down kernel compile a little, but worse, it slows down SDET
> >> by about 25% (on the 16x). I think you did something horrible to sem
> >> contention ... presumably i_shared_sem, which SDET was fighting with
> >> as it was anyway ;-(
> >>
> >> Diffprofile shows:
> >>
> >> 122626 15.7% total
> >> 44129 790.0% __down
> >> 20988 4.1% default_idle
>
> I applied Andrew's highly sophisticated proprietary semtrace technology.

Thanks a lot, Martin, this seems pretty important.

So, i_shared_sem, as you supposed.

Do you still have the two profiles input to diffprofile?
I wonder if they'd have clues to help us understand it better.

Any chance of you doing the same comparison between 2.6.5-aa5 and
2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
rejects in mm/mmap.c itself, let me know if I can help out on that.)

If -aa is okay, I hope so, then it's surely some stupidity from me.

We're not at all surprised that vma linking and unlinking should take
rather longer; but the rise in __down, __wake_up, finish_task_switch
is horrifying. Or is that how it usually looks, when a semaphore is
well contended - thundering herd?

Hugh

2004-04-11 18:08:33

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> I applied Andrew's highly sophisticated proprietary semtrace technology.
>
> Thanks a lot, Martin, this seems pretty important.
>
> So, i_shared_sem, as you supposed.
>
> Do you still have the two profiles input to diffprofile?
> I wonder if they'd have clues to help us understand it better.

Yup. Attached.

> Any chance of you doing the same comparison between 2.6.5-aa5
> 2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
> hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
> rejects in mm/mmap.c itself, let me know if I can help out on that.)
>
> If -aa is okay, I hope so, then it's surely some stupidity from me.

Good idea. Not sure how easy it'll be to back prio_tree out, but I can
surely do aa5, which would give us a good clue still. Might not be until
this time tomorrow though.

> We're not at all surprised that vma linking and unlinking should take
> rather longer; but the rise in __down, __wake_up, finish_task_switch
> is horrifying. Or is that how it usually looks, when a semaphore is
> well contended - thundering herd?

I think there's just a locking cliff you fall off after a certain level
of contention ... i_shared_sem has always been bad for SDET, to be fair.
But I hate making it worse ;-) I did more investigation of it a year or
so ago ... something does horrible things to it (which is why akpm turned
it into a sem in the first place) ... maybe holding it over process teardown
for eons or something. Bah, I want lockmeter for sems ;-)

Maybe I can dig out my old analysis ... I'll take a look. I suppose I could
always turn it back into a spinlock to look at it ;-) On the other hand, if
we can fix that problem, I think my "list of lists" was simpler than prio
tree (and is probably much more susceptible to RCU).

M.



Attachments:
mjb1 (16.82 kB)
mjb1-prio (17.45 kB)
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


This patch is an attempt at reducing the contention on i_shared_sem
by introducing a new semaphore, i_mmap_sem. The i_shared_sem now covers
the i_mmap_shared tree and the i_mmap_nonlinear list, whereas i_mmap_sem
covers the i_mmap tree. This may help to reduce the contention on
i_shared_sem if a file is mapped both private and shared. Kernel
compile time with and without this patch did not change much, though.

This patch is on top of 2.6.5-mjb1+anobjrmap9_prio. Compiled and
tested.

Martin! Are you interested in testing SDET with this patch ?



fs/hugetlbfs/inode.c | 6 +++++-
fs/inode.c | 1 +
include/linux/fs.h | 3 ++-
include/linux/mm.h | 24 +++++++++++++++++++++++-
kernel/fork.c | 4 ++--
mm/filemap.c | 6 +++---
mm/memory.c | 6 +++++-
mm/mmap.c | 51 +++++++++++++++------------------------------------
mm/mremap.c | 29 ++++++++++++-----------------
mm/rmap.c | 32 ++++++++++++++++++++++++--------
10 files changed, 92 insertions(+), 70 deletions(-)

diff -puN include/linux/fs.h~010_sem_contention include/linux/fs.h
--- mmlinux-2.6/include/linux/fs.h~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/fs.h 2004-04-11 22:07:40.000000000 -0400
@@ -333,7 +333,8 @@ struct address_space {
struct prio_tree_root i_mmap; /* tree of private mappings */
struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
- struct semaphore i_shared_sem; /* protect both above lists */
+ struct semaphore i_mmap_sem; /* protect i_mmap prio_tree */
+ struct semaphore i_shared_sem; /* protect shared and nonlinear */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN include/linux/mm.h~010_sem_contention include/linux/mm.h
--- mmlinux-2.6/include/linux/mm.h~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/mm.h 2004-04-11 22:08:13.000000000 -0400
@@ -232,7 +232,7 @@ static inline void __vma_prio_tree_add(s
* We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
* already present in an i_mmap{_shared} tree without modifying the tree. The
* following helper function should be used when such modifications are
- * necessary. We should hold the mapping's i_shared_sem.
+ * necessary. We should hold the mapping's i_shared_sem or i_mmap_sem.
*
* This function can be (micro)optimized for some special cases (maybe later).
*/
@@ -296,6 +296,28 @@ static inline struct vm_area_struct *__v
return NULL;
}

+/* Caller should hold mmap_sem */
+static inline void vma_mapping_lock(struct vm_area_struct *vma)
+{
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ down(&vma->vm_file->f_mapping->i_shared_sem);
+ else
+ down(&vma->vm_file->f_mapping->i_mmap_sem);
+ }
+}
+
+/* Caller should hold mmap_sem */
+static inline void vma_mapping_unlock(struct vm_area_struct *vma)
+{
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ up(&vma->vm_file->f_mapping->i_shared_sem);
+ else
+ up(&vma->vm_file->f_mapping->i_mmap_sem);
+ }
+}
+
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
diff -puN kernel/fork.c~010_sem_contention kernel/fork.c
--- mmlinux-2.6/kernel/fork.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/kernel/fork.c 2004-04-11 22:07:40.000000000 -0400
@@ -332,9 +332,9 @@ static inline int dup_mmap(struct mm_str
atomic_dec(&inode->i_writecount);

/* insert tmp into the share list, just after mpnt */
- down(&file->f_mapping->i_shared_sem);
+ vma_mapping_lock(tmp);
__vma_prio_tree_add(tmp, mpnt);
- up(&file->f_mapping->i_shared_sem);
+ vma_mapping_unlock(tmp);
}

/*
diff -puN mm/mmap.c~010_sem_contention mm/mmap.c
--- mmlinux-2.6/mm/mmap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mmap.c 2004-04-11 22:07:40.000000000 -0400
@@ -67,7 +67,7 @@ int mmap_use_hugepages = 0;
int mmap_hugepages_map_sz = 256;

/*
- * Requires inode->i_mapping->i_shared_sem
+ * Requires inode->i_mapping->i_shared_sem or i_mmap_sem
*/
static inline void
__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode,
@@ -96,10 +96,10 @@ static void remove_shared_vm_struct(stru

if (file) {
struct address_space *mapping = file->f_mapping;
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
__remove_shared_vm_struct(vma, file->f_dentry->d_inode,
mapping);
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);
}
}

@@ -298,18 +298,11 @@ static void vma_link(struct mm_struct *m
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
- struct address_space *mapping = NULL;
-
- if (vma->vm_file)
- mapping = vma->vm_file->f_mapping;
-
- if (mapping)
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
spin_lock(&mm->page_table_lock);
__vma_link(mm, vma, prev, rb_link, rb_parent);
spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);

mark_mm_hugetlb(mm, vma);
mm->map_count++;
@@ -318,8 +311,8 @@ static void vma_link(struct mm_struct *m

/*
* Insert vm structure into process list sorted by address and into the inode's
- * i_mmap ring. The caller should hold mm->page_table_lock and
- * ->f_mappping->i_shared_sem if vm_file is non-NULL.
+ * i_mmap ring. The caller should hold mm->page_table_lock and i_shared_sem or
+ * i_mmap_sem if vm_file is non-NULL.
*/
static void
__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
@@ -410,7 +403,6 @@ static struct vm_area_struct *vma_merge(
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
struct address_space *mapping = file ? file->f_mapping : NULL;
- struct semaphore *i_shared_sem;
struct prio_tree_root *root = NULL;

/*
@@ -420,8 +412,6 @@ static struct vm_area_struct *vma_merge(
if (vm_flags & VM_SPECIAL)
return NULL;

- i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
-
if (mapping) {
if (vm_flags & VM_SHARED) {
if (likely(!(vm_flags & VM_NONLINEAR)))
@@ -442,13 +432,8 @@ static struct vm_area_struct *vma_merge(
if (prev->vm_end == addr &&
can_vma_merge_after(prev, vm_flags, file, pgoff)) {
struct vm_area_struct *next;
- int need_up = 0;

- if (unlikely(file && prev->vm_next &&
- prev->vm_next->vm_file == file)) {
- down(i_shared_sem);
- need_up = 1;
- }
+ vma_mapping_lock(prev);
spin_lock(lock);

/*
@@ -463,8 +448,7 @@ static struct vm_area_struct *vma_merge(
next->vm_end, prev->vm_pgoff);
__remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
if (file)
fput(file);

@@ -475,8 +459,7 @@ static struct vm_area_struct *vma_merge(

__vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
return prev;
}

@@ -490,14 +473,12 @@ static struct vm_area_struct *vma_merge(
pgoff, (end - addr) >> PAGE_SHIFT))
return NULL;
if (end == prev->vm_start) {
- if (file)
- down(i_shared_sem);
+ vma_mapping_lock(prev);
spin_lock(lock);
__vma_modify(root, prev, addr, prev->vm_end,
prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
- if (file)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
return prev;
}
}
@@ -1361,8 +1342,7 @@ int split_vma(struct mm_struct * mm, str
root = &mapping->i_mmap;
}

- if (mapping)
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
spin_lock(&mm->page_table_lock);

if (new_below)
@@ -1374,8 +1354,7 @@ int split_vma(struct mm_struct * mm, str
__insert_vm_struct(mm, new);

spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);

return 0;
}
@@ -1609,7 +1588,7 @@ void exit_mmap(struct mm_struct *mm)

/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap ring. If vm_file is non-NULL
- * then i_shared_sem is taken here.
+ * then i_shared_sem or i_mmap_sem is taken here.
*/
void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
diff -puN mm/memory.c~010_sem_contention mm/memory.c
--- mmlinux-2.6/mm/memory.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/memory.c 2004-04-11 22:07:40.000000000 -0400
@@ -1133,11 +1133,15 @@ void invalidate_mmap_range(struct addres
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
- down(&mapping->i_shared_sem);
+ down(&mapping->i_mmap_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
+ up(&mapping->i_mmap_sem);
+ down(&mapping->i_shared_sem);
+ /* Protect against page fault -- not sure this is required */
+ atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
up(&mapping->i_shared_sem);
diff -puN mm/filemap.c~010_sem_contention mm/filemap.c
--- mmlinux-2.6/mm/filemap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/filemap.c 2004-04-11 22:07:40.000000000 -0400
@@ -55,17 +55,17 @@
/*
* Lock ordering:
*
- * ->i_shared_sem (vmtruncate)
+ * ->i_shared{_mmap}_sem (vmtruncate)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_list_lock
* ->swap_device_lock (exclusive_swap_page, others)
* ->mapping->page_lock
*
* ->i_sem
- * ->i_shared_sem (truncate->invalidate_mmap_range)
+ * ->i_shared{_mmap}_sem (truncate->invalidate_mmap_range)
*
* ->mmap_sem
- * ->i_shared_sem (various places)
+ * ->i_shared{_mmap}_sem (various places)
*
* ->mmap_sem
* ->lock_page (access_process_vm)
diff -puN mm/mremap.c~010_sem_contention mm/mremap.c
--- mmlinux-2.6/mm/mremap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mremap.c 2004-04-11 22:07:40.000000000 -0400
@@ -267,7 +267,6 @@ static unsigned long move_vma(struct vm_
unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
- struct address_space *mapping = NULL;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
@@ -287,16 +286,14 @@ static unsigned long move_vma(struct vm_
if (!new_vma)
return -ENOMEM;

- if (vma->vm_file) {
- /*
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, we must lock vmtruncate out,
- * since it might clean the dst vma before the src vma,
- * and we propagate stale pages into the dst afterward.
- */
- mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
- }
+ /*
+ * Subtle point from Rajesh Venkatasubramanian: before
+ * moving file-based ptes, we must lock vmtruncate out,
+ * since it might clean the dst vma before the src vma,
+ * and we propagate stale pages into the dst afterward.
+ */
+ vma_mapping_lock(vma);
+
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
/*
@@ -310,8 +307,8 @@ static unsigned long move_vma(struct vm_
old_addr = new_addr;
new_addr = -ENOMEM;
}
- if (mapping)
- up(&mapping->i_shared_sem);
+
+ vma_mapping_unlock(vma);

/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -476,16 +473,14 @@ unsigned long do_mremap(unsigned long ad
}
else
root = &mapping->i_mmap;
- down(&mapping->i_shared_sem);
}

+ vma_mapping_lock(vma);
spin_lock(&vma->vm_mm->page_table_lock);
__vma_modify(root, vma, vma->vm_start,
addr + new_len, vma->vm_pgoff);
spin_unlock(&vma->vm_mm->page_table_lock);
-
- if(mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);

current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
diff -puN mm/rmap.c~010_sem_contention mm/rmap.c
--- mmlinux-2.6/mm/rmap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/rmap.c 2004-04-11 22:07:40.000000000 -0400
@@ -267,8 +267,8 @@ out:
*
* This function is only called from page_referenced for object-based pages.
*
- * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * assume a reference count of 0, so try_to_unmap will then have a go.
+ * The semaphores ->i_mmap_sem and ->i_shared_sem are tried. If they can't be
+ * gotten, assume a reference count of 0, so try_to_unmap will then have a go.
*/
static inline int page_referenced_obj(struct page *page, int *mapcount)
{
@@ -276,12 +276,14 @@ static inline int page_referenced_obj(st
unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
+ struct semaphore *semaphore;
unsigned long address;
int referenced = 0;

- if (down_trylock(&mapping->i_shared_sem))
+ if (down_trylock(&mapping->i_mmap_sem))
return 0;

+ semaphore = &mapping->i_mmap_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
@@ -301,6 +303,12 @@ static inline int page_referenced_obj(st
&iter, pgoff, pgoff);
}

+ up(&mapping->i_mmap_sem);
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return 0;
+
+ semaphore = &mapping->i_shared_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
@@ -322,7 +330,7 @@ static inline int page_referenced_obj(st
if (list_empty(&mapping->i_mmap_nonlinear))
WARN_ON(*mapcount > 0);
out:
- up(&mapping->i_shared_sem);
+ up(semaphore);
return referenced;
}

@@ -696,8 +704,8 @@ out:
*
* This function is only called from try_to_unmap for object-based pages.
*
- * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * return a temporary error.
+ * The semaphores ->i_mmap_sem and ->i_shared_sem are tried. If they can't be
+ * gotten, return a temporary error.
*/
static inline int try_to_unmap_obj(struct page *page, int *mapcount)
{
@@ -705,15 +713,17 @@ static inline int try_to_unmap_obj(struc
unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
+ struct semaphore *semaphore;
unsigned long address;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;

- if (down_trylock(&mapping->i_shared_sem))
+ if (down_trylock(&mapping->i_mmap_sem))
return ret;

+ semaphore = &mapping->i_mmap_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
@@ -728,6 +738,12 @@ static inline int try_to_unmap_obj(struc
&iter, pgoff, pgoff);
}

+ up(&mapping->i_mmap_sem);
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return ret;
+
+ semaphore = &mapping->i_shared_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
@@ -813,7 +829,7 @@ static inline int try_to_unmap_obj(struc
relock:
rmap_lock(page);
out:
- up(&mapping->i_shared_sem);
+ up(semaphore);
return ret;
}

diff -puN fs/inode.c~010_sem_contention fs/inode.c
--- mmlinux-2.6/fs/inode.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/fs/inode.c 2004-04-11 22:07:40.000000000 -0400
@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode
sema_init(&inode->i_sem, 1);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_mmap_sem);
init_MUTEX(&inode->i_data.i_shared_sem);
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
diff -puN fs/hugetlbfs/inode.c~010_sem_contention fs/hugetlbfs/inode.c
--- mmlinux-2.6/fs/hugetlbfs/inode.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/fs/hugetlbfs/inode.c 2004-04-11 22:07:40.000000000 -0400
@@ -325,11 +325,15 @@ static int hugetlb_vmtruncate(struct ino
pgoff = offset >> HPAGE_SHIFT;

inode->i_size = offset;
- down(&mapping->i_shared_sem);
+ down(&mapping->i_mmap_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ up(&mapping->i_mmap_sem);
+ down(&mapping->i_shared_sem);
+ /* Protect against page fault -- not sure this is required */
+ atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
up(&mapping->i_shared_sem);

_

2004-04-12 05:24:33

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> This patch is an attempt at reducing the contention on i_shared_sem
> by introducing a new semaphore i_mmap_sem. The i_shared_sem covers
> i_mmap_shared tree and i_mmap_nonlinear list now, whereas i_mmap_sem
> covers i_mmap tree. This may help to reduce the contention on
> i_shared_sem if a file is mapped both private and shared. Kernel
> compile time with and without this patch did not change much, though.
>
> This patch is on top of 2.6.5-mjb1+anobjrmap9_prio. Compiled and
> tested.
>
> Martin! Are you interested in testing SDET with this patch ?

Runs exactly the same as prio ... thanks for having a crack at it though.
I guess that sharing combination isn't that common ;-(

M.

2004-04-12 15:47:41

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> Any chance of you doing the same comparison between 2.6.5-aa5 and
>> 2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
>> hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
>> rejects in mm/mmap.c itself, let me know if I can help out on that.)
>>
>> If -aa is okay, I hope so, then it's surely some stupidity from me.
>
> Good idea. Not sure how easy it'll be to back prio_tree out, but I can
> surely do aa5, which would give us a good clue still. Might not be until
> this time tomorrow though.

Hmm, it's halfway between the two. There does seem to be less sem
contention, though the profile ticks in __down aren't really an accurate
measure. I'll try to think of some more accurate way to instrument sems
(maybe I can do rdtsc before and after taking the sem, and then hack
the profiling code to stash the delta, indexed by the caller's
address). Meanwhile, maybe you can decipher something from the attached
and appended ...
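
A rough sketch of that rdtsc idea, purely for illustration: down() and
get_cycles() are real interfaces, but sem_wait_record() is a made-up hook
into the profiling code, and the wrapper would have to be substituted at the
call sites being measured.

#include <linux/timex.h>	/* get_cycles(), cycles_t */
#include <asm/semaphore.h>

extern void sem_wait_record(void *caller, cycles_t delta);	/* hypothetical */

static inline void down_timed(struct semaphore *sem)
{
	cycles_t t0 = get_cycles();

	down(sem);
	sem_wait_record(__builtin_return_address(0), get_cycles() - t0);
}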

M.

SDET 128 (see disclaimer)
Throughput Std. Dev
2.6.5-mjb1 100.0% 1.2%
2.6.5-mjb1-prio 73.6% 0.0%
2.6.5-aa5 86.4% 1.9%

Full profile attached (to match yesterday's).

diffprofile from mjb1+prio to -aa5
11084 102.0% find_get_page
4976 101.8% clear_page_tables
4482 690.6% schedule
2322 1745.9% pgd_alloc
2036 41.2% atomic_dec_and_lock
1816 0.0% page_add_rmap
1623 16.0% __d_lookup
1428 0.0% __set_page_dirty_buffers
1275 0.0% anon_vma_unlink
1234 25.3% path_lookup
1101 25.0% remove_shared_vm_struct
990 53.7% .text.lock.dec_and_lock
921 0.0% find_get_pages_tag
883 0.0% anon_vma_link
791 52.2% follow_mount
783 250.2% unmap_vmas
764 0.0% strnlen_user
588 32.3% file_move
572 36.8% proc_pid_stat
563 57.9% lookup_mnt
543 55.4% .text.lock.file_table
498 48.0% .text.lock.dcache
488 69.0% flush_signal_handlers
467 215.2% .text.lock.base
460 27.9% kmem_cache_free
...
-217 -100.0% __do_softirq
-220 -100.0% page_update_anon_rmap
-228 -57.6% flush_tlb_page
-230 -69.9% sys_close
-233 -100.0% direct_strncpy_from_user
-233 -7.8% copy_process
-235 -21.1% kmap_atomic
-254 -37.5% sched_clock
-257 -59.6% rb_insert_color
-318 -47.7% pte_alloc_map
-330 -99.1% find_get_pages
-332 -31.5% find_vma
-340 -43.8% __vma_prio_tree_remove
-356 -29.7% vma_link
-369 -100.0% direct_strnlen_user
-401 -100.0% page_add_anon_rmap
-470 -8.6% do_no_page
-610 -8.1% __copy_to_user_ll
-619 -100.0% radix_tree_lookup
-923 -97.4% set_page_dirty
-1133 -12.6% release_pages
-1406 -53.7% __block_prepare_write
-1553 -14.5% page_remove_rmap
-1555 -100.0% page_add_obj_rmap
-1753 -38.9% do_page_fault
-2269 -68.7% __copy_user_intel
-2959 -44.8% do_anonymous_page
-3789 -100.0% .text.lock.filemap
-7671 -53.6% __wake_up
-8214 -36.8% copy_page_range
-9948 -62.1% do_wp_page
-14120 -100.0% finish_task_switch
-27763 -55.8% __down
-64017 -12.0% default_idle
-107386 -11.9% total


Attachments:
2.6.5-aa5 (17.19 kB)

2004-04-12 18:43:34

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Mon, 12 Apr 2004, Martin J. Bligh wrote:
> >> Any chance of you doing the same comparison between 2.6.5-aa5 and
> >> 2.6.5-aa5 minus prio-tree?
>
> Hmm. it's halfway between the two. There does seem to be less sem
> contention, though the profile ticks in __down isn't really an accurate
> measure.

Thanks a lot, Martin (despite my silence, I really am listening and
pondering intermittently on this). So, -aa5 shows high __down count
too; it may not be any kind of accurate measure, but it's surely not good.

Mainly I'm concentrating on getting ready the next few patches
of the objrmap set for Andrew (not to be sent for a day or two).
Unless we see a plausible way forward on your SDET numbers, I
think it casts this project in doubt - but even so I do need
to focus or I'll just mess them up.

Seems as if prio tree has too high a cost there, yet we believe we
need it to handle the corner cases of objrmap.

What I want to investigate, when I'm far enough done, is the effect
of restoring i_shared_sem to the i_shared_lock it was before 2.5.57.
My fantasy is that your SDET would behave much more stably without
that as a semaphore, that prio tree just pushes it over your cliff.

It is easier to insert and remove vmas in the list than the tree,
and you can get away with leaving them in place quite often with
the list.

(Expect me to shout excitedly "Hey, the __down count has gone right
down, that proves I'm right!")

i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
and drop the lock if we need resched? Resuming from marker after.

But, sadly, I doubt that can be done with the prio tree: Rajesh?

Hugh
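
A minimal sketch of that marker scheme for the plain-list case, purely
illustrative and not from any posted patch: it assumes the pre-prio_tree
layout (vma->shared is a list_head chained on mapping->i_mmap), the old
i_shared_lock spinlock, and the convention that a marker can be recognised
by vm_mm == NULL (real vmas always have a non-NULL vm_mm).

#include <linux/mm.h>
#include <linux/sched.h>	/* need_resched(), cond_resched() */
#include <linux/string.h>	/* memset() */

static void walk_i_mmap_with_resched(struct address_space *mapping,
				     void (*work)(struct vm_area_struct *))
{
	struct vm_area_struct marker;
	struct list_head *pos;

	memset(&marker, 0, sizeof(marker));	/* marker.vm_mm == NULL */

	spin_lock(&mapping->i_shared_lock);
	pos = mapping->i_mmap.next;
	while (pos != &mapping->i_mmap) {
		struct vm_area_struct *vma =
			list_entry(pos, struct vm_area_struct, shared);

		if (vma->vm_mm)			/* skip other walkers' markers */
			work(vma);

		if (need_resched()) {
			/* park the marker after the current entry, drop the lock */
			list_add(&marker.shared, pos);
			spin_unlock(&mapping->i_shared_lock);
			cond_resched();
			spin_lock(&mapping->i_shared_lock);
			pos = marker.shared.next;	/* resume past the marker */
			list_del(&marker.shared);
		} else {
			pos = pos->next;
		}
	}
	spin_unlock(&mapping->i_shared_lock);
}

The marker is just a stack-local dummy vma; because it is skipped by the
vm_mm test, concurrent walkers can leave markers in the list without
confusing each other.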

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> Unless we see a plausible way forward on your SDET numbers, I
> think it casts this project in doubt - but even so I do need

We can try a few fancy locking tricks. But, we don't know whether
such tricks will help.

> i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
> in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
> i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
> and drop the lock if we need resched? Resuming from marker after.
>
> But, sadly, I doubt that can be done with the prio tree: Rajesh?

Yeap. With prio_tree it is tricky. We already have the marker for
prio_tree, i.e., prio_tree_iter. But, when you drop a lock new tree
nodes may be added to the prio_tree, and the marker does not provide
any consistent meaning after the node additions.

Rajesh

2004-04-12 19:02:14

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>> >> Any chance of you doing the same comparison between 2.6.5-aa5 and
>> >> 2.6.5-aa5 minus prio-tree?
>>
>> Hmm. it's halfway between the two. There does seem to be less sem
>> contention, though the profile ticks in __down isn't really an accurate
>> measure.
>
> Thanks a lot, Martin (despite my silence, I really am listening and
> pondering intermittently on this). So, -aa5 shows high __down count
> too, may not be any kind of accurate measure, but it's surely not good.
>
> Mainly I'm concentrating on getting ready the next few patches
> of the objrmap set for Andrew (not to be sent for a day or two).
> Unless we see a plausible way forward on your SDET numbers, I
> think it casts this project in doubt - but even so I do need
> to focus or I'll just mess them up.
>
> Seems as if prio tree has too high a cost there, yet we believe we
> need it to handle the corner cases of objrmap.

I'm still not sure those corner cases actually occur now that we have
Oracle using remap_file_pages. But Andrew mentioned some uml thing
recently - not sure if that workload affects this or not.

> What I want to investigate, when I'm far enough done, is the effect
> of restoring i_shared_sem to the i_shared_lock it was before 2.5.57.
> My fantasy is that your SDET would behave much more stably without
> that as a semaphore, that prio tree just pushes it over your cliff.

Yeah, to be honest, maybe SDET is kind of bolloxed anyway by i_shared_sem.
It needs a better fix. This probably shouldn't hold up objrmap, but it
would be nice to have a think about it ;-)

> It is easier to insert and remove vmas in the list than the tree,
> and you can get away with leaving them in place quite often with
> the list.
>
> (Expect me to shout excitedly "Hey, the __down count has gone right
> down, that proves I'm right!")
>
> i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
> in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
> i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
> and drop the lock if we need resched? Resuming from marker after.
>
> But, sadly, I doubt that can be done with the prio tree: Rajesh?

If it were just a list, maybe RCU would be appropriate. It might be
rather write-heavy though ? I think I played with an rwsem instead
of a sem in the past too (though be careful if you try this, as for
no good reason the return codes are inverted ;-()

M.


2004-04-12 19:10:48

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>
> If it were just a list, maybe RCU would be appropriate. It might be
> rather write-heavy though ? I think I played with an rwsem instead
> of a sem in the past too (though be careful if you try this, as for
> no good reason the return codes are inverted ;-()

Yes, I think all the common paths have to write, in case the
uncommon paths (truncation and swapout) want to read: the wrong
way round for any kind of read-write optimization, isn't it?

Hugh

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


On Mon, 12 Apr 2004, Hugh Dickins wrote:
> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
> >
> > If it were just a list, maybe RCU would be appropriate. It might be
> > rather write-heavy though ? I think I played with an rwsem instead
> > of a sem in the past too (though be careful if you try this, as for
> > no good reason the return codes are inverted ;-()
>
> Yes, I think all the common paths have to write, in case the
> uncommon paths (truncation and swapout) want to read: the wrong
> way round for any kind of read-write optimization, isn't it?

In common workloads e.g., add libc mapping using __vma_prio_tree_insert,
mostly you do not add new nodes to the tree. Instead, you just add to
a vm_set list. I am currently considering using rwsem to optimize
such cases. Similarly __vma_prio_tree_remove can also be optimized
in some common cases. I don't know whether it will help. Let us see...

Rajesh



2004-04-12 21:03:11

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> On Mon, 12 Apr 2004, Hugh Dickins wrote:
>> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>> >
>> > If it were just a list, maybe RCU would be appropriate. It might be
>> > rather write-heavy though ? I think I played with an rwsem instead
>> > of a sem in the past too (though be careful if you try this, as for
>> > no good reason the return codes are inverted ;-()
>>
>> Yes, I think all the common paths have to write, in case the
>> uncommon paths (truncation and swapout) want to read: the wrong
>> way round for any kind of read-write optimization, isn't it?

But isn't objrmap a big read case? ;-)

> In common workloads e.g., add libc mapping using __vma_prio_tree_insert,
> mostly you do not add new nodes to the tree. Instead, you just add to
> a vm_set list. I am currently considering using rwsem to optimize
> such cases. Similarly __vma_prio_tree_remove can also be optimized
> in some common cases. I don't know whether it will help. Let us see...

Sounds interesting ... so basically you're breaking out the locking of
the tree itself separately?

M.

PS. In the diffprofiles, I observed that Andrea had killed one of the large
remaining lock entries (.text.lock.filemap). Turns out he'd turned the
locking in find_get_page from "spin_lock(&mapping->page_lock)" into
"spin_lock_irq(&mapping->tree_lock)", and I'm using readprofile, which
doesn't profile with irqs off, so it's not really disappeared, just hidden.
Not sure which sub-patch that comes from, and it turned out to be a bit of
a dead end, but whilst I'm there, I thought I'd point out this was contended,
and show the diffprofile with and without spinline for aa5:

22210 246777.8% find_trylock_page
2538 36.4% atomic_dec_and_lock
1249 146.6% grab_block
1042 99.6% kmap_high
882 29400.0% find_get_pages
868 69.1% file_kill
744 30.9% file_move
499 236.5% proc_pid_readlink
433 82.8% d_instantiate
389 110.2% kunmap_high
319 52.4% ext2_new_block
303 27.2% d_alloc
220 44.9% prune_dcache
206 3.1% __wake_up
195 26.4% new_inode
194 71.6% d_delete
161 33.5% d_path
146 53.9% group_reserve_blocks
124 11.4% __mark_inode_dirty
117 13.9% __find_get_block_slow
116 45.7% __insert_inode_hash
113 8.3% page_address
106 5.0% proc_pid_stat
...
-216 -100.0% .text.lock.namespace
-244 -1.1% __down
-352 -100.0% .text.lock.inode
-684 -100.0% .text.lock.base
-887 -96.3% find_get_pages_tag
-1269 -100.0% .text.lock.highmem
-1523 -100.0% .text.lock.file_table
-1535 -100.0% .text.lock.dcache
-1549 -0.2% total
-2834 -100.0% .text.lock.dec_and_lock
-2915 -0.6% default_idle
-21908 -99.8% find_get_page

(SDET 128 on the 16-way NUMA-Q).
(this basically shows who was taking the locks we see in profiles).
Still not quite sure why inlining the spinlocks did this, to be honest:

22210 246777.8% find_trylock_page
-21908 -99.8% find_get_page

as neither seems to call the other. Humpf.


2004-04-12 21:10:43

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

"Martin J. Bligh" <[email protected]> wrote:
>
> Turns out he'd turned the
> locking in find_get_page from "spin_lock(&mapping->page_lock)" into
> "spin_lock_irq(&mapping->tree_lock)",

That's from the use-radix-tree-walks-for-writeback code.

Use oprofile - it's NMI-based.

> and I'm using readprofile, which
> doesn't profile with irqs off, so it's not really disappeared, just hidden.
> Not sure which sub-patch that comes from, and it turned out to be a bit of
> a dead end, but whilst I'm there, I thought I'd point out this was contended,
> and show the diffprofile with and without spinline for aa5:
>
> 22210 246777.8% find_trylock_page
> 2538 36.4% atomic_dec_and_lock

profiler brokenness, surely. Almost nothing calls find_trylock_page(),
unless Andrea has done something peculiar. Use oprofile.

2004-04-12 21:32:34

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

--On Monday, April 12, 2004 14:12:44 -0700 Andrew Morton <[email protected]> wrote:

> "Martin J. Bligh" <[email protected]> wrote:
>>
>> Turns out he'd turned the
>> locking in find_get_page from "spin_lock(&mapping->page_lock)" into
>> "spin_lock_irq(&mapping->tree_lock)",
>
> That's from the use-radix-tree-walks-for-writeback code.
>
> Use oprofile - it's NMI-based.
>
>> and I'm using readprofile, which
>> doesn't profile with irqs off, so it's not really disappeared, just hidden.
>> Not sure which sub-patch that comes from, and it turned out to be a bit of
>> a dead end, but whilst I'm there, I thought I'd point out this was contended,
>> and show the diffprofile with and without spinline for aa5:
>>
>> 22210 246777.8% find_trylock_page
>> 2538 36.4% atomic_dec_and_lock
>
> profiler brokenness, surely. Almost nothing calls find_trylock_page(),
> unless Andrea has done something peculiar. Use oprofile.

Well, he did do this:

@@ -413,11 +412,11 @@ struct page *find_trylock_page(struct ad
{
struct page *page;

- spin_lock(&mapping->page_lock);
+ spin_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page && TestSetPageLocked(page))
page = NULL;
- spin_unlock(&mapping->page_lock);
+ spin_unlock_irq(&mapping->tree_lock);
return page;
}

Which would stop it appearing in readprofile. But why spinlock inlining
should affect that one way or the other is beyond me. I'll see about
using oprofile, but it's not a trivial conversion (it's all scripted).
There are no other occurrences of that in his patchset. But you're right,
only xfs and free_swap_and_cache seem to use it, and I'm not swapping.

M.

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


This patch is another attempt at reducing the contention on i_shared_sem.
The patch converts i_shared_sem from normal semaphore to read-write
semaphore. The locking rules used are:

1) A prio_tree cannot be modified without holding write lock.
2) However, vmas can be added and removed from a vm_set list
by just holding the read lock and a bit lock (vm_set_lock)
in the corresponding prio_tree node.
3) All objrmap functions just hold read lock now. So when we
walk a vm_set list we have to hold the corresponding
vm_set_lock.
4) Since truncate uses write lock (provides exclusion) we don't
have to take vm_set_locks.

Martin! When you get time to test your SDET with this patch, please
let me know whether this patch helps you at all. The patch applies
on top of 2.6.5-mjb1+anobjrmap9_prio_tree.


fs/hugetlbfs/inode.c | 4 -
fs/inode.c | 2
include/linux/fs.h | 2
include/linux/mm.h | 127 ++++++++++++++++++++++++++++++++++++++++++++--
include/linux/prio_tree.h | 3 +
kernel/fork.c | 34 +++++++++++-
mm/fremap.c | 4 -
mm/memory.c | 4 -
mm/mmap.c | 120 ++++++++++++++++++++++++++++++++-----------
mm/mremap.c | 8 +-
mm/prio_tree.c | 117 ++++++++++++++++++++++++++++--------------
mm/rmap.c | 46 ++++++++++------
12 files changed, 365 insertions(+), 106 deletions(-)

diff -puN fs/hugetlbfs/inode.c~110_sem_contention fs/hugetlbfs/inode.c
--- mmlinux-2.6/fs/hugetlbfs/inode.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/fs/hugetlbfs/inode.c 2004-04-14 15:49:01.000000000 -0400
@@ -325,14 +325,14 @@ static int hugetlb_vmtruncate(struct ino
pgoff = offset >> HPAGE_SHIFT;

inode->i_size = offset;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
truncate_hugepages(mapping, offset);
return 0;
}
diff -puN fs/inode.c~110_sem_contention fs/inode.c
--- mmlinux-2.6/fs/inode.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/fs/inode.c 2004-04-14 15:49:01.000000000 -0400
@@ -185,7 +185,7 @@ void inode_init_once(struct inode *inode
sema_init(&inode->i_sem, 1);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.page_lock);
- init_MUTEX(&inode->i_data.i_shared_sem);
+ init_rwsem(&inode->i_data.i_shared_sem);
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
diff -puN include/linux/fs.h~110_sem_contention include/linux/fs.h
--- mmlinux-2.6/include/linux/fs.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/fs.h 2004-04-14 15:49:01.000000000 -0400
@@ -333,7 +333,7 @@ struct address_space {
struct prio_tree_root i_mmap; /* tree of private mappings */
struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
- struct semaphore i_shared_sem; /* protect both above lists */
+ struct rw_semaphore i_shared_sem; /* protect both above lists */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN include/linux/mm.h~110_sem_contention include/linux/mm.h
--- mmlinux-2.6/include/linux/mm.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/mm.h 2004-04-14 15:49:01.000000000 -0400
@@ -87,6 +87,10 @@ struct vm_area_struct {
/*
* shared.vm_set : list of vmas that map exactly the same set of pages
* vm_set_head : head of the vm_set list
+ *
+ * Both shared.vm_set.list and vm_set_head are protected by VM_SET_LOCK
+ * bit of the corresponding tree node's vm_flags when accessed under
+ * down_read(i_shared_sem)
*/
struct vm_area_struct *vm_set_head;

@@ -133,6 +137,8 @@ struct vm_area_struct {
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */

+#define VM_SET_LOCK 24 /* Lock bit for vm_set list, head */
+
/* It makes sense to apply VM_ACCOUNT to this vma. */
#define VM_MAYACCT(vma) (!!((vma)->vm_flags & VM_HUGETLB))

@@ -156,6 +162,13 @@ struct vm_area_struct {
* The following macros are used for implementing prio_tree for i_mmap{_shared}
*/

+#define vm_set_lock(vma) bit_spin_lock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+#define vm_set_trylock(vma) bit_spin_trylock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+#define vm_set_unlock(vma) bit_spin_unlock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+
#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
/* avoid overflow */
@@ -202,7 +215,8 @@ static inline int vma_shared_empty(struc

/*
* Helps to add a new vma that maps the same (identical) set of pages as the
- * old vma to an i_mmap tree.
+ * old vma to an i_mmap tree. No new tree node is added by this function.
+ * The new vma is added to an already existing tree node's vm_set list.
*/
static inline void __vma_prio_tree_add(struct vm_area_struct *vma,
struct vm_area_struct *old)
@@ -229,6 +243,37 @@ static inline void __vma_prio_tree_add(s
}

/*
+ * Delete a vm_set list node from an i_mmap tree. Note that this function
+ * should not be called with a tree node, i.e., shared.both.parent != NULL.
+ */
+static inline void __vma_prio_tree_del(struct vm_area_struct *vma)
+{
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->shared.both.parent);
+
+ if (vma->vm_set_head) {
+ struct vm_area_struct *tree_node, *new_head;
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->vm_set_head->vm_set_head != vma);
+ tree_node = vma->vm_set_head;
+ if (!list_empty(&vma->shared.vm_set.list)) {
+ new_head = list_entry(
+ vma->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&vma->shared.vm_set.list);
+ tree_node->vm_set_head = new_head;
+ new_head->vm_set_head = tree_node;
+ }
+ else
+ tree_node->vm_set_head = NULL;
+ } else
+ list_del_init(&vma->shared.vm_set.list);
+
+ INIT_VMA_SHARED(vma);
+}
+
+/*
* We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
* already present in an i_mmap{_shared} tree without modifying the tree. The
* following helper function should be used when such modifications are
@@ -250,6 +295,25 @@ static inline void __vma_modify(struct p
}

/*
+ * Find a vma with given radix_index and heap_index in the prio_tree. Return
+ * the vma pointer if found, NULL otherwise.
+ */
+static inline struct vm_area_struct *__vma_prio_tree_find(
+ struct prio_tree_root *root, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ struct prio_tree_node *ptr;
+
+ ptr = prio_tree_find(root, radix_index, heap_index);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+/*
* Helper functions to enumerate vmas that map a given file page or a set of
* contiguous file pages. The functions return vmas that at least map a single
* page in the given range of contiguous file pages.
@@ -269,11 +333,19 @@ static inline struct vm_area_struct *__v
return NULL;
}

-static inline struct vm_area_struct *__vma_prio_tree_next(
- struct vm_area_struct *vma, struct prio_tree_root *root,
- struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+static inline struct vm_area_struct *__vma_prio_tree_first_lock(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long begin, unsigned long end)
+{
+ struct vm_area_struct *vma;
+ vma = __vma_prio_tree_first(root, iter, begin, end);
+ if (vma)
+ vm_set_lock(vma);
+ return vma;
+}
+
+static inline struct vm_area_struct *__vm_set_next(struct vm_area_struct *vma)
{
- struct prio_tree_node *ptr;
struct vm_area_struct *next;

if (vma->shared.both.parent) {
@@ -286,6 +358,19 @@ static inline struct vm_area_struct *__v
if (!(next->vm_set_head))
return next;
}
+ return NULL;
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ next = __vm_set_next(vma);
+ if (next)
+ return next;

ptr = prio_tree_next(root, iter, begin, end);

@@ -296,6 +381,38 @@ static inline struct vm_area_struct *__v
return NULL;
}

+static inline void __vma_prio_tree_iter_unlock(struct prio_tree_iter *iter)
+{
+ struct vm_area_struct *vma;
+ vma = prio_tree_entry(iter->cur, struct vm_area_struct,
+ shared.prio_tree_node);
+ vm_set_unlock(vma);
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next_lock(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ next = __vm_set_next(vma);
+ if (next)
+ return next;
+
+ __vma_prio_tree_iter_unlock(iter);
+ ptr = prio_tree_next(root, iter, begin, end);
+
+ if (ptr) {
+ next = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ vm_set_lock(next);
+ return next;
+ } else
+ return NULL;
+}
+
+
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
diff -puN include/linux/prio_tree.h~110_sem_contention include/linux/prio_tree.h
--- mmlinux-2.6/include/linux/prio_tree.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/prio_tree.h 2004-04-14 15:49:01.000000000 -0400
@@ -67,6 +67,9 @@ static inline int prio_tree_right_empty(
extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *,
struct prio_tree_node *);

+extern struct prio_tree_node *prio_tree_find(struct prio_tree_root *,
+ unsigned long, unsigned long);
+
extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);

extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *,
diff -puN kernel/fork.c~110_sem_contention kernel/fork.c
--- mmlinux-2.6/kernel/fork.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/kernel/fork.c 2004-04-14 15:49:01.000000000 -0400
@@ -326,15 +326,43 @@ static inline int dup_mmap(struct mm_str
file = tmp->vm_file;
INIT_VMA_SHARED(tmp);
if (file) {
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_dentry->d_inode;
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);

/* insert tmp into the share list, just after mpnt */
- down(&file->f_mapping->i_shared_sem);
- __vma_prio_tree_add(tmp, mpnt);
- up(&file->f_mapping->i_shared_sem);
+ if (down_write_trylock(&mapping->i_shared_sem)) {
+ __vma_prio_tree_add(tmp, mpnt);
+ up_write(&mapping->i_shared_sem);
+ }
+ else {
+ if (unlikely(mpnt->vm_flags & VM_NONLINEAR)) {
+ down_write(&mapping->i_shared_sem);
+ list_add(&tmp->shared.vm_set.list,
+ &mpnt->shared.vm_set.list);
+ up_write(&mapping->i_shared_sem);
+ }
+ else {
+ struct vm_area_struct *tree_node;
+ struct prio_tree_root *root;
+ if (mpnt->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;
+
+ down_read(&mapping->i_shared_sem);
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(mpnt),
+ HEAP_INDEX(mpnt));
+ BUG_ON(!tree_node);
+ vm_set_lock(tree_node);
+ __vma_prio_tree_add(tmp, mpnt);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ }
+ }
}

/*
diff -puN mm/fremap.c~110_sem_contention mm/fremap.c
--- mmlinux-2.6/mm/fremap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/fremap.c 2004-04-14 15:49:01.000000000 -0400
@@ -203,13 +203,13 @@ asmlinkage long sys_remap_file_pages(uns
linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT);
if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) {
mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
vma->vm_flags |= VM_NONLINEAR;
__vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
INIT_VMA_SHARED_LIST(vma);
list_add_tail(&vma->shared.vm_set.list,
&mapping->i_mmap_nonlinear);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
}

/* ->populate can take a long time, so downgrade the lock. */
diff -puN mm/memory.c~110_sem_contention mm/memory.c
--- mmlinux-2.6/mm/memory.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/memory.c 2004-04-14 15:49:01.000000000 -0400
@@ -1133,14 +1133,14 @@ void invalidate_mmap_range(struct addres
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
}
EXPORT_SYMBOL_GPL(invalidate_mmap_range);

diff -puN mm/mmap.c~110_sem_contention mm/mmap.c
--- mmlinux-2.6/mm/mmap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mmap.c 2004-04-14 15:49:01.000000000 -0400
@@ -96,10 +96,43 @@ static void remove_shared_vm_struct(stru

if (file) {
struct address_space *mapping = file->f_mapping;
- down(&mapping->i_shared_sem);
- __remove_shared_vm_struct(vma, file->f_dentry->d_inode,
- mapping);
- up(&mapping->i_shared_sem);
+ struct inode *inode = file->f_dentry->d_inode;
+
+ if (down_write_trylock(&mapping->i_shared_sem)) {
+ __remove_shared_vm_struct(vma, inode, mapping);
+ up_write(&mapping->i_shared_sem);
+ return;
+ }
+
+ if (likely(!(vma->vm_flags & VM_NONLINEAR) &&
+ !vma->shared.both.parent)) {
+ struct prio_tree_root *root;
+ struct vm_area_struct *tree_node;
+ if (vma->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;
+
+ down_read(&mapping->i_shared_sem);
+ if (unlikely(vma->shared.both.parent)) {
+ up_read(&mapping->i_shared_sem);
+ goto get_write;
+ }
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(vma), HEAP_INDEX(vma));
+ BUG_ON(!tree_node);
+ vm_set_lock(tree_node);
+ if (vma->vm_flags & VM_DENYWRITE)
+ atomic_inc(&inode->i_writecount);
+ __vma_prio_tree_del(vma);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ return;
+ }
+get_write:
+ down_write(&mapping->i_shared_sem);
+ __remove_shared_vm_struct(vma, inode, mapping);
+ up_write(&mapping->i_shared_sem);
}
}

@@ -291,26 +324,58 @@ __vma_link(struct mm_struct *mm, struct
{
__vma_link_list(mm, vma, prev, rb_parent);
__vma_link_rb(mm, vma, rb_link, rb_parent);
- __vma_link_file(vma);
}

static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
+ struct prio_tree_root *root;
+ struct vm_area_struct *tree_node;
struct address_space *mapping = NULL;

if (vma->vm_file)
mapping = vma->vm_file->f_mapping;

- if (mapping)
- down(&mapping->i_shared_sem);
- spin_lock(&mm->page_table_lock);
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ if (mapping) {
+ if (unlikely(vma->vm_flags & VM_NONLINEAR))
+ goto get_write;
+ if (vma->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;

+ down_read(&mapping->i_shared_sem);
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(vma), HEAP_INDEX(vma));
+ if (tree_node) {
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ if (vma->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ spin_unlock(&mm->page_table_lock);
+ vm_set_lock(tree_node);
+ __vma_prio_tree_add(vma, tree_node);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ }
+ else {
+ up_read(&mapping->i_shared_sem);
+get_write:
+ down_write(&mapping->i_shared_sem);
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link_file(vma);
+ spin_unlock(&mm->page_table_lock);
+ up_write(&mapping->i_shared_sem);
+ }
+ }
+ else {
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ spin_unlock(&mm->page_table_lock);
+ }
mark_mm_hugetlb(mm, vma);
mm->map_count++;
validate_mm(mm);
@@ -331,6 +396,7 @@ __insert_vm_struct(struct mm_struct * mm
if (__vma && __vma->vm_start < vma->vm_end)
BUG();
__vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link_file(vma);
mark_mm_hugetlb(mm, vma);
mm->map_count++;
validate_mm(mm);
@@ -410,7 +476,7 @@ static struct vm_area_struct *vma_merge(
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
struct address_space *mapping = file ? file->f_mapping : NULL;
- struct semaphore *i_shared_sem;
+ struct rw_semaphore *i_shared_sem;
struct prio_tree_root *root = NULL;

/*
@@ -442,13 +508,9 @@ static struct vm_area_struct *vma_merge(
if (prev->vm_end == addr &&
can_vma_merge_after(prev, vm_flags, file, pgoff)) {
struct vm_area_struct *next;
- int need_up = 0;

- if (unlikely(file && prev->vm_next &&
- prev->vm_next->vm_file == file)) {
- down(i_shared_sem);
- need_up = 1;
- }
+ if (file)
+ down_write(i_shared_sem);
spin_lock(lock);

/*
@@ -463,20 +525,18 @@ static struct vm_area_struct *vma_merge(
next->vm_end, prev->vm_pgoff);
__remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
- if (file)
+ if (file) {
+ up_write(i_shared_sem);
fput(file);
-
+ }
mm->map_count--;
kmem_cache_free(vm_area_cachep, next);
return prev;
}
-
__vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ if (file)
+ up_write(i_shared_sem);
return prev;
}

@@ -491,13 +551,13 @@ static struct vm_area_struct *vma_merge(
return NULL;
if (end == prev->vm_start) {
if (file)
- down(i_shared_sem);
+ down_write(i_shared_sem);
spin_lock(lock);
__vma_modify(root, prev, addr, prev->vm_end,
prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
if (file)
- up(i_shared_sem);
+ up_write(i_shared_sem);
return prev;
}
}
@@ -1362,7 +1422,7 @@ int split_vma(struct mm_struct * mm, str
}

if (mapping)
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);

if (new_below)
@@ -1375,7 +1435,7 @@ int split_vma(struct mm_struct * mm, str

spin_unlock(&mm->page_table_lock);
if (mapping)
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);

return 0;
}
diff -puN mm/mremap.c~110_sem_contention mm/mremap.c
--- mmlinux-2.6/mm/mremap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mremap.c 2004-04-14 15:49:01.000000000 -0400
@@ -295,7 +295,7 @@ static unsigned long move_vma(struct vm_
* and we propagate stale pages into the dst afterward.
*/
mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
+ down_read(&mapping->i_shared_sem);
}
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
@@ -311,7 +311,7 @@ static unsigned long move_vma(struct vm_
new_addr = -ENOMEM;
}
if (mapping)
- up(&mapping->i_shared_sem);
+ up_read(&mapping->i_shared_sem);

/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -476,7 +476,7 @@ unsigned long do_mremap(unsigned long ad
}
else
root = &mapping->i_mmap;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
}

spin_lock(&vma->vm_mm->page_table_lock);
@@ -485,7 +485,7 @@ unsigned long do_mremap(unsigned long ad
spin_unlock(&vma->vm_mm->page_table_lock);

if(mapping)
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);

current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
diff -puN mm/prio_tree.c~110_sem_contention mm/prio_tree.c
--- mmlinux-2.6/mm/prio_tree.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/prio_tree.c 2004-04-14 15:49:01.000000000 -0400
@@ -279,6 +279,66 @@ void prio_tree_remove(struct prio_tree_r
}

/*
+ * Find a prio_tree node with the given radix_index and heap_index. This
+ * algorithm takes O(log n) time. At most 64 (less than 32 in common case)
+ * nodes are visited in a 32 bit machine.
+ */
+struct prio_tree_node *prio_tree_find(struct prio_tree_root *root,
+ unsigned long radix_index, unsigned long heap_index)
+{
+ struct prio_tree_node *cur;
+ unsigned long r_index, h_index, index, mask;
+ int size_flag = 0;
+
+ if (prio_tree_empty(root) ||
+ heap_index > prio_tree_maxindex(root->index_bits))
+ return NULL;
+
+ cur = root->prio_tree_node;
+ mask = 1UL << (root->index_bits - 1);
+
+ while (mask) {
+ GET_INDEX(cur, r_index, h_index);
+
+ if (r_index == radix_index && h_index == heap_index)
+ return cur;
+
+ if (h_index < heap_index || (h_index == heap_index &&
+ r_index > radix_index))
+ return NULL;
+
+ if (size_flag)
+ index = heap_index - radix_index;
+ else
+ index = radix_index;
+
+ if (index & mask) {
+ if (prio_tree_right_empty(cur))
+ return NULL;
+ else
+ cur = cur->right;
+ }
+ else {
+ if (prio_tree_left_empty(cur))
+ return NULL;
+ else
+ cur = cur->left;
+ }
+
+ mask >>= 1;
+
+ if (!mask) {
+ mask = 1UL << (root->index_bits - 1);
+ size_flag = 1;
+ }
+ }
+ /* Should not reach here */
+ BUG();
+ return NULL;
+}
+
+
+/*
* Following functions help to enumerate all prio_tree_nodes in the tree that
* overlap with the input interval X [radix_index, heap_index]. The enumeration
* takes O(log n + m) time where 'log n' is the height of the tree (which is
@@ -529,55 +589,34 @@ void __vma_prio_tree_insert(struct prio_
void __vma_prio_tree_remove(struct prio_tree_root *root,
struct vm_area_struct *vma)
{
- struct vm_area_struct *node, *head, *new_head;
+ struct vm_area_struct *head, *new_head;

- if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) {
- list_del_init(&vma->shared.vm_set.list);
- INIT_VMA_SHARED(vma);
+ if (!vma->shared.both.parent) {
+ __vma_prio_tree_del(vma);
return;
}

if (vma->vm_set_head) {
/* Leave this BUG_ON till prio_tree patch stabilizes */
BUG_ON(vma->vm_set_head->vm_set_head != vma);
- if (vma->shared.both.parent) {
- head = vma->vm_set_head;
- if (!list_empty(&head->shared.vm_set.list)) {
- new_head = list_entry(
- head->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&head->shared.vm_set.list);
- }
- else
- new_head = NULL;
+ head = vma->vm_set_head;
+ if (!list_empty(&head->shared.vm_set.list)) {
+ new_head = list_entry(head->shared.vm_set.list.next,
+ struct vm_area_struct, shared.vm_set.list);
+ list_del_init(&head->shared.vm_set.list);
+ }
+ else
+ new_head = NULL;

- prio_tree_replace(root, &vma->shared.prio_tree_node,
- &head->shared.prio_tree_node);
- head->vm_set_head = new_head;
- if (new_head)
- new_head->vm_set_head = head;
+ prio_tree_replace(root, &vma->shared.prio_tree_node,
+ &head->shared.prio_tree_node);
+ head->vm_set_head = new_head;
+ if (new_head)
+ new_head->vm_set_head = head;

- }
- else {
- node = vma->vm_set_head;
- if (!list_empty(&vma->shared.vm_set.list)) {
- new_head = list_entry(
- vma->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&vma->shared.vm_set.list);
- node->vm_set_head = new_head;
- new_head->vm_set_head = node;
- }
- else
- node->vm_set_head = NULL;
- }
- INIT_VMA_SHARED(vma);
- return;
- }
+ } else
+ prio_tree_remove(root, &vma->shared.prio_tree_node);

- prio_tree_remove(root, &vma->shared.prio_tree_node);
INIT_VMA_SHARED(vma);
}

diff -puN mm/rmap.c~110_sem_contention mm/rmap.c
--- mmlinux-2.6/mm/rmap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/rmap.c 2004-04-14 15:49:01.000000000 -0400
@@ -279,10 +279,10 @@ static inline int page_referenced_obj(st
unsigned long address;
int referenced = 0;

- if (down_trylock(&mapping->i_shared_sem))
+ if (!down_read_trylock(&mapping->i_shared_sem))
return 0;

- vma = __vma_prio_tree_first(&mapping->i_mmap,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
@@ -297,11 +297,11 @@ static inline int page_referenced_obj(st
if (!*mapcount)
goto out;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap,
&iter, pgoff, pgoff);
}

- vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
@@ -315,14 +315,17 @@ static inline int page_referenced_obj(st
if (!*mapcount)
goto out;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap_shared,
&iter, pgoff, pgoff);
}
-
+
if (list_empty(&mapping->i_mmap_nonlinear))
- WARN_ON(*mapcount > 0);
+ WARN_ON(*mapcount > 0);
+ up_read(&mapping->i_shared_sem);
+ return referenced;
out:
- up(&mapping->i_shared_sem);
+ __vma_prio_tree_iter_unlock(&iter);
+ up_read(&mapping->i_shared_sem);
return referenced;
}

@@ -711,10 +714,10 @@ static inline int try_to_unmap_obj(struc
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;

- if (down_trylock(&mapping->i_shared_sem))
+ if (!down_read_trylock(&mapping->i_shared_sem))
return ret;

- vma = __vma_prio_tree_first(&mapping->i_mmap,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_mm->rss) {
@@ -722,13 +725,13 @@ static inline int try_to_unmap_obj(struc
ret = try_to_unmap_one(
page, vma->vm_mm, address, mapcount, vma);
if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ goto out_read;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap,
&iter, pgoff, pgoff);
}

- vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_mm->rss) {
@@ -736,14 +739,18 @@ static inline int try_to_unmap_obj(struc
ret = try_to_unmap_one(
page, vma->vm_mm, address, mapcount, vma);
if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ goto out_read;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap_shared,
&iter, pgoff, pgoff);
}

if (list_empty(&mapping->i_mmap_nonlinear))
- goto out;
+ goto nolock;
+
+ up_read(&mapping->i_shared_sem);
+ if (!down_write_trylock(&mapping->i_shared_sem))
+ return ret;

list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
@@ -813,7 +820,12 @@ static inline int try_to_unmap_obj(struc
relock:
rmap_lock(page);
out:
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
+ return ret;
+out_read:
+ __vma_prio_tree_iter_unlock(&iter);
+nolock:
+ up_read(&mapping->i_shared_sem);
return ret;
}


_

2004-04-15 00:05:34

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Wed, Apr 14, 2004 at 04:18:38PM -0400, Rajesh Venkatasubramanian wrote:
>
> This patch is another attempt at reducing the contention on i_shared_sem.
> The patch converts i_shared_sem from normal semaphore to read-write
> semaphore. The locking rules used are:
>
> 1) A prio_tree cannot be modified without holding write lock.
> 2) However, vmas can be added and removed from a vm_set list
> by just holding the read lock and a bit lock (vm_set_lock)
> in the corresponding prio_tree node.

no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
patch seems very broken to me, it should randomly corrupt memory in
vma->vm_flags while racing against mprotect etc.. or am I missing
something?

> 3) All objrmap functions just hold read lock now. So when we
> walk a vm_set list we have to hold the corresponding
> vm_set_lock.
> 4) Since truncate uses write lock (provides exclusion) we don't
> have to take vm_set_locks.
>
> Martin! When you get time to test your SDET with this patch, please
> let me know whether this patch helps you at all. The patch applies
> on top of 2.6.5-mjb1+anobjrmap9_prio_tree.

I considered converting it to a rwsem too; details are in the email
I posted while providing the rwspinlock solution to the parisc cache
flushing code.

As I wrote there, I wasn't convinced in the common case this is going to
gain anything significant (the only thing that sleeps while the
semaphore is held is truncate, and truncate during paging on the same
inode isn't an extremely common case, especially for the big apps), and
it makes it a bit more complicated, but giving it a try will be
interesting. I was mostly interested about having the objrmap code very
rarely failing the trylock during paging (that semaphore is by far the
biggest scalability hit during paging of shm, but the cacheline bouncing
won't be avoided by the rwsem). To make the paging scale better
(something SDET cannot measure) I don't need a safe vm_set_lock, I
believe simply making it a rwsem is the way to go just to make the
paging potentially scale a bit better. I rated implementing the locking
abstraction to fixup the basic parisc race as a bit higher prio, after
that it should be easy to have it implementing a rwsem for all archs w/o
cache flushing, the abstraction will have to expose a read/write
functionality for the rwlock. I'm not convinced your double locking is
going to boost anything even if it would be safe, I'd just take it in
write mode when the tree is being modified, with the only object of
avoiding the paging to block (and potentially to avoid blocking against
big truncates too).

2004-04-15 00:11:44

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> Martin! When you get time to test your SDET with this patch, please
>> let me know whether this patch helps you at all. The patch applies
>> on top of 2.6.5-mjb1+anobjrmap9_prio_tree.
>
> I considered converting it to a rwsem too; details are in the email
> I posted while providing the rwspinlock solution to the parisc cache
> flushing code.

I will try it, but I'm not convinced it'll help. I profiled the takers
of i_shared_sem, and I think they're all writers (and I tried rwsem on
the simple linked list before with no benefit).

M.

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> > 2) However, vmas can be added and removed from a vm_set list
> > by just holding the read lock and a bit lock (vm_set_lock)
> > in the corresponding prio_tree node.
>
> no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
> patch seems very broken to me, it should randomly corrupt memory in
> vma->vm_flags while racing against mprotect etc.. or am I missing
> something?

I don't know why bit_spin_lock with vma->vm_flags should be a problem
if it is used without mmap_sem. Can you explain ?

Anyway, in my patch, the only places that use vm_set_lock without
mmap_sem are __vma_prio_tree_first_lock and __vma_prio_tree_next_lock.
If it is really racy to use bit_spin_lock on vm_flags without mmap_sem
(I am not sure and I am not convinced that it is racy), then we can
revert these changes and take down_write on the page out path.

Well. In that case, we can use rwsem as you mentioned below: take
down_write on all modifications and take down_read on pageout. Here, you
allow multiple parallel page_referenced and try_to_unmap on the same
inode, however with only one modification at a time.

Whereas my solution will allow multiple modifications at the same
time (if possible) with only one pageout routine at a time. I chose
this solution because Martin's SDET took a big hit in the common cases of
adding and removing vmas from the i_mmap{_shared} data structure.

Thanks,
Rajesh

>
> > 3) All objrmap functions just hold read lock now. So when we
> > walk a vm_set list we have to hold the corresponding
> > vm_set_lock.
> > 4) Since truncate uses write lock (provides exclusion) we don't
> > have to take vm_set_locks.
> >
> > Martin! When you get time to test your SDET with this patch, please
> > let me know whether this patch helps you at all. The patch applies
> > on top of 2.6.5-mjb1+anobjrmap9_prio_tree.
>
> I considered converting it to a rwsem too; details are in the email
> I posted while providing the rwspinlock solution to the parisc cache
> flushing code.
>
> As I wrote there, I wasn't convinced in the common case this is going to
> gain anything significant (the only thing that sleeps while the
> semaphore is held is truncate, and truncate during paging on the same
> inode isn't an extremely common case, especially for the big apps), and
> it makes it a bit more complicated, but giving it a try will be
> interesting. I was mostly interested about having the objrmap code very
> rarely failing the trylock during paging (that semaphore is by far the
> biggest scalability hit during paging of shm, but the cacheline bouncing
> won't be avoided by the rwsem). To make the paging scale better
> (something SDET cannot measure) I don't need a safe vm_set_lock, I
> believe simply making it a rwsem is the way to go just to make the
> paging potentially scale a bit better. I rated implementing the locking
> abstraction to fixup the basic parisc race as a bit higher prio, after
> that it should be easy to have it implementing a rwsem for all archs w/o
> cache flushing, the abstraction will have to expose a read/write
> functionality for the rwlock. I'm not convinced your double locking is
> going to boost anything even if it would be safe, I'd just take it in
> write mode when the tree is being modified, with the only object of
> avoiding the paging to block (and potentially to avoid blocking against
> big truncates too).
>

2004-04-15 06:23:33

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

> Wherease my solution will allow multiple modifications at the same
> time (if possible) with only one pageout routine at a time. I chose
> this solution because Martin's SDET took big hit in common cases of
> adding and removing vmas from the i_mmap{_shared} data structure.

FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
into a spinlock. I'll try doing the same on top of prio-tree next.

M.

2004-04-15 10:26:16

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Wed, 14 Apr 2004, Martin J. Bligh wrote:
>
> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> into a spinlock. I'll try doing the same on top of prio-tree next.

Good news, though not a surprise.

Any ideas how we might handle latency from vmtruncate (and
try_to_unmap) if using prio_tree with i_shared_lock spinlock?

Hugh

2004-04-15 12:52:35

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, Apr 15, 2004 at 11:26:09AM +0100, Hugh Dickins wrote:
> On Wed, 14 Apr 2004, Martin J. Bligh wrote:
> >
> > FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> > into a spinlock. I'll try doing the same on top of prio-tree next.
>
> Good news, though not a surprise.
>
> Any ideas how we might handle latency from vmtruncate (and
> try_to_unmap) if using prio_tree with i_shared_lock spinlock?

we'd need to break the loop after need_resched returns 1 (and then the
second time through we'd just screw the latency and go ahead). I also wanted
to make it a spinlock again like in 2.4; the semaphore probably generates
overscheduling. OTOH the semaphore saved some cpu in slightly different
workloads with big truncates (plus it made the cond_resched trivial w/o
requiring a loop break), and I agree with Andrew about that: Martin isn't
benchmarking the other side, the one that made Andrew change it to a
semaphore.
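
A rough sketch of that break-once scheme with a spinlock - struct trunc_item
and its "done" flag are hypothetical stand-ins for the per-vma truncate
work, not code from any posted patch: drop the lock and restart the walk
the first time need_resched() fires, and on the second pass just finish
regardless of latency.

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/sched.h>

struct trunc_item {
	struct list_head list;
	int done;			/* this vma has already been handled */
};

static void truncate_all(spinlock_t *lock, struct list_head *head)
{
	struct trunc_item *item;
	int restarted = 0;

restart:
	spin_lock(lock);
	list_for_each_entry(item, head, list) {
		if (item->done)
			continue;	/* restarting is cheap: finished work is skipped */
		item->done = 1;		/* stands in for unmapping one vma */

		if (!restarted && need_resched()) {
			spin_unlock(lock);
			cond_resched();
			restarted = 1;	/* second time through: ignore latency */
			goto restart;
		}
	}
	spin_unlock(lock);
}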

2004-04-15 13:00:33

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Wed, Apr 14, 2004 at 11:40:52PM -0400, Rajesh Venkatasubramanian wrote:
>
> > > 2) However, vmas can be added and removed from a vm_set list
> > > by just holding the read lock and a bit lock (vm_set_lock)
> > > in the corresponding prio_tree node.
> >
> > no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
> > patch seems very broken to me, it should randomly corrupt memory in
> > vma->vm_flags while racing against mprotect etc.. or am I missing
> > something?
>
> I don't know why bit_spin_lock with vma->vm_flags should be a problem
> if it is used without mmap_sem. Can you explain ?

you seem not to know all rules about the atomic operations in smp, you
cannot just set_bit on one side and use non-atomic operations on the
other side, and expect the set_bit not to invalidate the non-atomic
operations.

The effect of the mprotect may be deleted by your new concurrent
set_bit and stuff like that.
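
For anyone who wants to see the hazard outside the kernel, here is a
standalone userspace demo in plain C with pthreads (nothing here is kernel
code; build with gcc -O2 -pthread). One thread does plain read-modify-writes
on a flags word, the way mprotect rewrites vm_flags under mmap_sem, while
the other atomically sets and clears a high bit in the same word, the way
the proposed vm_set_lock would. On an SMP machine the plain RMW regularly
loads the word just before the atomic set and stores it back just after,
wiping the bit out, so the "lost" count is typically nonzero:

#include <pthread.h>
#include <stdio.h>

#define LOCKBIT	(1UL << 24)
#define ITERS	(10 * 1000 * 1000)

static volatile unsigned long flags;

static void *plain_rmw(void *arg)
{
	/* stands in for mprotect: non-atomic update of the low bits */
	for (int i = 0; i < ITERS; i++)
		flags = (flags & ~0x7UL) | (unsigned long)(i & 0x7);
	return NULL;
}

static void *atomic_bit(void *arg)
{
	long lost = 0;

	/* stands in for bit_spin_lock/unlock on vm_flags */
	for (int i = 0; i < ITERS; i++) {
		__sync_fetch_and_or(&flags, LOCKBIT);
		if (!(flags & LOCKBIT))
			lost++;		/* a stale plain store wiped our bit */
		__sync_fetch_and_and(&flags, ~LOCKBIT);
	}
	printf("lock bit lost %ld times\n", lost);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, plain_rmw, NULL);
	pthread_create(&b, NULL, atomic_bit, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}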

> If it is really racy to use bit_spin_lock on vm_flags without mmap_sem

it __definitely__ is racy.

> Well. In that case, we can use rwsem as you mentioned below: take
> down_write on all modifications and take down_read on pageout. Here, you

exactly, this also avoids the more complex (and racy, but the racy would
be easy to fix by adding another 4 atomic bytes to the vma) double
locking that you introduced.

> allow multiple parallel page_referenced and try_to_unmap on the same
> inode, however with only one modification at a time.

exactly.

> Whereas my solution will allow multiple modifications at the same
> time (if possible) with only one pageout routine at a time. I chose
> this solution because Martin's SDET took a big hit in the common cases of
> adding and removing vmas from the i_mmap{_shared} data structure.

you can still fix the SMP race condition by trivially adding 4 bytes to
the vma (i.e. a vma->vm_flags_atomic), but I'd be surprised if your
double locking actually improved things. Martin is running on a very
parallel old-NUMA box where cacheline bouncing across nodes passes through
a fibre channel IIRC, and the cacheline bouncing that the semaphore
generates is huge; it's not necessarily huge contention, it's just the
bouncing that hurts, and the down_read won't help at all with the
cacheline thrashing, it'll still bounce like before. You may gain
something minor, but I doubt it'd be huge.

I'd suggest Martin give the racy code a try; it's just good enough
for a practical experiment (the race shouldn't easily trigger, so it
probably passes one run of SDET safely).

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> > I don't know why bit_spin_lock with vma->vm_flags should be a problem
> > if it is used without mmap_sem. Can you explain ?
>
> you seem not to know all rules about the atomic operations in smp, you
> cannot just set_bit on one side and use non-atomic operations on the
> other side, and expect the set_bit not to invalidate the non-atomic
> operations.
>
> The effect of the mprotect may be deleted by your new concurrent
> set_bit and stuff like that.

Thank you very much for that. Stupid me. I didn't read the code in
page->flags properly. Thanks again.

Rajesh


2004-04-15 15:41:14

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
>> into a spinlock. I'll try doing the same on top of prio-tree next.
>
> Good news, though not a surprise.
>
> Any ideas how we might handle latency from vmtruncate (and
> try_to_unmap) if using prio_tree with i_shared_lock spinlock?

I've been thinking about that. My rough plan is to go wild, naked and lockless.
If we arrange things in the correct order, new entries onto the list would
pick up the truncated image of the file (so they'd be OK). Entries removed
from the list don't matter anyway. We just need to make sure that everything
that was on the list when we start does get truncated.

Basically there are two sets of operations ... ones that map and unmap
the file object (address_space) and ones that alter it - we should be
able to proceed with inserts and deletes whilst truncating, though we
probably need to protect against the alterations. The two op types could
go under separate locking.

But I need to think on it some more - it would not surprise me to come to the
conclusion that I'm full of shit ;-) The opinions of others would be
very welcome ...

M.

2004-04-15 16:55:45

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, 15 Apr 2004, Martin J. Bligh wrote:
> >
> > Any ideas how we might handle latency from vmtruncate (and
> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>
> I've been thinking about that. My rough plan is to go wild, naked and lockless.
> If we arrange things in the correct order, new entries onto the list would

It's quite easy if there's a list - though I'm not that eager to go wild,
naked and lockless with you! But what if there's a prio_tree?

Hugh

2004-04-15 17:03:38

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

>> > Any ideas how we might handle latency from vmtruncate (and
>> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>>
>> I've been thinking about that. My rough plan is to go wild, naked and lockless.
>> If we arrange things in the correct order, new entries onto the list would
>
> It's quite easy if there's a list - though I'm not that eager to go wild,
> naked and lockless with you! But what if there's a prio_tree?

I still think my list-of-lists patch fixes the original problem, and is
simpler ... I'll try to get it updated, and sent out.

M.

2004-04-15 17:52:54

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, 15 Apr 2004, Martin J. Bligh wrote:
>
> I still think my list-of-lists patch fixes the original problem, and is
> simpler ... I'll try to get it updated, and sent out.

Please do, I never saw it before.

Though I have to admit I'm sceptical: prio_tree appears to be well
designed for the issue in question, list-of-lists sounds, well,
no offence, but a bit of a hack.

But we may well have overlooked the overhead of prio_tree's
complexity relative to list, and need to reconsider options.

Hugh

2004-04-15 18:49:07

by Dave McCracken

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


--On Thursday, April 15, 2004 18:50:42 +0100 Hugh Dickins
<[email protected]> wrote:

> Though I have to admit I'm sceptical: prio_tree appears to be well
> designed for the issue in question, list-of-lists sounds, well,
> no offence, but a bit of a hack.

It is a bit of a hack, but the theory behind it is fairly simple. It came
out of my early efforts to sort the list. Martin and I produced a theory
that many vmas have identical start and end addresses due to fork and/or
fixed address mappings. If this theory is true, list-of-lists will create a
much shorter top-level list of unique start-end pairs for searching. We'd
only need to walk the second-level list when we get a match to the search.
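
For reference, the data structure looked roughly like this - a sketch
reconstructed from the old list-of-lists patch, so the field names (start,
end, ranges, vmas, and vma->shared) are the patch's own while the comments
are only a summary:

#include <linux/list.h>

struct address_range {
	unsigned long start;		/* first file pgoff this range maps */
	unsigned long end;		/* last file pgoff this range maps */
	struct list_head ranges;	/* links the range on mapping->i_mmap,
					 * kept sorted by start */
	struct list_head vmas;		/* all vmas mapping exactly [start, end],
					 * linked through vma->shared */
};

An objrmap walk then scans only the top-level range list and descends into
a range's vma list only when the page's index falls inside [start, end].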

It never got any serious exposure or testing. It came out just as
everyone's attention shifted away from objrmap so no one really looked at
it.

Dave McCracken

Subject: Re: [PATCH] anobjrmap 9 priority mjb tree


> It has similar problems, IIRC with increasing i_shared_sem contention.

Agreed.

> But I think it solves the same issues as prio_tree,

Agreed.

> is simpler,

Agreed.

> is easier to fix up to do clever locking with.

I haven't thought about it fully, so I am not sure. But, it is likely
that locking is easier with list-of-lists.

[snip]

> diff -urpN -X /home/fletch/.diff.exclude 820-numa_large_pages/mm/mmap.c 830-list-of-lists/mm/mmap.c
> --- 820-numa_large_pages/mm/mmap.c Wed Jun 18 21:49:20 2003
> +++ 830-list-of-lists/mm/mmap.c Wed Jun 18 23:29:38 2003
> @@ -306,6 +306,56 @@ static void __vma_link_rb(struct mm_stru
> rb_insert_color(&vma->vm_rb, &mm->mm_rb);
> }
>
> +static void vma_add (struct vm_area_struct *vma,
> + struct list_head *range_list)
> +{
> + struct address_range *range;
> + struct list_head *prev, *next;
> + unsigned long start = vma->vm_pgoff;
> + unsigned long end = vma->vm_pgoff +
> + (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1);
> +
> + /* First, look for an existing range that matches ours */
> + prev = range_list;
> + list_for_each(next, range_list) {
> + range = list_entry(next, struct address_range, ranges);
> + if (range->start > start)
> + break; /* this list is sorted by start */
> + if ((range->start == start) && (range->end == end)) {
> + goto found;
> + }
> + prev = next;
> + }

Hmm.. We do a linear O(N) search for each vma added. If the range_list
has 1000 vmas, then it is really bad. Running Ingo's test-mmap3.c or
Andrew's rmap-test.c (check 3rd test Andrew did - single process,
10,000 different vmas - with different range->start and range->end)
will be slow.

The prio_tree patch optimizes these cases with O(log N) insert algorithm.

[snip]

> +static void vma_del (struct vm_area_struct *vma)
> +{
[snip]
> + next = vma->shared.next; /* stash the range list we're on */
> + list_del(&vma->shared); /* remove us from the list of vmas */
> + if (list_empty(next)) { /* we were the last vma for range */
> + range = list_entry(next, struct address_range, vmas);
> + list_del(&range->ranges);
> + kfree(range);
> + }
> +}

Agree that vma_del is much simpler.

> page_referenced_obj(struct page *page)
> {
[snip]
> + list_for_each_entry(range, &mapping->i_mmap, ranges) {
> + if (range->start > index)
> + break; /* Sorted by start address => we are done */
> + if (range->end < index)
> + continue;

Again O(N) search...

> + list_for_each_entry(vma, &range->vmas, shared)
> + referenced += page_referenced_obj_one(vma, page);
> + }


> @@ -512,7 +532,9 @@ static int
> try_to_unmap_obj(struct page *page)
> {
[snip]
> + list_for_each_entry(range, &mapping->i_mmap, ranges) {
> + if (range->start > index)
> + break; /* Sorted by start address => we are done */
> + if (range->end < index)
> + continue;

Here also an O(N) search when each vma maps a unique set of file pages...

Thanks for posting the code. Your old postings (almost a year ago)
regarding list-of-lists inspired me to develop prio_tree. Thanks.

Rajesh

2004-04-15 22:33:22

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, Apr 15, 2004 at 08:40:50AM -0700, Martin J. Bligh wrote:
> >> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> >> into a spinlock. I'll try doing the same on top of prio-tree next.
> >
> > Good news, though not a surprise.
> >
> > Any ideas how we might handle latency from vmtruncate (and
> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>
> I've been thinking about that. My rough plan is to go wild, naked and lockless.
> If we arrange things in the correct order, new entries onto the list would
> pick up the truncated image of the file (so they'd be OK). Entries removed
> from the list don't matter anyway. We just need to make sure that everything
> that was on the list when we start does get truncated.

entries removed must be freed with RCU, and that means vmas freed with
RCU, which means mm and pgd freed with RCU, and the whole VM will collapse
on you when you attempt that. I mean it goes all the way up to the
whole MM, not just the shared list of vmas.

2004-04-15 22:40:06

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 9 priority mjb tree

On Thu, Apr 15, 2004 at 10:14:51AM -0700, Martin J. Bligh wrote:
> I still think my list-of-lists patch fixes the original problem, and is
> simpler ... I'll try to get it updated, and sent out.

it's a lot worse than the prio-tree IMHO: when a new range is generated
you have to loop over all the vmas etc... it's O(N) stuff for certain ops,
while prio-tree is O(log(N)) for all of them.

If your objective is to be able to use RCU (and implementing an RCU
prio-tree is going to be extremely complicated) you can attempt a
prio-skip-list: that would be a skip-list (which still provides O(log(N))
but uses lists everywhere), so that you can more easily create an
RCU-prio-skip-list - though I haven't even thought about whether the
range-lookup can be implemented reasonably easily on top of a skip-list
to create the prio-skip-list.

but even if we could create the RCU-prio-skip-list (that would solve all
the complexity issues just like the prio-tree, and it would allow lockless
lookups too [unlike prio-tree]) you'd still have to deal with the mess of
freeing vmas with RCU, which would force everything above the
vma to be freed with RCU too - mm, pgds etc. That would require quite
some changes, at the very least to be able to garbage collect the mm and pgd
from the vma free operations. I doubt it's worth it: for the fast path you
cannot go lockless anyway, the lockless walk is only for the readonly
operations, and the readonly ones are the unlikely ones (namely only
truncate and paging). So it's overdesign.