2004-03-18 23:27:18

by Hugh Dickins

Subject: [PATCH] anobjrmap 1/6 objrmap

First of six patches implementing full object-based rmap over 2.6.5-rc1,
reviving my anonmm method to compare against Andrea's anon_vma method.
I've not yet implemented Linus' early-COW solution to the mremap move
issue; that will follow, as will handling of non-linear obj vmas.
Sorry, not yet checked against wli's tree; he may have some fixes to it.

anobjrmap 1/6 Dave McCracken's objrmap

Start with Dave McCracken's objrmap from Martin J. Bligh's tree, as did
Andrea. We've each diverged slightly: I've not bothered to include the
filemap.c locking comment, just to remove it again later; and I've not
included the page_table_lock avoidance from mmap.c - I don't see how it
can be safe to unlink a vma while try_to_unmap might be in find_vma
(but that may be fine in Andrea's, which ends up not using find_vma).
In rmap.c: I've not seen the problem which led Andrea to change try
failures from 1 to 0; I've fixed three comment typos, repositioned the
page_test_and_clear_dirty calls, and switched to ptep_clear_flush.
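
For reference, here is a minimal sketch (not part of the patch) of the
address arithmetic that object-based rmap depends on: find_pte() below
computes the user virtual address of a file page from page->index and the
vma's vm_start/vm_pgoff, exactly as shown here; vma_address() is just an
illustrative name. This only works for linear vmas, which is why the
fremap.c hunk below converts a page to anon when it is installed at an
offset that no longer matches page->index.

/*
 * Sketch only: where a file page sits in a linear vma.
 * find_pte() in the rmap.c hunk below does the same calculation inline.
 */
static unsigned long vma_address(struct vm_area_struct *vma, struct page *page)
{
        unsigned long pgoff;    /* offset of page within the file, in PAGE_SIZE units */
        unsigned long address;

        pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        if (address < vma->vm_start || address >= vma->vm_end)
                return -EFAULT; /* this vma does not map that file offset */
        return address;
}

Given that, page_referenced_obj() and try_to_unmap_obj() only need to walk
the mapping's i_mmap and i_mmap_shared vma lists to find every pte.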

fs/exec.c | 1
include/linux/mm.h | 1
include/linux/page-flags.h | 5
include/linux/swap.h | 2
mm/fremap.c | 21 ++
mm/memory.c | 8
mm/page_alloc.c | 2
mm/rmap.c | 390 +++++++++++++++++++++++++++++++++++++++++++--
mm/swapfile.c | 1
9 files changed, 417 insertions(+), 14 deletions(-)

--- 2.6.5-rc1/fs/exec.c 2004-03-11 01:56:08.000000000 +0000
+++ anobjrmap1/fs/exec.c 2004-03-18 21:26:40.786812568 +0000
@@ -324,6 +324,7 @@ void put_dirty_page(struct task_struct *
}
lru_cache_add_active(page);
flush_dcache_page(page);
+ SetPageAnon(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
pte_chain = page_add_rmap(page, pte, pte_chain);
pte_unmap(pte);
--- 2.6.5-rc1/include/linux/mm.h 2004-03-11 01:56:06.000000000 +0000
+++ anobjrmap1/include/linux/mm.h 2004-03-18 21:26:40.787812416 +0000
@@ -180,6 +180,7 @@ struct page {
struct pte_chain *chain;/* Reverse pte mapping pointer.
* protected by PG_chainlock */
pte_addr_t direct;
+ int mapcount;
} pte;
unsigned long private; /* mapping-private opaque data */

--- 2.6.5-rc1/include/linux/page-flags.h 2004-03-16 07:00:20.000000000 +0000
+++ anobjrmap1/include/linux/page-flags.h 2004-03-18 21:26:40.789812112 +0000
@@ -75,6 +75,7 @@
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
+#define PG_anon 20 /* Anonymous page */


/*
@@ -301,6 +302,10 @@ extern void get_full_page_state(struct p
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)

+#define PageAnon(page) test_bit(PG_anon, &(page)->flags)
+#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags)
+#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags)
+
/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day.
--- 2.6.5-rc1/include/linux/swap.h 2004-02-04 02:45:16.000000000 +0000
+++ anobjrmap1/include/linux/swap.h 2004-03-18 21:26:40.790811960 +0000
@@ -185,6 +185,8 @@ struct pte_chain *FASTCALL(page_add_rmap
void FASTCALL(page_remove_rmap(struct page *, pte_t *));
int FASTCALL(try_to_unmap(struct page *));

+int page_convert_anon(struct page *);
+
/* linux/mm/shmem.c */
extern int shmem_unuse(swp_entry_t entry, struct page *page);
#else
--- 2.6.5-rc1/mm/fremap.c 2004-03-11 01:56:06.000000000 +0000
+++ anobjrmap1/mm/fremap.c 2004-03-18 21:26:40.791811808 +0000
@@ -61,10 +61,26 @@ int install_page(struct mm_struct *mm, s
pmd_t *pmd;
pte_t pte_val;
struct pte_chain *pte_chain;
+ unsigned long pgidx;

pte_chain = pte_chain_alloc(GFP_KERNEL);
if (!pte_chain)
goto err;
+
+ /*
+ * Convert this page to anon for objrmap if it's nonlinear
+ */
+ pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgidx += vma->vm_pgoff;
+ pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
+ if (!PageAnon(page) && (page->index != pgidx)) {
+ lock_page(page);
+ err = page_convert_anon(page);
+ unlock_page(page);
+ if (err < 0)
+ goto err_free;
+ }
+
pgd = pgd_offset(mm, addr);
spin_lock(&mm->page_table_lock);

@@ -85,12 +101,11 @@ int install_page(struct mm_struct *mm, s
pte_val = *pte;
pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
- spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
- return 0;

+ err = 0;
err_unlock:
spin_unlock(&mm->page_table_lock);
+err_free:
pte_chain_free(pte_chain);
err:
return err;
--- 2.6.5-rc1/mm/memory.c 2004-03-16 07:00:20.000000000 +0000
+++ anobjrmap1/mm/memory.c 2004-03-18 21:26:40.794811352 +0000
@@ -1071,6 +1071,7 @@ static int do_wp_page(struct mm_struct *
++mm->rss;
page_remove_rmap(old_page, page_table);
break_cow(vma, new_page, address, page_table);
+ SetPageAnon(new_page);
pte_chain = page_add_rmap(new_page, page_table, pte_chain);
lru_cache_add_active(new_page);

@@ -1310,6 +1311,7 @@ static int do_swap_page(struct mm_struct

flush_icache_page(vma, page);
set_pte(page_table, pte);
+ SetPageAnon(page);
pte_chain = page_add_rmap(page, page_table, pte_chain);

/* No need to invalidate - it was non-present before */
@@ -1377,6 +1379,7 @@ do_anonymous_page(struct mm_struct *mm,
vma);
lru_cache_add_active(page);
mark_page_accessed(page);
+ SetPageAnon(page);
}

set_pte(page_table, entry);
@@ -1444,6 +1447,10 @@ retry:
if (!pte_chain)
goto oom;

+ /* See if nopage returned an anon page */
+ if (!new_page->mapping || PageSwapCache(new_page))
+ SetPageAnon(new_page);
+
/*
* Should we do an early C-O-W break?
*/
@@ -1454,6 +1461,7 @@ retry:
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
lru_cache_add_active(page);
+ SetPageAnon(page);
new_page = page;
}

--- 2.6.5-rc1/mm/page_alloc.c 2004-03-16 07:00:20.000000000 +0000
+++ anobjrmap1/mm/page_alloc.c 2004-03-18 21:26:40.796811048 +0000
@@ -224,6 +224,8 @@ static inline void free_pages_check(cons
bad_page(function, page);
if (PageDirty(page))
ClearPageDirty(page);
+ if (PageAnon(page))
+ ClearPageAnon(page);
}

/*
--- 2.6.5-rc1/mm/rmap.c 2004-03-11 01:56:12.000000000 +0000
+++ anobjrmap1/mm/rmap.c 2004-03-18 21:26:40.800810440 +0000
@@ -102,6 +102,136 @@ pte_chain_encode(struct pte_chain *pte_c
**/

/**
+ * find_pte - Find a pte pointer given a vma and a struct page.
+ * @vma: the vma to search
+ * @page: the page to find
+ *
+ * Determine if this page is mapped in this vma. If it is, map and return
+ * the pte pointer associated with it. Return null if the page is not
+ * mapped in this vma for any reason.
+ *
+ * This is strictly an internal helper function for the object-based rmap
+ * functions.
+ *
+ * It is the caller's responsibility to unmap the pte if it is returned.
+ */
+static inline pte_t *
+find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long loffset;
+ unsigned long address;
+
+ loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+ address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
+ if (address < vma->vm_start || address >= vma->vm_end)
+ goto out;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pmd = pmd_offset(pgd, address);
+ if (!pmd_present(*pmd))
+ goto out;
+
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte))
+ goto out_unmap;
+
+ if (page_to_pfn(page) != pte_pfn(*pte))
+ goto out_unmap;
+
+ if (addr)
+ *addr = address;
+
+ return pte;
+
+out_unmap:
+ pte_unmap(pte);
+out:
+ return NULL;
+}
+
+/**
+ * page_referenced_obj_one - referenced check for object-based rmap
+ * @vma: the vma to look in.
+ * @page: the page we're working on.
+ *
+ * Find a pte entry for a page/vma pair, then check and clear the referenced
+ * bit.
+ *
+ * This is strictly a helper function for page_referenced_obj.
+ */
+static int
+page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *pte;
+ int referenced = 0;
+
+ if (!spin_trylock(&mm->page_table_lock))
+ return 1;
+
+ pte = find_pte(vma, page, NULL);
+ if (pte) {
+ if (ptep_test_and_clear_young(pte))
+ referenced++;
+ pte_unmap(pte);
+ }
+
+ spin_unlock(&mm->page_table_lock);
+ return referenced;
+}
+
+/**
+ * page_referenced_obj - referenced check for object-based rmap
+ * @page: the page we're checking references on.
+ *
+ * For an object-based mapped page, find all the places it is mapped and
+ * check/clear the referenced flag. This is done by following the page->mapping
+ * pointer, then walking the chain of vmas it holds. It returns the number
+ * of references it found.
+ *
+ * This function is only called from page_referenced for object-based pages.
+ *
+ * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
+ * assume a reference count of 1.
+ */
+static int
+page_referenced_obj(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct vm_area_struct *vma;
+ int referenced = 0;
+
+ if (!page->pte.mapcount)
+ return 0;
+
+ if (!mapping)
+ BUG();
+
+ if (PageSwapCache(page))
+ BUG();
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return 1;
+
+ list_for_each_entry(vma, &mapping->i_mmap, shared)
+ referenced += page_referenced_obj_one(vma, page);
+
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared)
+ referenced += page_referenced_obj_one(vma, page);
+
+ up(&mapping->i_shared_sem);
+
+ return referenced;
+}
+
+/**
* page_referenced - test if the page was referenced
* @page: the page to test
*
@@ -123,6 +253,10 @@ int fastcall page_referenced(struct page
if (TestClearPageReferenced(page))
referenced++;

+ if (!PageAnon(page)) {
+ referenced += page_referenced_obj(page);
+ goto out;
+ }
if (PageDirect(page)) {
pte_t *pte = rmap_ptep_map(page->pte.direct);
if (ptep_test_and_clear_young(pte))
@@ -154,6 +288,7 @@ int fastcall page_referenced(struct page
__pte_chain_free(pc);
}
}
+out:
return referenced;
}

@@ -176,6 +311,21 @@ page_add_rmap(struct page *page, pte_t *

pte_chain_lock(page);

+ /*
+ * If this is an object-based page, just count it. We can
+ * find the mappings by walking the object vma chain for that object.
+ */
+ if (!PageAnon(page)) {
+ if (!page->mapping)
+ BUG();
+ if (PageSwapCache(page))
+ BUG();
+ if (!page->pte.mapcount)
+ inc_page_state(nr_mapped);
+ page->pte.mapcount++;
+ goto out;
+ }
+
if (page->pte.direct == 0) {
page->pte.direct = pte_paddr;
SetPageDirect(page);
@@ -232,8 +382,21 @@ void fastcall page_remove_rmap(struct pa
pte_chain_lock(page);

if (!page_mapped(page))
- goto out_unlock; /* remap_page_range() from a driver? */
+ goto out_unlock;

+ /*
+ * If this is an object-based page, just uncount it. We can
+ * find the mappings by walking the object vma chain for that object.
+ */
+ if (!PageAnon(page)) {
+ if (!page->mapping)
+ BUG();
+ if (PageSwapCache(page))
+ BUG();
+ page->pte.mapcount--;
+ goto out;
+ }
+
if (PageDirect(page)) {
if (page->pte.direct == pte_paddr) {
page->pte.direct = 0;
@@ -270,16 +433,112 @@ void fastcall page_remove_rmap(struct pa
}
}
out:
- if (page->pte.direct == 0 && page_test_and_clear_dirty(page))
- set_page_dirty(page);
- if (!page_mapped(page))
+ if (!page_mapped(page)) {
+ if (page_test_and_clear_dirty(page))
+ set_page_dirty(page);
dec_page_state(nr_mapped);
+ }
out_unlock:
pte_chain_unlock(page);
return;
}

/**
+ * try_to_unmap_obj_one - unmap a page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Determine whether a page is mapped in a given vma and unmap it if it's found.
+ *
+ * This function is strictly a helper function for try_to_unmap_obj.
+ */
+static inline int
+try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t *pte;
+ pte_t pteval;
+ int ret = SWAP_AGAIN;
+
+ if (!spin_trylock(&mm->page_table_lock))
+ return ret;
+
+ pte = find_pte(vma, page, &address);
+ if (!pte)
+ goto out;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+
+ flush_cache_page(vma, address);
+ pteval = ptep_clear_flush(vma, address, pte);
+
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ if (!page->pte.mapcount)
+ BUG();
+
+ mm->rss--;
+ page->pte.mapcount--;
+ page_cache_release(page);
+
+out_unmap:
+ pte_unmap(pte);
+
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+/**
+ * try_to_unmap_obj - unmap a page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * This function is only called from try_to_unmap for object-based pages.
+ *
+ * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
+ * return a temporary error.
+ */
+static int
+try_to_unmap_obj(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct vm_area_struct *vma;
+ int ret = SWAP_AGAIN;
+
+ if (!mapping)
+ BUG();
+
+ if (PageSwapCache(page))
+ BUG();
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return ret;
+
+ list_for_each_entry(vma, &mapping->i_mmap, shared) {
+ ret = try_to_unmap_obj_one(vma, page);
+ if (ret == SWAP_FAIL || !page->pte.mapcount)
+ goto out;
+ }
+
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ ret = try_to_unmap_obj_one(vma, page);
+ if (ret == SWAP_FAIL || !page->pte.mapcount)
+ goto out;
+ }
+
+out:
+ up(&mapping->i_shared_sem);
+ return ret;
+}
+
+/**
* try_to_unmap_one - worker function for try_to_unmap
* @page: page to unmap
* @ptep: page table entry to unmap from page
@@ -323,7 +582,7 @@ static int fastcall try_to_unmap_one(str
}

/* The page is mlock()d, we cannot swap it out. */
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
ret = SWAP_FAIL;
goto out_unlock;
}
@@ -397,11 +656,18 @@ int fastcall try_to_unmap(struct page *
if (!page->mapping)
BUG();

+ /*
+ * If it's an object-based page, use the object vma chain to find all
+ * the mappings.
+ */
+ if (!PageAnon(page)) {
+ ret = try_to_unmap_obj(page);
+ goto out;
+ }
+
if (PageDirect(page)) {
ret = try_to_unmap_one(page, page->pte.direct);
if (ret == SWAP_SUCCESS) {
- if (page_test_and_clear_dirty(page))
- set_page_dirty(page);
page->pte.direct = 0;
ClearPageDirect(page);
}
@@ -438,9 +704,6 @@ int fastcall try_to_unmap(struct page *
} else {
start->next_and_idx++;
}
- if (page->pte.direct == 0 &&
- page_test_and_clear_dirty(page))
- set_page_dirty(page);
break;
case SWAP_AGAIN:
/* Skip this pte, remembering status. */
@@ -453,12 +716,117 @@ int fastcall try_to_unmap(struct page *
}
}
out:
- if (!page_mapped(page))
+ if (!page_mapped(page)) {
+ if (page_test_and_clear_dirty(page))
+ set_page_dirty(page);
dec_page_state(nr_mapped);
+ ret = SWAP_SUCCESS;
+ }
return ret;
}

/**
+ * page_convert_anon - Convert an object-based mapped page to pte_chain-based.
+ * @page: the page to convert
+ *
+ * Find all the mappings for an object-based page and convert them
+ * to 'anonymous', ie create a pte_chain and store all the pte pointers there.
+ *
+ * This function takes the address_space->i_shared_sem, sets the PageAnon flag,
+ * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This
+ * means there is a period when PageAnon is set, but still has some mappings
+ * with no pte_chain entry. This is in fact safe, since page_remove_rmap will
+ * simply not find it. try_to_unmap might erroneously return success, but it
+ * will never be called because the page_convert_anon() caller has locked the
+ * page.
+ *
+ * page_referenced() may fail to scan all the appropriate pte's and may return
+ * an inaccurate result. This is so rare that it does not matter.
+ */
+int page_convert_anon(struct page *page)
+{
+ struct address_space *mapping;
+ struct vm_area_struct *vma;
+ struct pte_chain *pte_chain = NULL;
+ pte_t *pte;
+ int err = 0;
+
+ mapping = page->mapping;
+ if (mapping == NULL)
+ goto out; /* truncate won the lock_page() race */
+
+ down(&mapping->i_shared_sem);
+ pte_chain_lock(page);
+
+ /*
+ * Has someone else done it for us before we got the lock?
+ * If so, pte.direct or pte.chain has replaced pte.mapcount.
+ */
+ if (PageAnon(page)) {
+ pte_chain_unlock(page);
+ goto out_unlock;
+ }
+
+ SetPageAnon(page);
+ if (page->pte.mapcount == 0) {
+ pte_chain_unlock(page);
+ goto out_unlock;
+ }
+ /* This is gonna get incremented by page_add_rmap */
+ dec_page_state(nr_mapped);
+ page->pte.mapcount = 0;
+
+ /*
+ * Now that the page is marked as anon, unlock it. page_add_rmap will
+ * lock it as necessary.
+ */
+ pte_chain_unlock(page);
+
+ list_for_each_entry(vma, &mapping->i_mmap, shared) {
+ if (!pte_chain) {
+ pte_chain = pte_chain_alloc(GFP_KERNEL);
+ if (!pte_chain) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ }
+ spin_lock(&vma->vm_mm->page_table_lock);
+ pte = find_pte(vma, page, NULL);
+ if (pte) {
+ /* Make sure this isn't a duplicate */
+ page_remove_rmap(page, pte);
+ pte_chain = page_add_rmap(page, pte, pte_chain);
+ pte_unmap(pte);
+ }
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ }
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ if (!pte_chain) {
+ pte_chain = pte_chain_alloc(GFP_KERNEL);
+ if (!pte_chain) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ }
+ spin_lock(&vma->vm_mm->page_table_lock);
+ pte = find_pte(vma, page, NULL);
+ if (pte) {
+ /* Make sure this isn't a duplicate */
+ page_remove_rmap(page, pte);
+ pte_chain = page_add_rmap(page, pte, pte_chain);
+ pte_unmap(pte);
+ }
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ }
+
+out_unlock:
+ pte_chain_free(pte_chain);
+ up(&mapping->i_shared_sem);
+out:
+ return err;
+}
+
+/**
** No more VM stuff below this comment, only pte_chain helper
** functions.
**/
--- 2.6.5-rc1/mm/swapfile.c 2004-03-16 07:00:20.000000000 +0000
+++ anobjrmap1/mm/swapfile.c 2004-03-18 21:26:40.802810136 +0000
@@ -390,6 +390,7 @@ unuse_pte(struct vm_area_struct *vma, un
vma->vm_mm->rss++;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ SetPageAnon(page);
*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
swap_free(entry);
}


2004-03-18 23:33:55

by Hugh Dickins

Subject: [PATCH] anobjrmap 2/6 linux/rmap.h

anobjrmap 2/6 create include/linux/rmap.h

Start small: linux/rmap-locking.h has already gathered some
declarations unrelated to locking, and the rest of the rmap
declarations were over in linux/swap.h; gather them all
together in linux/rmap.h.

fs/exec.c | 2 -
include/linux/rmap-locking.h | 23 -------------------
include/linux/rmap.h | 51 +++++++++++++++++++++++++++++++++++++++++++
include/linux/swap.h | 18 ---------------
mm/fremap.c | 2 -
mm/memory.c | 2 -
mm/mremap.c | 2 -
mm/rmap.c | 3 --
mm/swapfile.c | 2 -
mm/vmscan.c | 2 -
10 files changed, 58 insertions(+), 49 deletions(-)

--- anobjrmap1/fs/exec.c 2004-03-18 21:26:40.786812568 +0000
+++ anobjrmap2/fs/exec.c 2004-03-18 21:26:52.270066848 +0000
@@ -45,7 +45,7 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>

#include <asm/uaccess.h>
#include <asm/pgalloc.h>
--- anobjrmap1/include/linux/rmap-locking.h 2003-06-22 19:33:42.000000000 +0100
+++ anobjrmap2/include/linux/rmap-locking.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,23 +0,0 @@
-/*
- * include/linux/rmap-locking.h
- *
- * Locking primitives for exclusive access to a page's reverse-mapping
- * pte chain.
- */
-
-#include <linux/slab.h>
-
-struct pte_chain;
-extern kmem_cache_t *pte_chain_cache;
-
-#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &page->flags)
-#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &page->flags)
-
-struct pte_chain *pte_chain_alloc(int gfp_flags);
-void __pte_chain_free(struct pte_chain *pte_chain);
-
-static inline void pte_chain_free(struct pte_chain *pte_chain)
-{
- if (pte_chain)
- __pte_chain_free(pte_chain);
-}
--- anobjrmap1/include/linux/rmap.h 1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap2/include/linux/rmap.h 2004-03-18 21:26:52.280065328 +0000
@@ -0,0 +1,51 @@
+#ifndef _LINUX_RMAP_H
+#define _LINUX_RMAP_H
+/*
+ * Declarations for Reverse Mapping functions in mm/rmap.c
+ * Its structures are declared within that file.
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+
+#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &(page)->flags)
+#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &(page)->flags)
+
+#ifdef CONFIG_MMU
+
+struct pte_chain;
+struct pte_chain *pte_chain_alloc(int gfp_flags);
+void __pte_chain_free(struct pte_chain *pte_chain);
+
+static inline void pte_chain_free(struct pte_chain *pte_chain)
+{
+ if (pte_chain)
+ __pte_chain_free(pte_chain);
+}
+
+struct pte_chain * fastcall
+ page_add_rmap(struct page *, pte_t *, struct pte_chain *);
+void fastcall page_remove_rmap(struct page *, pte_t *);
+int page_convert_anon(struct page *page);
+
+/*
+ * Called from mm/vmscan.c to handle paging out
+ */
+int fastcall page_referenced(struct page *);
+int fastcall try_to_unmap(struct page *);
+
+#else /* !CONFIG_MMU */
+
+#define page_referenced(page) TestClearPageReferenced(page)
+#define try_to_unmap(page) SWAP_FAIL
+
+#endif /* CONFIG_MMU */
+
+/*
+ * Return values of try_to_unmap
+ */
+#define SWAP_SUCCESS 0
+#define SWAP_AGAIN 1
+#define SWAP_FAIL 2
+
+#endif /* _LINUX_RMAP_H */
--- anobjrmap1/include/linux/swap.h 2004-03-18 21:26:40.790811960 +0000
+++ anobjrmap2/include/linux/swap.h 2004-03-18 21:26:52.281065176 +0000
@@ -76,7 +76,6 @@ struct reclaim_state {
#ifdef __KERNEL__

struct address_space;
-struct pte_chain;
struct sysinfo;
struct writeback_control;
struct zone;
@@ -177,28 +176,11 @@ extern int try_to_free_pages(struct zone
extern int shrink_all_memory(int);
extern int vm_swappiness;

-/* linux/mm/rmap.c */
#ifdef CONFIG_MMU
-int FASTCALL(page_referenced(struct page *));
-struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *,
- struct pte_chain *));
-void FASTCALL(page_remove_rmap(struct page *, pte_t *));
-int FASTCALL(try_to_unmap(struct page *));
-
-int page_convert_anon(struct page *);
-
/* linux/mm/shmem.c */
extern int shmem_unuse(swp_entry_t entry, struct page *page);
-#else
-#define page_referenced(page) TestClearPageReferenced(page)
-#define try_to_unmap(page) SWAP_FAIL
#endif /* CONFIG_MMU */

-/* return values of try_to_unmap */
-#define SWAP_SUCCESS 0
-#define SWAP_AGAIN 1
-#define SWAP_FAIL 2
-
#ifdef CONFIG_SWAP
/* linux/mm/page_io.c */
extern int swap_readpage(struct file *, struct page *);
--- anobjrmap1/mm/fremap.c 2004-03-18 21:26:40.791811808 +0000
+++ anobjrmap2/mm/fremap.c 2004-03-18 21:26:52.282065024 +0000
@@ -12,7 +12,7 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <linux/module.h>

#include <asm/mmu_context.h>
--- anobjrmap1/mm/memory.c 2004-03-18 21:26:40.794811352 +0000
+++ anobjrmap2/mm/memory.c 2004-03-18 21:26:52.285064568 +0000
@@ -43,7 +43,7 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/init.h>

--- anobjrmap1/mm/mremap.c 2004-02-18 03:00:07.000000000 +0000
+++ anobjrmap2/mm/mremap.c 2004-03-18 21:26:52.286064416 +0000
@@ -15,7 +15,7 @@
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/highmem.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <linux/security.h>

#include <asm/uaccess.h>
--- anobjrmap1/mm/rmap.c 2004-03-18 21:26:40.800810440 +0000
+++ anobjrmap2/mm/rmap.c 2004-03-18 21:26:52.290063808 +0000
@@ -26,7 +26,7 @@
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <linux/cache.h>
#include <linux/percpu.h>

@@ -551,7 +551,6 @@ out:
* pte_chain_lock shrink_list()
* mm->page_table_lock try_to_unmap_one(), trylock
*/
-static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
{
pte_t *ptep = rmap_ptep_map(paddr);
--- anobjrmap1/mm/swapfile.c 2004-03-18 21:26:40.802810136 +0000
+++ anobjrmap2/mm/swapfile.c 2004-03-18 21:26:52.292063504 +0000
@@ -21,7 +21,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <linux/security.h>

#include <asm/pgtable.h>
--- anobjrmap1/mm/vmscan.c 2004-03-16 07:00:20.000000000 +0000
+++ anobjrmap2/mm/vmscan.c 2004-03-18 21:26:52.294063200 +0000
@@ -28,7 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <linux/topology.h>

#include <asm/pgalloc.h>

2004-03-18 23:33:56

by Hugh Dickins

Subject: [PATCH] anobjrmap 3/6 page->mapping

anobjrmap 3/6 free page->mapping for use by anon

Tracking anonymous pages by mm,address needs a pointer,offset
pair in struct page: mapping,index are the natural choice. However,
swapcache already uses them for &swapper_space,swp_entry_t.

But it's trivial to separate swapcache from pagecache with the radix
tree; most of swapper_space is actually unused, just a fiction to
pretend swap is like a file; and page->private is a good place to keep
the swp_entry_t now that swap never uses bufferheads.

Define a page_mapping(page) macro to give NULL when PageAnon,
whatever that may put in page->mapping; define a PG_swapcache bit,
and deduce swapper_space from that. This does mean more conditionals
(many hidden in page_mapping), but I believe they'll be worth it.
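
In code terms (the real definitions are in the diffs below;
page_swp_entry() here is only an illustrative helper, the patch
open-codes the page->private access):

/* From the mm.h hunk below: anon pages have no inode mapping. */
#define page_mapping(page)      (PageAnon(page)? NULL: (page)->mapping)

/*
 * Illustrative helper only: a swapcache page now keeps its swp_entry_t
 * in page->private rather than page->index.
 */
static inline swp_entry_t page_swp_entry(struct page *page)
{
        swp_entry_t entry;

        BUG_ON(!PageSwapCache(page));
        entry.val = page->private;
        return entry;
}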

Some arches refer to page->mapping for their cache flushing;
these generally use page_mapping(page) instead now: it appears that
they're coping with shared pagecache issues, rather than anon swap.

Sorry to lose another PG_ bit? Don't worry, I'm sure we can
deduce PageSwapCache from PageAnon && private when tight; but
that will demand a little care with the Anon/Swap transitions,
which at present are pleasantly independent. Who owns page->list,
Anon or Swap? Dunno; at present neither, which is useful for testing.
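
Purely as a hypothetical illustration of that remark - nothing in this
series does it - the deduction could look roughly like:

/*
 * Hypothetical only: reclaim PG_swapcache by deriving the predicate,
 * provided the Anon/Swap transitions kept page->private zero whenever
 * an anon page is not in swapcache.
 */
#define PageSwapCache(page)     (PageAnon(page) && (page)->private != 0)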

Separating the caches slightly simplifies the tmpfs swizzling:
we can use functions with fewer underscores, since a page can
briefly be in both caches.

Removed the unloved page_convert_anon for non-linear vmas; the new
rules for PageAnon don't allow it to be abused for objects in that way.
Non-linear freeing is to be solved by a later patch, not in this group.
Similarly, I'm not calling those !page->mapping driver pages anon:
count them in and out, but don't attempt to unmap them (unless I'm
mistaken, they're usually pages a driver has allocated and holds a
reference to, so they can't be freed anyway).

arch/arm/mm/fault-armv.c | 4 -
arch/mips/mm/cache.c | 6 -
arch/parisc/kernel/cache.c | 4 -
arch/sparc64/kernel/smp.c | 8 +-
arch/sparc64/mm/init.c | 12 +--
fs/buffer.c | 20 +----
include/asm-arm/cacheflush.h | 4 -
include/asm-parisc/cacheflush.h | 2
include/asm-sh/pgalloc.h | 2
include/linux/mm.h | 28 ++-----
include/linux/page-flags.h | 14 +--
include/linux/pagemap.h | 11 --
include/linux/rmap.h | 1
mm/filemap.c | 20 ++---
mm/fremap.c | 16 ----
mm/memory.c | 16 ++--
mm/page-writeback.c | 20 +++++
mm/page_alloc.c | 14 +++
mm/page_io.c | 19 +----
mm/rmap.c | 133 ++++++-----------------------------
mm/swap_state.c | 152 ++++++++++++++++++----------------------
mm/swapfile.c | 20 +++--
mm/vmscan.c | 33 ++++----
23 files changed, 223 insertions(+), 336 deletions(-)

--- anobjrmap2/arch/arm/mm/fault-armv.c 2003-09-28 01:51:32.000000000 +0100
+++ anobjrmap3/arch/arm/mm/fault-armv.c 2004-03-18 21:27:03.794314896 +0000
@@ -191,7 +191,7 @@ void __flush_dcache_page(struct page *pa

__cpuc_flush_dcache_page(page_address(page));

- if (!page->mapping)
+ if (!page_mapping(page))
return;

/*
@@ -292,7 +292,7 @@ void update_mmu_cache(struct vm_area_str
if (!pfn_valid(pfn))
return;
page = pfn_to_page(pfn);
- if (page->mapping) {
+ if (page_mapping(page)) {
int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags);

if (dirty)
--- anobjrmap2/arch/mips/mm/cache.c 2004-03-11 01:56:08.000000000 +0000
+++ anobjrmap3/arch/mips/mm/cache.c 2004-03-18 21:27:03.795314744 +0000
@@ -57,7 +57,7 @@ void flush_dcache_page(struct page *page
{
unsigned long addr;

- if (page->mapping &&
+ if (page_mapping(page) &&
list_empty(&page->mapping->i_mmap) &&
list_empty(&page->mapping->i_mmap_shared)) {
SetPageDcacheDirty(page);
@@ -66,7 +66,7 @@ void flush_dcache_page(struct page *page
}

/*
- * We could delay the flush for the !page->mapping case too. But that
+ * We could delay the flush for the !page_mapping case too. But that
* case is for exec env/arg pages and those are %99 certainly going to
* get faulted into the tlb (and thus flushed) anyways.
*/
@@ -81,7 +81,7 @@ void __update_cache(struct vm_area_struc
unsigned long pfn, addr;

pfn = pte_pfn(pte);
- if (pfn_valid(pfn) && (page = pfn_to_page(pfn), page->mapping) &&
+ if (pfn_valid(pfn) && (page = pfn_to_page(pfn), page_mapping(page)) &&
Page_dcache_dirty(page)) {
if (pages_do_alias((unsigned long)page_address(page),
address & PAGE_MASK)) {
--- anobjrmap2/arch/parisc/kernel/cache.c 2004-01-09 06:00:23.000000000 +0000
+++ anobjrmap3/arch/parisc/kernel/cache.c 2004-03-18 21:27:03.796314592 +0000
@@ -68,7 +68,7 @@ update_mmu_cache(struct vm_area_struct *
{
struct page *page = pte_page(pte);

- if (VALID_PAGE(page) && page->mapping &&
+ if (VALID_PAGE(page) && page_mapping(page) &&
test_bit(PG_dcache_dirty, &page->flags)) {

flush_kernel_dcache_page(page_address(page));
@@ -234,7 +234,7 @@ void __flush_dcache_page(struct page *pa

flush_kernel_dcache_page(page_address(page));

- if (!page->mapping)
+ if (!page_mapping(page))
return;
/* check shared list first if it's not empty...it's usually
* the shortest */
--- anobjrmap2/arch/sparc64/kernel/smp.c 2004-03-16 07:00:18.257670408 +0000
+++ anobjrmap3/arch/sparc64/kernel/smp.c 2004-03-18 21:27:03.798314288 +0000
@@ -671,9 +671,9 @@ static __inline__ void __local_flush_dca
#if (L1DCACHE_SIZE > PAGE_SIZE)
__flush_dcache_page(page->virtual,
((tlb_type == spitfire) &&
- page->mapping != NULL));
+ page_mapping(page) != NULL));
#else
- if (page->mapping != NULL &&
+ if (page_mapping(page) != NULL &&
tlb_type == spitfire)
__flush_icache_page(__pa(page->virtual));
#endif
@@ -694,7 +694,7 @@ void smp_flush_dcache_page_impl(struct p
if (tlb_type == spitfire) {
data0 =
((u64)&xcall_flush_dcache_page_spitfire);
- if (page->mapping != NULL)
+ if (page_mapping(page) != NULL)
data0 |= ((u64)1 << 32);
spitfire_xcall_deliver(data0,
__pa(page->virtual),
@@ -727,7 +727,7 @@ void flush_dcache_page_all(struct mm_str
goto flush_self;
if (tlb_type == spitfire) {
data0 = ((u64)&xcall_flush_dcache_page_spitfire);
- if (page->mapping != NULL)
+ if (page_mapping(page) != NULL)
data0 |= ((u64)1 << 32);
spitfire_xcall_deliver(data0,
__pa(page->virtual),
--- anobjrmap2/arch/sparc64/mm/init.c 2004-03-11 01:56:08.000000000 +0000
+++ anobjrmap3/arch/sparc64/mm/init.c 2004-03-18 21:27:03.801313832 +0000
@@ -139,9 +139,9 @@ __inline__ void flush_dcache_page_impl(s
#if (L1DCACHE_SIZE > PAGE_SIZE)
__flush_dcache_page(page->virtual,
((tlb_type == spitfire) &&
- page->mapping != NULL));
+ page_mapping(page) != NULL));
#else
- if (page->mapping != NULL &&
+ if (page_mapping(page) != NULL &&
tlb_type == spitfire)
__flush_icache_page(__pa(page->virtual));
#endif
@@ -203,7 +203,7 @@ void update_mmu_cache(struct vm_area_str

pfn = pte_pfn(pte);
if (pfn_valid(pfn) &&
- (page = pfn_to_page(pfn), page->mapping) &&
+ (page = pfn_to_page(pfn), page_mapping(page)) &&
((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) {
int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL));

@@ -227,7 +227,7 @@ void flush_dcache_page(struct page *page
int dirty = test_bit(PG_dcache_dirty, &page->flags);
int dirty_cpu = dcache_dirty_cpu(page);

- if (page->mapping &&
+ if (page_mapping(page) &&
list_empty(&page->mapping->i_mmap) &&
list_empty(&page->mapping->i_mmap_shared)) {
if (dirty) {
@@ -237,7 +237,7 @@ void flush_dcache_page(struct page *page
}
set_dcache_dirty(page);
} else {
- /* We could delay the flush for the !page->mapping
+ /* We could delay the flush for the !page_mapping
* case too. But that case is for exec env/arg
* pages and those are %99 certainly going to get
* faulted into the tlb (and thus flushed) anyways.
@@ -279,7 +279,7 @@ static inline void flush_cache_pte_range
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
- if (PageReserved(page) || !page->mapping)
+ if (PageReserved(page) || !page_mapping(page))
continue;
pgaddr = (unsigned long) page_address(page);
uaddr = address + offset;
--- anobjrmap2/fs/buffer.c 2004-03-11 01:56:10.000000000 +0000
+++ anobjrmap3/fs/buffer.c 2004-03-18 21:27:03.805313224 +0000
@@ -837,19 +837,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
*
* FIXME: may need to call ->reservepage here as well. That's rather up to the
* address_space though.
- *
- * For now, we treat swapper_space specially. It doesn't use the normal
- * block a_ops.
*/
int __set_page_dirty_buffers(struct page *page)
{
struct address_space * const mapping = page->mapping;
- int ret = 0;
-
- if (mapping == NULL) {
- SetPageDirty(page);
- goto out;
- }

spin_lock(&mapping->private_lock);
if (page_has_buffers(page)) {
@@ -878,8 +869,7 @@ int __set_page_dirty_buffers(struct page
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}

-out:
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);

@@ -1576,7 +1566,7 @@ static inline void discard_buffer(struct
*/
int try_to_release_page(struct page *page, int gfp_mask)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = page_mapping(page);

if (!PageLocked(page))
BUG();
@@ -2881,7 +2871,7 @@ failed:

int try_to_free_buffers(struct page *page)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = page_mapping(page);
struct buffer_head *buffers_to_free = NULL;
int ret = 0;

@@ -2889,14 +2879,14 @@ int try_to_free_buffers(struct page *pag
if (PageWriteback(page))
return 0;

- if (mapping == NULL) { /* swapped-in anon page */
+ if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
goto out;
}

spin_lock(&mapping->private_lock);
ret = drop_buffers(page, &buffers_to_free);
- if (ret && !PageSwapCache(page)) {
+ if (ret) {
/*
* If the filesystem writes its buffers by hand (eg ext3)
* then we can have clean buffers against a dirty page. We
--- anobjrmap2/include/asm-arm/cacheflush.h 2004-03-11 01:56:12.000000000 +0000
+++ anobjrmap3/include/asm-arm/cacheflush.h 2004-03-18 21:27:03.807312920 +0000
@@ -283,7 +283,7 @@ flush_cache_page(struct vm_area_struct *
* flush_dcache_page is used when the kernel has written to the page
* cache page at virtual address page->virtual.
*
- * If this page isn't mapped (ie, page->mapping = NULL), or it has
+ * If this page isn't mapped (ie, page_mapping == NULL), or it has
* userspace mappings (page->mapping->i_mmap or page->mapping->i_mmap_shared)
* then we _must_ always clean + invalidate the dcache entries associated
* with the kernel mapping.
@@ -299,7 +299,7 @@ extern void __flush_dcache_page(struct p

static inline void flush_dcache_page(struct page *page)
{
- if (page->mapping && !mapping_mapped(page->mapping))
+ if (page_mapping(page) && !mapping_mapped(page->mapping))
set_bit(PG_dcache_dirty, &page->flags);
else
__flush_dcache_page(page);
--- anobjrmap2/include/asm-parisc/cacheflush.h 2003-10-08 20:24:57.000000000 +0100
+++ anobjrmap3/include/asm-parisc/cacheflush.h 2004-03-18 21:27:03.808312768 +0000
@@ -69,7 +69,7 @@ extern void __flush_dcache_page(struct p

static inline void flush_dcache_page(struct page *page)
{
- if (page->mapping && list_empty(&page->mapping->i_mmap) &&
+ if (page_mapping(page) && list_empty(&page->mapping->i_mmap) &&
list_empty(&page->mapping->i_mmap_shared)) {
set_bit(PG_dcache_dirty, &page->flags);
} else {
--- anobjrmap2/include/asm-sh/pgalloc.h 2004-02-04 02:45:26.000000000 +0000
+++ anobjrmap3/include/asm-sh/pgalloc.h 2004-03-18 21:27:03.808312768 +0000
@@ -101,7 +101,7 @@ static inline pte_t ptep_get_and_clear(p
unsigned long pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
- if (!page->mapping
+ if (!page_mapping(page)
|| list_empty(&page->mapping->i_mmap_shared))
__clear_bit(PG_mapped, &page->flags);
}
--- anobjrmap2/include/linux/mm.h 2004-03-18 21:26:40.787812416 +0000
+++ anobjrmap3/include/linux/mm.h 2004-03-18 21:27:03.810312464 +0000
@@ -396,6 +396,16 @@ void page_address_init(void);
#endif

/*
+ * On an anonymous page mapped into a user virtual memory area,
+ * page->mapping points to its anonmm, not to a struct address_space.
+ *
+ * Please note that, confusingly, "page_mapping" refers to the inode
+ * address_space which maps the page from disk; whereas "page_mapped"
+ * refers to user virtual address space into which the page is mapped.
+ */
+#define page_mapping(page) (PageAnon(page)? NULL: (page)->mapping)
+
+/*
* Return true if this page is mapped into pagetables. Subtle: test pte.direct
* rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain
* is only 32-bit.
@@ -464,6 +474,7 @@ int get_user_pages(struct task_struct *t

int __set_page_dirty_buffers(struct page *page);
int __set_page_dirty_nobuffers(struct page *page);
+int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);

/*
@@ -490,23 +501,6 @@ extern struct shrinker *set_shrinker(int
extern void remove_shrinker(struct shrinker *shrinker);

/*
- * If the mapping doesn't provide a set_page_dirty a_op, then
- * just fall through and assume that it wants buffer_heads.
- * FIXME: make the method unconditional.
- */
-static inline int set_page_dirty(struct page *page)
-{
- if (page->mapping) {
- int (*spd)(struct page *);
-
- spd = page->mapping->a_ops->set_page_dirty;
- if (spd)
- return (*spd)(page);
- }
- return __set_page_dirty_buffers(page);
-}
-
-/*
* On a two-level page table, this ends up being trivial. Thus the
* inlining and the symmetry break with pte_alloc_map() that does all
* of this out-of-line.
--- anobjrmap2/include/linux/page-flags.h 2004-03-18 21:26:40.789812112 +0000
+++ anobjrmap3/include/linux/page-flags.h 2004-03-18 21:27:03.812312160 +0000
@@ -75,8 +75,9 @@
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
-#define PG_anon 20 /* Anonymous page */

+#define PG_anon 20 /* Anonymous page: anonmm in mapping */
+#define PG_swapcache 21 /* Swap page: swp_entry_t in private */

/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -306,15 +307,12 @@ extern void get_full_page_state(struct p
#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags)
#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags)

-/*
- * The PageSwapCache predicate doesn't use a PG_flag at this time,
- * but it may again do so one day.
- */
#ifdef CONFIG_SWAP
-extern struct address_space swapper_space;
-#define PageSwapCache(page) ((page)->mapping == &swapper_space)
+#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags)
+#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags)
+#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags)
#else
-#define PageSwapCache(page) 0
+#define PageSwapCache(page) 0
#endif

struct page; /* forward declaration */
--- anobjrmap2/include/linux/pagemap.h 2004-01-09 06:00:23.000000000 +0000
+++ anobjrmap3/include/linux/pagemap.h 2004-03-18 21:27:03.812312160 +0000
@@ -138,17 +138,6 @@ static inline unsigned long get_page_cac
return atomic_read(&nr_pagecache);
}

-static inline void ___add_to_page_cache(struct page *page,
- struct address_space *mapping, unsigned long index)
-{
- list_add(&page->list, &mapping->clean_pages);
- page->mapping = mapping;
- page->index = index;
-
- mapping->nrpages++;
- pagecache_acct(1);
-}
-
extern void FASTCALL(__lock_page(struct page *page));
extern void FASTCALL(unlock_page(struct page *page));

--- anobjrmap2/include/linux/rmap.h 2004-03-18 21:26:52.280065328 +0000
+++ anobjrmap3/include/linux/rmap.h 2004-03-18 21:27:03.813312008 +0000
@@ -26,7 +26,6 @@ static inline void pte_chain_free(struct
struct pte_chain * fastcall
page_add_rmap(struct page *, pte_t *, struct pte_chain *);
void fastcall page_remove_rmap(struct page *, pte_t *);
-int page_convert_anon(struct page *page);

/*
* Called from mm/vmscan.c to handle paging out
--- anobjrmap2/mm/filemap.c 2004-03-11 01:56:08.000000000 +0000
+++ anobjrmap3/mm/filemap.c 2004-03-18 21:27:03.816311552 +0000
@@ -118,10 +118,12 @@ void remove_from_page_cache(struct page

static inline int sync_page(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);

if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
return mapping->a_ops->sync_page(page);
+ if (PageSwapCache(page))
+ blk_run_queues();
return 0;
}

@@ -235,13 +237,9 @@ EXPORT_SYMBOL(filemap_fdatawait);
* This function is used for two things: adding newly allocated pagecache
* pages and for moving existing anon pages into swapcache.
*
- * In the case of pagecache pages, the page is new, so we can just run
- * SetPageLocked() against it. The other page state flags were set by
- * rmqueue()
- *
- * In the case of swapcache, try_to_swap_out() has already locked the page, so
- * SetPageLocked() is ugly-but-OK there too. The required page state has been
- * set up by swap_out_add_to_swap_cache().
+ * This function is used to add newly allocated pagecache pages:
+ * the page is new, so we can just run SetPageLocked() against it.
+ * The other page state flags were set by rmqueue().
*
* This function does not add the page to the LRU. The caller must do that.
*/
@@ -256,7 +254,11 @@ int add_to_page_cache(struct page *page,
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
SetPageLocked(page);
- ___add_to_page_cache(page, mapping, offset);
+ list_add(&page->list, &mapping->clean_pages);
+ page->mapping = mapping;
+ page->index = offset;
+ mapping->nrpages++;
+ pagecache_acct(1);
} else {
page_cache_release(page);
}
--- anobjrmap2/mm/fremap.c 2004-03-18 21:26:52.282065024 +0000
+++ anobjrmap3/mm/fremap.c 2004-03-18 21:27:03.817311400 +0000
@@ -61,26 +61,11 @@ int install_page(struct mm_struct *mm, s
pmd_t *pmd;
pte_t pte_val;
struct pte_chain *pte_chain;
- unsigned long pgidx;

pte_chain = pte_chain_alloc(GFP_KERNEL);
if (!pte_chain)
goto err;

- /*
- * Convert this page to anon for objrmap if it's nonlinear
- */
- pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
- pgidx += vma->vm_pgoff;
- pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
- if (!PageAnon(page) && (page->index != pgidx)) {
- lock_page(page);
- err = page_convert_anon(page);
- unlock_page(page);
- if (err < 0)
- goto err_free;
- }
-
pgd = pgd_offset(mm, addr);
spin_lock(&mm->page_table_lock);

@@ -105,7 +90,6 @@ int install_page(struct mm_struct *mm, s
err = 0;
err_unlock:
spin_unlock(&mm->page_table_lock);
-err_free:
pte_chain_free(pte_chain);
err:
return err;
--- anobjrmap2/mm/memory.c 2004-03-18 21:26:52.285064568 +0000
+++ anobjrmap3/mm/memory.c 2004-03-18 21:27:03.820310944 +0000
@@ -417,8 +417,8 @@ zap_pte_range(struct mmu_gather *tlb, pm
if (!PageReserved(page)) {
if (pte_dirty(pte))
set_page_dirty(page);
- if (page->mapping && pte_young(pte) &&
- !PageSwapCache(page))
+ if (pte_young(pte) &&
+ page_mapping(page))
mark_page_accessed(page);
tlb->freed++;
page_remove_rmap(page, ptep);
@@ -1422,6 +1422,7 @@ do_no_page(struct mm_struct *mm, struct
struct pte_chain *pte_chain;
int sequence = 0;
int ret = VM_FAULT_MINOR;
+ int anon = 0;

if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table,
@@ -1447,10 +1448,6 @@ retry:
if (!pte_chain)
goto oom;

- /* See if nopage returned an anon page */
- if (!new_page->mapping || PageSwapCache(new_page))
- SetPageAnon(new_page);
-
/*
* Should we do an early C-O-W break?
*/
@@ -1460,9 +1457,8 @@ retry:
goto oom;
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
- lru_cache_add_active(page);
- SetPageAnon(page);
new_page = page;
+ anon = 1;
}

spin_lock(&mm->page_table_lock);
@@ -1500,6 +1496,10 @@ retry:
if (write_access)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte(page_table, entry);
+ if (anon) {
+ SetPageAnon(new_page);
+ lru_cache_add_active(new_page);
+ }
pte_chain = page_add_rmap(new_page, page_table, pte_chain);
pte_unmap(page_table);
} else {
--- anobjrmap2/mm/page-writeback.c 2004-02-04 02:45:34.000000000 +0000
+++ anobjrmap3/mm/page-writeback.c 2004-03-18 21:27:03.821310792 +0000
@@ -532,6 +532,24 @@ int __set_page_dirty_nobuffers(struct pa
EXPORT_SYMBOL(__set_page_dirty_nobuffers);

/*
+ * If the mapping doesn't provide a set_page_dirty a_op, then
+ * just fall through and assume that it wants buffer_heads.
+ */
+int set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ int (*spd)(struct page *);
+
+ if (!mapping) {
+ SetPageDirty(page);
+ return 0;
+ }
+ spd = mapping->a_ops->set_page_dirty;
+ return spd? (*spd)(page): __set_page_dirty_buffers(page);
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+/*
* set_page_dirty() is racy if the caller has no reference against
* page->mapping->host, and if the page is unlocked. This is because another
* CPU could truncate the page off the mapping and then free the mapping.
@@ -559,7 +577,7 @@ EXPORT_SYMBOL(set_page_dirty_lock);
int test_clear_page_dirty(struct page *page)
{
if (TestClearPageDirty(page)) {
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);

if (mapping && !mapping->backing_dev_info->memory_backed)
dec_page_state(nr_dirty);
--- anobjrmap2/mm/page_alloc.c 2004-03-18 21:26:40.796811048 +0000
+++ anobjrmap3/mm/page_alloc.c 2004-03-18 21:27:03.824310336 +0000
@@ -83,6 +83,10 @@ static void bad_page(const char *functio
1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
+ 1 << PG_chainlock |
+ 1 << PG_direct |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
1 << PG_writeback);
set_page_count(page, 0);
page->mapping = NULL;
@@ -220,12 +224,14 @@ static inline void free_pages_check(cons
1 << PG_active |
1 << PG_reclaim |
1 << PG_slab |
+ 1 << PG_chainlock |
+ 1 << PG_direct |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
1 << PG_writeback )))
bad_page(function, page);
if (PageDirty(page))
ClearPageDirty(page);
- if (PageAnon(page))
- ClearPageAnon(page);
}

/*
@@ -329,6 +335,10 @@ static void prep_new_page(struct page *p
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
+ 1 << PG_chainlock |
+ 1 << PG_direct |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);

--- anobjrmap2/mm/page_io.c 2002-12-16 01:08:28.000000000 +0000
+++ anobjrmap3/mm/page_io.c 2004-03-18 21:27:03.825310184 +0000
@@ -16,8 +16,6 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
-#include <linux/buffer_head.h> /* for block_sync_page() */
-#include <linux/mpage.h>
#include <linux/writeback.h>
#include <asm/pgtable.h>

@@ -32,7 +30,7 @@ get_swap_bio(int gfp_flags, struct page
swp_entry_t entry;

BUG_ON(!PageSwapCache(page));
- entry.val = page->index;
+ entry.val = page->private;
sis = get_swap_info_struct(swp_type(entry));

bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
@@ -130,13 +128,6 @@ out:
return ret;
}

-struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
- .readpage = swap_readpage,
- .sync_page = block_sync_page,
- .set_page_dirty = __set_page_dirty_nobuffers,
-};
-
/*
* A scruffy utility function to read or write an arbitrary swap page
* and wait on the I/O.
@@ -149,10 +140,8 @@ int rw_swap_page_sync(int rw, swp_entry_
};

lock_page(page);
-
- BUG_ON(page->mapping);
- page->mapping = &swapper_space;
- page->index = entry.val;
+ SetPageSwapCache(page);
+ page->private = entry.val;

if (rw == READ) {
ret = swap_readpage(NULL, page);
@@ -161,7 +150,7 @@ int rw_swap_page_sync(int rw, swp_entry_
ret = swap_writepage(page, &swap_wbc);
wait_on_page_writeback(page);
}
- page->mapping = NULL;
+ ClearPageSwapCache(page);
if (ret == 0 && (!PageUptodate(page) || PageError(page)))
ret = -EIO;
return ret;
--- anobjrmap2/mm/rmap.c 2004-03-18 21:26:52.290063808 +0000
+++ anobjrmap3/mm/rmap.c 2004-03-18 21:27:03.828309728 +0000
@@ -35,7 +35,18 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>

-/* #define DEBUG_RMAP */
+/*
+ * Something oopsable to put for now in the page->mapping
+ * of an anonymous page, to test that it is ignored.
+ */
+#define ANON_MAPPING_DEBUG ((struct address_space *) 1)
+
+static inline void clear_page_anon(struct page *page)
+{
+ BUG_ON(page->mapping != ANON_MAPPING_DEBUG);
+ page->mapping = NULL;
+ ClearPageAnon(page);
+}

/*
* Shared pages have a chain of pte_chain structures, used to locate
@@ -212,7 +223,7 @@ page_referenced_obj(struct page *page)
return 0;

if (!mapping)
- BUG();
+ return 0;

if (PageSwapCache(page))
BUG();
@@ -316,8 +327,6 @@ page_add_rmap(struct page *page, pte_t *
* find the mappings by walking the object vma chain for that object.
*/
if (!PageAnon(page)) {
- if (!page->mapping)
- BUG();
if (PageSwapCache(page))
BUG();
if (!page->pte.mapcount)
@@ -326,6 +335,8 @@ page_add_rmap(struct page *page, pte_t *
goto out;
}

+ page->mapping = ANON_MAPPING_DEBUG;
+
if (page->pte.direct == 0) {
page->pte.direct = pte_paddr;
SetPageDirect(page);
@@ -389,8 +400,6 @@ void fastcall page_remove_rmap(struct pa
* find the mappings by walking the object vma chain for that object.
*/
if (!PageAnon(page)) {
- if (!page->mapping)
- BUG();
if (PageSwapCache(page))
BUG();
page->pte.mapcount--;
@@ -436,6 +445,8 @@ out:
if (!page_mapped(page)) {
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
+ if (PageAnon(page))
+ clear_page_anon(page);
dec_page_state(nr_mapped);
}
out_unlock:
@@ -590,12 +601,13 @@ static int fastcall try_to_unmap_one(str
flush_cache_page(vma, address);
pte = ptep_clear_flush(vma, address, ptep);

- if (PageSwapCache(page)) {
+ if (PageAnon(page)) {
+ swp_entry_t entry = { .val = page->private };
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- swp_entry_t entry = { .val = page->index };
+ BUG_ON(!PageSwapCache(page));
swap_duplicate(entry);
set_pte(ptep, swp_entry_to_pte(entry));
BUG_ON(pte_file(*ptep));
@@ -652,7 +664,7 @@ int fastcall try_to_unmap(struct page *
if (!PageLocked(page))
BUG();
/* We need backing store to swap out a page. */
- if (!page->mapping)
+ if (!page_mapping(page) && !PageSwapCache(page))
BUG();

/*
@@ -718,6 +730,8 @@ out:
if (!page_mapped(page)) {
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
+ if (PageAnon(page))
+ clear_page_anon(page);
dec_page_state(nr_mapped);
ret = SWAP_SUCCESS;
}
@@ -725,107 +739,6 @@ out:
}

/**
- * page_convert_anon - Convert an object-based mapped page to pte_chain-based.
- * @page: the page to convert
- *
- * Find all the mappings for an object-based page and convert them
- * to 'anonymous', ie create a pte_chain and store all the pte pointers there.
- *
- * This function takes the address_space->i_shared_sem, sets the PageAnon flag,
- * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This
- * means there is a period when PageAnon is set, but still has some mappings
- * with no pte_chain entry. This is in fact safe, since page_remove_rmap will
- * simply not find it. try_to_unmap might erroneously return success, but it
- * will never be called because the page_convert_anon() caller has locked the
- * page.
- *
- * page_referenced() may fail to scan all the appropriate pte's and may return
- * an inaccurate result. This is so rare that it does not matter.
- */
-int page_convert_anon(struct page *page)
-{
- struct address_space *mapping;
- struct vm_area_struct *vma;
- struct pte_chain *pte_chain = NULL;
- pte_t *pte;
- int err = 0;
-
- mapping = page->mapping;
- if (mapping == NULL)
- goto out; /* truncate won the lock_page() race */
-
- down(&mapping->i_shared_sem);
- pte_chain_lock(page);
-
- /*
- * Has someone else done it for us before we got the lock?
- * If so, pte.direct or pte.chain has replaced pte.mapcount.
- */
- if (PageAnon(page)) {
- pte_chain_unlock(page);
- goto out_unlock;
- }
-
- SetPageAnon(page);
- if (page->pte.mapcount == 0) {
- pte_chain_unlock(page);
- goto out_unlock;
- }
- /* This is gonna get incremented by page_add_rmap */
- dec_page_state(nr_mapped);
- page->pte.mapcount = 0;
-
- /*
- * Now that the page is marked as anon, unlock it. page_add_rmap will
- * lock it as necessary.
- */
- pte_chain_unlock(page);
-
- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- if (!pte_chain) {
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain) {
- err = -ENOMEM;
- goto out_unlock;
- }
- }
- spin_lock(&vma->vm_mm->page_table_lock);
- pte = find_pte(vma, page, NULL);
- if (pte) {
- /* Make sure this isn't a duplicate */
- page_remove_rmap(page, pte);
- pte_chain = page_add_rmap(page, pte, pte_chain);
- pte_unmap(pte);
- }
- spin_unlock(&vma->vm_mm->page_table_lock);
- }
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (!pte_chain) {
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain) {
- err = -ENOMEM;
- goto out_unlock;
- }
- }
- spin_lock(&vma->vm_mm->page_table_lock);
- pte = find_pte(vma, page, NULL);
- if (pte) {
- /* Make sure this isn't a duplicate */
- page_remove_rmap(page, pte);
- pte_chain = page_add_rmap(page, pte, pte_chain);
- pte_unmap(pte);
- }
- spin_unlock(&vma->vm_mm->page_table_lock);
- }
-
-out_unlock:
- pte_chain_free(pte_chain);
- up(&mapping->i_shared_sem);
-out:
- return err;
-}
-
-/**
** No more VM stuff below this comment, only pte_chain helper
** functions.
**/
--- anobjrmap2/mm/swap_state.c 2003-08-09 05:44:10.000000000 +0100
+++ anobjrmap3/mm/swap_state.c 2004-03-18 21:27:03.830309424 +0000
@@ -21,23 +21,20 @@ static struct backing_dev_info swap_back
.memory_backed = 1, /* Does not contribute to dirty memory */
};

-extern struct address_space_operations swap_aops;
+static struct address_space_operations swap_aops = {
+ .writepage = swap_writepage,
+ .readpage = swap_readpage,
+ /*
+ * sync_page and set_page_dirty are special-cased.
+ */
+};

struct address_space swapper_space = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC),
.page_lock = SPIN_LOCK_UNLOCKED,
- .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages),
- .dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages),
- .io_pages = LIST_HEAD_INIT(swapper_space.io_pages),
- .locked_pages = LIST_HEAD_INIT(swapper_space.locked_pages),
+ .nrpages = 0,
.a_ops = &swap_aops,
.backing_dev_info = &swap_backing_dev_info,
- .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap),
- .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared),
- .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
- .truncate_count = ATOMIC_INIT(0),
- .private_lock = SPIN_LOCK_UNLOCKED,
- .private_list = LIST_HEAD_INIT(swapper_space.private_list),
};

#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -59,30 +56,55 @@ void show_swap_cache_info(void)
swap_cache_info.noent_race, swap_cache_info.exist_race);
}

+/*
+ * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * but sets SwapCache flag and private instead of mapping and index.
+ */
+static int __add_to_swap_cache(struct page *page,
+ swp_entry_t entry, int gfp_mask)
+{
+ int error;
+
+ BUG_ON(PageSwapCache(page));
+ BUG_ON(PagePrivate(page));
+ error = radix_tree_preload(gfp_mask);
+ if (!error) {
+ page_cache_get(page);
+ spin_lock(&swapper_space.page_lock);
+ error = radix_tree_insert(&swapper_space.page_tree,
+ entry.val, page);
+ if (!error) {
+ SetPageLocked(page);
+ SetPageSwapCache(page);
+ page->private = entry.val;
+ total_swapcache_pages++;
+ pagecache_acct(1);
+ } else
+ page_cache_release(page);
+ spin_unlock(&swapper_space.page_lock);
+ radix_tree_preload_end();
+ }
+ return error;
+}
+
static int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;

- if (page->mapping)
- BUG();
if (!swap_duplicate(entry)) {
INC_CACHE_INFO(noent_race);
return -ENOENT;
}
- error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL);
+ error = __add_to_swap_cache(page, entry, GFP_KERNEL);
/*
* Anon pages are already on the LRU, we don't run lru_cache_add here.
*/
- if (error != 0) {
+ if (error) {
swap_free(entry);
if (error == -EEXIST)
INC_CACHE_INFO(exist_race);
return error;
}
- if (!PageLocked(page))
- BUG();
- if (!PageSwapCache(page))
- BUG();
INC_CACHE_INFO(add_total);
return 0;
}
@@ -96,7 +118,11 @@ void __delete_from_swap_cache(struct pag
BUG_ON(!PageLocked(page));
BUG_ON(!PageSwapCache(page));
BUG_ON(PageWriteback(page));
- __remove_from_page_cache(page);
+
+ radix_tree_delete(&swapper_space.page_tree, page->private);
+ ClearPageSwapCache(page);
+ total_swapcache_pages--;
+ pagecache_acct(-1);
INC_CACHE_INFO(del_total);
}

@@ -140,8 +166,7 @@ int add_to_swap(struct page * page)
/*
* Add it to the swap cache and mark it dirty
*/
- err = add_to_page_cache(page, &swapper_space,
- entry.val, GFP_ATOMIC);
+ err = __add_to_swap_cache(page, entry, GFP_ATOMIC);

if (pf_flags & PF_MEMALLOC)
current->flags |= PF_MEMALLOC;
@@ -149,8 +174,7 @@ int add_to_swap(struct page * page)
switch (err) {
case 0: /* Success */
SetPageUptodate(page);
- ClearPageDirty(page);
- set_page_dirty(page);
+ SetPageDirty(page);
INC_CACHE_INFO(add_total);
return 1;
case -EEXIST:
@@ -176,11 +200,12 @@ void delete_from_swap_cache(struct page
{
swp_entry_t entry;

+ BUG_ON(!PageSwapCache(page));
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
BUG_ON(PagePrivate(page));

- entry.val = page->index;
+ entry.val = page->private;

spin_lock(&swapper_space.page_lock);
__delete_from_swap_cache(page);
@@ -192,27 +217,13 @@ void delete_from_swap_cache(struct page

int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
- struct address_space *mapping = page->mapping;
- int err;
-
- spin_lock(&swapper_space.page_lock);
- spin_lock(&mapping->page_lock);
-
- err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
- if (!err) {
- __remove_from_page_cache(page);
- ___add_to_page_cache(page, &swapper_space, entry.val);
- }
-
- spin_unlock(&mapping->page_lock);
- spin_unlock(&swapper_space.page_lock);
-
+ int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
if (!err) {
+ remove_from_page_cache(page);
+ page_cache_release(page); /* pagecache ref */
if (!swap_duplicate(entry))
BUG();
- /* shift page from clean_pages to dirty_pages list */
- BUG_ON(PageDirty(page));
- set_page_dirty(page);
+ SetPageDirty(page);
INC_CACHE_INFO(add_total);
} else if (err == -EEXIST)
INC_CACHE_INFO(exist_race);
@@ -222,29 +233,9 @@ int move_to_swap_cache(struct page *page
int move_from_swap_cache(struct page *page, unsigned long index,
struct address_space *mapping)
{
- swp_entry_t entry;
- int err;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
- BUG_ON(PagePrivate(page));
-
- entry.val = page->index;
-
- spin_lock(&swapper_space.page_lock);
- spin_lock(&mapping->page_lock);
-
- err = radix_tree_insert(&mapping->page_tree, index, page);
+ int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
if (!err) {
- __delete_from_swap_cache(page);
- ___add_to_page_cache(page, mapping, index);
- }
-
- spin_unlock(&mapping->page_lock);
- spin_unlock(&swapper_space.page_lock);
-
- if (!err) {
- swap_free(entry);
+ delete_from_swap_cache(page);
/* shift page from clean_pages to dirty_pages list */
ClearPageDirty(page);
set_page_dirty(page);
@@ -252,7 +243,6 @@ int move_from_swap_cache(struct page *pa
return err;
}

-
/*
* If we are the only user, then try to free up the swap cache.
*
@@ -310,19 +300,17 @@ void free_pages_and_swap_cache(struct pa
*/
struct page * lookup_swap_cache(swp_entry_t entry)
{
- struct page *found;
+ struct page *page;

- found = find_get_page(&swapper_space, entry.val);
- /*
- * Unsafe to assert PageSwapCache and mapping on page found:
- * if SMP nothing prevents swapoff from deleting this page from
- * the swap cache at this moment. find_lock_page would prevent
- * that, but no need to change: we _have_ got the right page.
- */
- INC_CACHE_INFO(find_total);
- if (found)
+ spin_lock(&swapper_space.page_lock);
+ page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+ if (page) {
+ page_cache_get(page);
INC_CACHE_INFO(find_success);
- return found;
+ }
+ spin_unlock(&swapper_space.page_lock);
+ INC_CACHE_INFO(find_total);
+ return page;
}

/*
@@ -340,10 +328,14 @@ struct page * read_swap_cache_async(swp_
/*
* First check the swap cache. Since this is normally
* called after lookup_swap_cache() failed, re-calling
- * that would confuse statistics: use find_get_page()
- * directly.
+ * that would confuse statistics.
*/
- found_page = find_get_page(&swapper_space, entry.val);
+ spin_lock(&swapper_space.page_lock);
+ found_page = radix_tree_lookup(&swapper_space.page_tree,
+ entry.val);
+ if (found_page)
+ page_cache_get(found_page);
+ spin_unlock(&swapper_space.page_lock);
if (found_page)
break;

--- anobjrmap2/mm/swapfile.c 2004-03-18 21:26:52.292063504 +0000
+++ anobjrmap3/mm/swapfile.c 2004-03-18 21:27:03.832309120 +0000
@@ -247,7 +247,7 @@ static int exclusive_swap_page(struct pa
struct swap_info_struct * p;
swp_entry_t entry;

- entry.val = page->index;
+ entry.val = page->private;
p = swap_info_get(entry);
if (p) {
/* Is the only swap cache user the cache itself? */
@@ -315,7 +315,7 @@ int remove_exclusive_swap_page(struct pa
if (page_count(page) != 2) /* 2: us + cache */
return 0;

- entry.val = page->index;
+ entry.val = page->private;
p = swap_info_get(entry);
if (!p)
return 0;
@@ -353,8 +353,14 @@ void free_swap_and_cache(swp_entry_t ent

p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, swp_offset(entry)) == 1)
- page = find_trylock_page(&swapper_space, entry.val);
+ if (swap_entry_free(p, swp_offset(entry)) == 1) {
+ spin_lock(&swapper_space.page_lock);
+ page = radix_tree_lookup(&swapper_space.page_tree,
+ entry.val);
+ if (page && TestSetPageLocked(page))
+ page = NULL;
+ spin_unlock(&swapper_space.page_lock);
+ }
swap_info_put(p);
}
if (page) {
@@ -997,14 +1003,14 @@ int page_queue_congested(struct page *pa

BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */

- bdi = page->mapping->backing_dev_info;
if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page->index };
+ swp_entry_t entry = { .val = page->private };
struct swap_info_struct *sis;

sis = get_swap_info_struct(swp_type(entry));
bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
- }
+ } else
+ bdi = page->mapping->backing_dev_info;
return bdi_write_congested(bdi);
}
#endif
--- anobjrmap2/mm/vmscan.c 2004-03-18 21:26:52.294063200 +0000
+++ anobjrmap3/mm/vmscan.c 2004-03-18 21:27:03.835308664 +0000
@@ -174,20 +174,20 @@ static int shrink_slab(unsigned long sca
/* Must be called with page's pte_chain_lock held. */
static inline int page_mapping_inuse(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping;

/* Page is in somebody's page tables. */
if (page_mapped(page))
return 1;

- /* XXX: does this happen ? */
- if (!mapping)
- return 0;
-
/* Be more reluctant to reclaim swapcache than pagecache */
if (PageSwapCache(page))
return 1;

+ mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+
/* File is mmap'd by somebody. */
if (!list_empty(&mapping->i_mmap))
return 1;
@@ -231,7 +231,7 @@ static void handle_write_error(struct ad
struct page *page, int error)
{
lock_page(page);
- if (page->mapping == mapping) {
+ if (page_mapping(page) == mapping) {
if (error == -ENOSPC)
set_bit(AS_ENOSPC, &mapping->flags);
else
@@ -283,21 +283,23 @@ shrink_list(struct list_head *page_list,
goto activate_locked;
}

- mapping = page->mapping;
+ mapping = page_mapping(page);

#ifdef CONFIG_SWAP
/*
- * Anonymous process memory without backing store. Try to
- * allocate it some swap space here.
+ * Anonymous process memory has backing store?
+ * Try to allocate it some swap space here.
*
* XXX: implement swap clustering ?
*/
- if (page_mapped(page) && !mapping && !PagePrivate(page)) {
+ if (PageSwapCache(page))
+ mapping = &swapper_space;
+ else if (PageAnon(page)) {
pte_chain_unlock(page);
if (!add_to_swap(page))
goto activate_locked;
pte_chain_lock(page);
- mapping = page->mapping;
+ mapping = &swapper_space;
}
#endif /* CONFIG_SWAP */

@@ -362,7 +364,9 @@ shrink_list(struct list_head *page_list,
.for_reclaim = 1,
};

- list_move(&page->list, &mapping->locked_pages);
+ if (!PageSwapCache(page))
+ list_move(&page->list,
+ &mapping->locked_pages);
spin_unlock(&mapping->page_lock);

SetPageReclaim(page);
@@ -427,7 +431,7 @@ shrink_list(struct list_head *page_list,

#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page->index };
+ swp_entry_t swap = { .val = page->private };
__delete_from_swap_cache(page);
spin_unlock(&mapping->page_lock);
swap_free(swap);
@@ -668,8 +672,7 @@ refill_inactive_zone(struct zone *zone,
* FIXME: need to consider page_count(page) here if/when we
* reap orphaned pages via the LRU (Daniel's locking stuff)
*/
- if (total_swap_pages == 0 && !page->mapping &&
- !PagePrivate(page)) {
+ if (total_swap_pages == 0 && PageAnon(page)) {
list_add(&page->lru, &l_active);
continue;
}

2004-03-18 23:41:34

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH] anobjrmap 5/6 anonmm

anobjrmap 5/6 add anonmm to track anonymous pages

Introduce a struct anonmm per mm to track anonymous pages;
all forks from one exec share the same bundle of linked anonmms,
since anonymous pages may start out in one mm but get forked into
another later. Callouts from fork.c into rmap.c allocate,
dup and exit the anonmm.

page_referenced and try_to_unmap call _anon or _obj variants
to process their lists, which in turn call _one for each vma or anonmm.
The resulting rmap.c is a lot easier to read than this patch.
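
Aside (not part of the patch): a minimal userspace model of the anonmm
bundle described above - exec_rmap starts a new bundle, dup_rmap links a
forked mm's anonmm onto the head's list, and a lookup tries the indicated
mm first before walking the rest of the bundle. Names mirror the patch,
but the refcounting, locking and list handling of the real code are
omitted; this is purely illustrative.

/*
 * Minimal userspace model of the anonmm bundle (illustrative only):
 * exec starts a bundle, fork adds a member, lookups try the indicated
 * mm first and then the rest of the bundle.
 */
#include <stdio.h>
#include <stdlib.h>

struct anonmm {
        struct anonmm *head;    /* bundle head, set at exec */
        struct anonmm *next;    /* simple singly linked bundle */
        const char *name;       /* stands in for the mm_struct */
};

static struct anonmm *exec_rmap(const char *name)
{
        struct anonmm *a = calloc(1, sizeof(*a));
        a->head = a;
        a->name = name;
        return a;
}

static struct anonmm *dup_rmap(struct anonmm *old, const char *name)
{
        struct anonmm *a = calloc(1, sizeof(*a));
        a->head = old->head;
        a->name = name;
        a->next = old->head->next;      /* insert just after the head */
        old->head->next = a;
        return a;
}

/* Walk the bundle the way page_referenced_anon does: indicated mm first. */
static void walk_bundle(struct anonmm *indicated)
{
        struct anonmm *a;

        printf("try %s first\n", indicated->name);
        for (a = indicated->head; a; a = a->next)
                if (a != indicated)
                        printf("then try %s\n", a->name);
}

int main(void)
{
        struct anonmm *parent = exec_rmap("parent");
        struct anonmm *child = dup_rmap(parent, "child");

        dup_rmap(parent, "grandchild");
        walk_bundle(child);     /* page last faulted into the child */
        return 0;
}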

include/linux/rmap.h | 13 +
include/linux/sched.h | 1
kernel/fork.c | 19 +-
mm/rmap.c | 444 ++++++++++++++++++++++++++++++++++++++------------
4 files changed, 376 insertions(+), 101 deletions(-)

--- anobjrmap4/include/linux/rmap.h 2004-03-18 21:27:15.345558840 +0000
+++ anobjrmap5/include/linux/rmap.h 2004-03-18 21:27:26.840811296 +0000
@@ -35,6 +35,14 @@ static inline void page_dup_rmap(struct
}

/*
+ * Called from kernel/fork.c to manage anonymous memory
+ */
+void init_rmap(void);
+int exec_rmap(struct mm_struct *);
+int dup_rmap(struct mm_struct *, struct mm_struct *oldmm);
+void exit_rmap(struct mm_struct *);
+
+/*
* Called from mm/vmscan.c to handle paging out
*/
int fastcall page_referenced(struct page *);
@@ -42,6 +50,11 @@ int fastcall try_to_unmap(struct page *)

#else /* !CONFIG_MMU */

+#define init_rmap() do {} while (0)
+#define exec_rmap(mm) (0)
+#define dup_rmap(mm, oldmm) (0)
+#define exit_rmap(mm) do {} while (0)
+
#define page_referenced(page) TestClearPageReferenced(page)
#define try_to_unmap(page) SWAP_FAIL

--- anobjrmap4/include/linux/sched.h 2004-03-11 01:56:07.000000000 +0000
+++ anobjrmap5/include/linux/sched.h 2004-03-18 21:27:26.842810992 +0000
@@ -199,6 +199,7 @@ struct mm_struct {
* together off init_mm.mmlist, and are protected
* by mmlist_lock
*/
+ struct anonmm *anonmm; /* For rmap to track anon mem */

unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
--- anobjrmap4/kernel/fork.c 2004-03-11 01:56:07.000000000 +0000
+++ anobjrmap5/kernel/fork.c 2004-03-18 21:27:26.844810688 +0000
@@ -31,6 +31,7 @@
#include <linux/futex.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
+#include <linux/rmap.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -417,9 +418,14 @@ struct mm_struct * mm_alloc(void)
mm = allocate_mm();
if (mm) {
memset(mm, 0, sizeof(*mm));
- return mm_init(mm);
+ mm = mm_init(mm);
+ if (mm && exec_rmap(mm)) {
+ mm_free_pgd(mm);
+ free_mm(mm);
+ mm = NULL;
+ }
}
- return NULL;
+ return mm;
}

/*
@@ -446,6 +452,7 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
exit_aio(mm);
exit_mmap(mm);
+ exit_rmap(mm);
mmdrop(mm);
}
}
@@ -550,6 +557,12 @@ static int copy_mm(unsigned long clone_f
if (!mm_init(mm))
goto fail_nomem;

+ if (dup_rmap(mm, oldmm)) {
+ mm_free_pgd(mm);
+ free_mm(mm);
+ goto fail_nomem;
+ }
+
if (init_new_context(tsk,mm))
goto fail_nocontext;

@@ -1246,4 +1259,6 @@ void __init proc_caches_init(void)
SLAB_HWCACHE_ALIGN, NULL, NULL);
if(!mm_cachep)
panic("vma_init: Cannot alloc mm_struct SLAB cache");
+
+ init_rmap();
}
--- anobjrmap4/mm/rmap.c 2004-03-18 21:27:15.362556256 +0000
+++ anobjrmap5/mm/rmap.c 2004-03-18 21:27:26.848810080 +0000
@@ -25,52 +25,173 @@
#include <linux/init.h>
#include <linux/rmap.h>

+/*
+ * struct anonmm: to track a bundle of anonymous memory mappings.
+ *
+ * Could be embedded in mm_struct, but mm_struct is rather heavyweight,
+ * and we may need the anonmm to stay around long after the mm_struct
+ * and its pgd have been freed: because pages originally faulted into
+ * that mm have been duped into forked mms, and still need tracking.
+ */
+struct anonmm {
+ atomic_t count; /* ref count, incl. 1 per page */
+ spinlock_t lock; /* head's locks list; others unused */
+ struct mm_struct *mm; /* assoc mm_struct, NULL when gone */
+ struct anonmm *head; /* exec starts new chain from head */
+ struct list_head list; /* chain of associated anonmms */
+};
+static kmem_cache_t *anonmm_cachep;
+
+/**
+ ** Functions for creating and destroying struct anonmm.
+ **/
+
+void __init init_rmap(void)
+{
+ anonmm_cachep = kmem_cache_create("anonmm",
+ sizeof(struct anonmm), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!anonmm_cachep)
+ panic("init_rmap: Cannot alloc anonmm SLAB cache");
+}
+
+int exec_rmap(struct mm_struct *mm)
+{
+ struct anonmm *anonmm;
+
+ anonmm = kmem_cache_alloc(anonmm_cachep, SLAB_KERNEL);
+ if (unlikely(!anonmm))
+ return -ENOMEM;
+
+ atomic_set(&anonmm->count, 2); /* ref by mm and head */
+ anonmm->lock = SPIN_LOCK_UNLOCKED; /* this lock is used */
+ anonmm->mm = mm;
+ anonmm->head = anonmm;
+ INIT_LIST_HEAD(&anonmm->list);
+ mm->anonmm = anonmm;
+ return 0;
+}
+
+int dup_rmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct anonmm *anonmm;
+ struct anonmm *anonhd = oldmm->anonmm->head;
+
+ anonmm = kmem_cache_alloc(anonmm_cachep, SLAB_KERNEL);
+ if (unlikely(!anonmm))
+ return -ENOMEM;
+
+ /*
+ * copy_mm calls us before dup_mmap has reset the mm fields,
+ * so reset rss ourselves before adding to anonhd's list,
+ * to keep away from this mm until it's worth examining.
+ */
+ mm->rss = 0;
+
+ atomic_set(&anonmm->count, 1); /* ref by mm */
+ anonmm->lock = SPIN_LOCK_UNLOCKED; /* this lock is not used */
+ anonmm->mm = mm;
+ anonmm->head = anonhd;
+ spin_lock(&anonhd->lock);
+ atomic_inc(&anonhd->count); /* ref by anonmm's head */
+ list_add_tail(&anonmm->list, &anonhd->list);
+ spin_unlock(&anonhd->lock);
+ mm->anonmm = anonmm;
+ return 0;
+}
+
+void exit_rmap(struct mm_struct *mm)
+{
+ struct anonmm *anonmm = mm->anonmm;
+ struct anonmm *anonhd = anonmm->head;
+
+ mm->anonmm = NULL;
+ spin_lock(&anonhd->lock);
+ anonmm->mm = NULL;
+ if (atomic_dec_and_test(&anonmm->count)) {
+ BUG_ON(anonmm == anonhd);
+ list_del(&anonmm->list);
+ kmem_cache_free(anonmm_cachep, anonmm);
+ if (atomic_dec_and_test(&anonhd->count))
+ BUG();
+ }
+ spin_unlock(&anonhd->lock);
+ if (atomic_read(&anonhd->count) == 1) {
+ BUG_ON(anonhd->mm);
+ BUG_ON(!list_empty(&anonhd->list));
+ kmem_cache_free(anonmm_cachep, anonhd);
+ }
+}
+
+static void free_anonmm(struct anonmm *anonmm)
+{
+ struct anonmm *anonhd = anonmm->head;
+
+ BUG_ON(anonmm->mm);
+ BUG_ON(anonmm == anonhd);
+ spin_lock(&anonhd->lock);
+ list_del(&anonmm->list);
+ if (atomic_dec_and_test(&anonhd->count))
+ BUG();
+ spin_unlock(&anonhd->lock);
+ kmem_cache_free(anonmm_cachep, anonmm);
+}
+
static inline void clear_page_anon(struct page *page)
{
+ struct anonmm *anonmm = (struct anonmm *) page->mapping;
+
page->mapping = NULL;
ClearPageAnon(page);
+ if (atomic_dec_and_test(&anonmm->count))
+ free_anonmm(anonmm);
}

/**
** VM stuff below this comment
**/

-/**
- * find_pte - Find a pte pointer given a vma and a struct page.
- * @vma: the vma to search
- * @page: the page to find
- *
- * Determine if this page is mapped in this vma. If it is, map and return
- * the pte pointer associated with it. Return null if the page is not
- * mapped in this vma for any reason.
- *
- * This is strictly an internal helper function for the object-based rmap
- * functions.
- *
- * It is the caller's responsibility to unmap the pte if it is returned.
+/*
+ * At what user virtual address is page expected in file-backed vma?
*/
-static inline pte_t *
-find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr)
+#define NOADDR (~0UL) /* impossible user virtual address */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long pgoff;
+ unsigned long address;
+
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ return (address >= vma->vm_start && address < vma->vm_end)?
+ address: NOADDR;
+}
+
+/**
+ ** Subfunctions of page_referenced: page_referenced_one called
+ ** repeatedly from either page_referenced_anon or page_referenced_obj.
+ **/
+
+static int page_referenced_one(struct page *page,
+ struct mm_struct *mm, unsigned long address, int *mapcount)
{
- struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
- unsigned long loffset;
- unsigned long address;
+ int referenced = 0;

- loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
- address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
- if (address < vma->vm_start || address >= vma->vm_end)
+ if (!spin_trylock(&mm->page_table_lock)) {
+ referenced++;
goto out;
+ }

pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
- goto out;
+ goto out_unlock;

pmd = pmd_offset(pgd, address);
if (!pmd_present(*pmd))
- goto out;
+ goto out_unlock;

pte = pte_offset_map(pmd, address);
if (!pte_present(*pte))
@@ -79,53 +200,59 @@ find_pte(struct vm_area_struct *vma, str
if (page_to_pfn(page) != pte_pfn(*pte))
goto out_unmap;

- if (addr)
- *addr = address;
+ if (ptep_test_and_clear_young(pte))
+ referenced++;

- return pte;
+ (*mapcount)--;

out_unmap:
pte_unmap(pte);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+
out:
- return NULL;
+ return referenced;
}

-/**
- * page_referenced_obj_one - referenced check for object-based rmap
- * @vma: the vma to look in.
- * @page: the page we're working on.
- *
- * Find a pte entry for a page/vma pair, then check and clear the referenced
- * bit.
- *
- * This is strictly a helper function for page_referenced_obj.
- */
-static int
-page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
+static inline int page_referenced_anon(struct page *page, int *mapcount)
{
- struct mm_struct *mm = vma->vm_mm;
- pte_t *pte;
+ struct anonmm *anonmm = (struct anonmm *) page->mapping;
+ struct anonmm *anonhd = anonmm->head;
+ struct list_head *seek_head;
int referenced = 0;

- if (!spin_trylock(&mm->page_table_lock))
- return 1;
-
- pte = find_pte(vma, page, NULL);
- if (pte) {
- if (ptep_test_and_clear_young(pte))
- referenced++;
- pte_unmap(pte);
+ spin_lock(&anonhd->lock);
+ /*
+ * First try the indicated mm, it's the most likely.
+ */
+ if (anonmm->mm && anonmm->mm->rss) {
+ referenced += page_referenced_one(
+ page, anonmm->mm, page->index, mapcount);
+ if (!*mapcount)
+ goto out;
}

- spin_unlock(&mm->page_table_lock);
+ /*
+ * Then down the rest of the list, from that as the head. Stop
+ * when we reach anonhd? No: although a page cannot get dup'ed
+ * into an older mm, once swapped, its indicated mm may not be
+ * the oldest, just the first into which it was faulted back.
+ */
+ seek_head = &anonmm->list;
+ list_for_each_entry(anonmm, seek_head, list) {
+ if (!anonmm->mm || !anonmm->mm->rss)
+ continue;
+ referenced += page_referenced_one(
+ page, anonmm->mm, page->index, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+out:
+ spin_unlock(&anonhd->lock);
return referenced;
}

-static inline int page_referenced_anon(struct page *page)
-{
- return 0; /* until next patch */
-}
-
/**
* page_referenced_obj - referenced check for object-based rmap
* @page: the page we're checking references on.
@@ -140,24 +267,41 @@ static inline int page_referenced_anon(s
* The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
* assume a reference count of 1.
*/
-static int
-page_referenced_obj(struct page *page)
+static inline int page_referenced_obj(struct page *page, int *mapcount)
{
struct address_space *mapping = page->mapping;
struct vm_area_struct *vma;
+ unsigned long address;
int referenced = 0;

if (down_trylock(&mapping->i_shared_sem))
return 1;
-
- list_for_each_entry(vma, &mapping->i_mmap, shared)
- referenced += page_referenced_obj_one(vma, page);

- list_for_each_entry(vma, &mapping->i_mmap_shared, shared)
- referenced += page_referenced_obj_one(vma, page);
+ list_for_each_entry(vma, &mapping->i_mmap, shared) {
+ if (!vma->vm_mm->rss)
+ continue;
+ address = vma_address(page, vma);
+ if (address != NOADDR) {
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ }

+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ if (!vma->vm_mm->rss)
+ continue;
+ address = vma_address(page, vma);
+ if (address != NOADDR) {
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ }
+out:
up(&mapping->i_shared_sem);
-
return referenced;
}

@@ -166,11 +310,12 @@ page_referenced_obj(struct page *page)
* @page: the page to test
*
* Quick test_and_clear_referenced for all mappings to a page,
- * returns the number of processes which referenced the page.
+ * returns the number of ptes which referenced the page.
* Caller needs to hold the rmap_lock.
*/
int fastcall page_referenced(struct page * page)
{
+ int mapcount = page->mapcount;
int referenced = 0;

if (page_test_and_clear_young(page))
@@ -181,9 +326,9 @@ int fastcall page_referenced(struct page

if (page->mapcount && page->mapping) {
if (PageAnon(page))
- referenced += page_referenced_anon(page);
+ referenced += page_referenced_anon(page, &mapcount);
else
- referenced += page_referenced_obj(page);
+ referenced += page_referenced_obj(page, &mapcount);
}
return referenced;
}
@@ -199,14 +344,18 @@ int fastcall page_referenced(struct page
void fastcall page_add_anon_rmap(struct page *page,
struct mm_struct *mm, unsigned long address)
{
+ struct anonmm *anonmm;
+
BUG_ON(PageReserved(page));
BUG_ON(page_mapping(page));

rmap_lock(page);
if (!page->mapcount) {
+ anonmm = mm->anonmm;
SetPageAnon(page);
page->index = address & PAGE_MASK;
- page->mapping = (void *) mm; /* until next patch */
+ page->mapping = (void *) anonmm;
+ atomic_inc(&anonmm->count);
inc_page_state(nr_mapped);
}
page->mapcount++;
@@ -227,13 +376,28 @@ void fastcall page_add_anon_rmap(struct
void fastcall page_update_anon_rmap(struct page *page,
struct mm_struct *mm, unsigned long address)
{
+ struct anonmm *anonmm;
+
BUG_ON(!PageAnon(page));
if (page->mapcount != 1)
return;

+ anonmm = mm->anonmm;
+ address &= PAGE_MASK;
+ if (anonmm == (struct anonmm *) page->mapping &&
+ address == page->index)
+ return;
+
rmap_lock(page);
- page->index = address & PAGE_MASK;
- page->mapping = (void *) mm; /* until next patch */
+ if (page->mapcount == 1) {
+ page->index = address;
+ if (anonmm != (struct anonmm *) page->mapping) {
+ clear_page_anon(page);
+ SetPageAnon(page);
+ page->mapping = (void *) anonmm;
+ atomic_inc(&anonmm->count);
+ }
+ }
rmap_unlock(page);
}

@@ -280,37 +444,71 @@ void fastcall page_remove_rmap(struct pa
}

/**
- * try_to_unmap_obj_one - unmap a page using the object-based rmap method
- * @page: the page to unmap
- *
- * Determine whether a page is mapped in a given vma and unmap it if it's found.
- *
- * This function is strictly a helper function for try_to_unmap_obj.
- */
-static inline int
-try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page)
+ ** Subfunctions of try_to_unmap: try_to_unmap_one called
+ ** repeatedly from either try_to_unmap_anon or try_to_unmap_obj.
+ **/
+
+static int try_to_unmap_one(struct page *page, struct mm_struct *mm,
+ unsigned long address, int *mapcount, struct vm_area_struct *vma)
{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long address;
+ pgd_t *pgd;
+ pmd_t *pmd;
pte_t *pte;
pte_t pteval;
int ret = SWAP_AGAIN;

+ /*
+ * We need the page_table_lock to protect us from page faults,
+ * munmap, fork, etc...
+ */
if (!spin_trylock(&mm->page_table_lock))
- return ret;
-
- pte = find_pte(vma, page, &address);
- if (!pte)
goto out;

- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out_unlock;
+
+ pmd = pmd_offset(pgd, address);
+ if (!pmd_present(*pmd))
+ goto out_unlock;
+
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte))
+ goto out_unmap;
+
+ if (page_to_pfn(page) != pte_pfn(*pte))
+ goto out_unmap;
+
+ (*mapcount)--;
+
+ /*
+ * If the page is mlock()d, we cannot swap it out.
+ * During mremap, it's possible pages are not in a VMA.
+ */
+ if (!vma)
+ vma = find_vma(mm, address);
+ if (!vma || (vma->vm_flags & (VM_LOCKED|VM_RESERVED))) {
ret = SWAP_FAIL;
goto out_unmap;
}

+ /* Nuke the page table entry. */
flush_cache_page(vma, address);
pteval = ptep_clear_flush(vma, address, pte);

+ if (PageAnon(page)) {
+ swp_entry_t entry = { .val = page->private };
+ /*
+ * Store the swap location in the pte.
+ * See handle_pte_fault() ...
+ */
+ BUG_ON(!PageSwapCache(page));
+ swap_duplicate(entry);
+ set_pte(pte, swp_entry_to_pte(entry));
+ BUG_ON(pte_file(*pte));
+ }
+
+ /* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);

@@ -322,14 +520,49 @@ try_to_unmap_obj_one(struct vm_area_stru
out_unmap:
pte_unmap(pte);

-out:
+out_unlock:
spin_unlock(&mm->page_table_lock);
+
+out:
return ret;
}

-static inline int try_to_unmap_anon(struct page *page)
+static inline int try_to_unmap_anon(struct page *page, int *mapcount)
{
- return SWAP_FAIL; /* until next patch */
+ struct anonmm *anonmm = (struct anonmm *) page->mapping;
+ struct anonmm *anonhd = anonmm->head;
+ struct list_head *seek_head;
+ int ret = SWAP_AGAIN;
+
+ spin_lock(&anonhd->lock);
+ /*
+ * First try the indicated mm, it's the most likely.
+ */
+ if (anonmm->mm && anonmm->mm->rss) {
+ ret = try_to_unmap_one(
+ page, anonmm->mm, page->index, mapcount, NULL);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
+
+ /*
+ * Then down the rest of the list, from that as the head. Stop
+ * when we reach anonhd? No: although a page cannot get dup'ed
+ * into an older mm, once swapped, its indicated mm may not be
+ * the oldest, just the first into which it was faulted back.
+ */
+ seek_head = &anonmm->list;
+ list_for_each_entry(anonmm, seek_head, list) {
+ if (!anonmm->mm || !anonmm->mm->rss)
+ continue;
+ ret = try_to_unmap_one(
+ page, anonmm->mm, page->index, mapcount, NULL);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
+out:
+ spin_unlock(&anonhd->lock);
+ return ret;
}

/**
@@ -344,26 +577,38 @@ static inline int try_to_unmap_anon(stru
* The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
* return a temporary error.
*/
-static int
-try_to_unmap_obj(struct page *page)
+static inline int try_to_unmap_obj(struct page *page, int *mapcount)
{
struct address_space *mapping = page->mapping;
struct vm_area_struct *vma;
+ unsigned long address;
int ret = SWAP_AGAIN;

if (down_trylock(&mapping->i_shared_sem))
return ret;
-
+
list_for_each_entry(vma, &mapping->i_mmap, shared) {
- ret = try_to_unmap_obj_one(vma, page);
- if (ret == SWAP_FAIL || !page->mapcount)
- goto out;
+ if (!vma->vm_mm->rss)
+ continue;
+ address = vma_address(page, vma);
+ if (address != NOADDR) {
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
}

list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- ret = try_to_unmap_obj_one(vma, page);
- if (ret == SWAP_FAIL || !page->mapcount)
- goto out;
+ if (!vma->vm_mm->rss)
+ continue;
+ address = vma_address(page, vma);
+ if (address != NOADDR) {
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
}

out:
@@ -385,6 +630,7 @@ out:
*/
int fastcall try_to_unmap(struct page * page)
{
+ int mapcount = page->mapcount;
int ret;

BUG_ON(PageReserved(page));
@@ -392,9 +638,9 @@ int fastcall try_to_unmap(struct page *
BUG_ON(!page->mapcount);

if (PageAnon(page))
- ret = try_to_unmap_anon(page);
+ ret = try_to_unmap_anon(page, &mapcount);
else
- ret = try_to_unmap_obj(page);
+ ret = try_to_unmap_obj(page, &mapcount);

if (!page->mapcount) {
if (page_test_and_clear_dirty(page))

2004-03-18 23:47:36

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH] anobjrmap 4/6 no pte_chains

anobjrmap 4/6 remove pte-pointer-based rmap

Lots of deletions: the next patch will put in the new anon rmap;
I expect it to look clearer if we first remove all of the old
pte-pointer-based anon rmap in this patch - which leaves
anonymous rmap totally disabled, with anon pages locked in
memory until the process frees them.

A few additions: the previous patch brought ClearPageAnon into
rmap.c instead of leaving it to the final page free; but I think
there'd be a race with swapin or swapoff doing SetPageAnon,
so SetPageAnon is now done under lock within page_add_anon_rmap.
That lock is now called rmap_lock instead of pte_chain_lock.
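
Aside (not part of the patch): a tiny userspace sketch of that rule,
with a pthread mutex standing in for the PG_rmaplock bit spinlock -
SetPageAnon and page->mapping are only set up when mapcount goes from
0 to 1, and only under the page's rmap lock, so nothing can observe a
half-initialised anon page. The struct and helpers below are simplified
stand-ins, not the kernel code.

#include <pthread.h>
#include <stdio.h>

struct page {
        pthread_mutex_t rmap_lock;      /* stands in for PG_rmaplock */
        int mapcount;                   /* stands in for page->mapcount */
        int anon;                       /* stands in for PG_anon */
        void *mapping;                  /* anonmm pointer once anon */
};

static void page_add_anon_rmap(struct page *page, void *anonmm)
{
        pthread_mutex_lock(&page->rmap_lock);
        if (!page->mapcount) {          /* first mapper sets up anon state */
                page->anon = 1;
                page->mapping = anonmm;
        }
        page->mapcount++;
        pthread_mutex_unlock(&page->rmap_lock);
}

static void page_remove_rmap(struct page *page)
{
        pthread_mutex_lock(&page->rmap_lock);
        if (!--page->mapcount) {        /* last mapper tears it down */
                page->anon = 0;
                page->mapping = NULL;
        }
        pthread_mutex_unlock(&page->rmap_lock);
}

int main(void)
{
        struct page page = { PTHREAD_MUTEX_INITIALIZER, 0, 0, NULL };
        static int anonmm_stub;         /* stands in for mm->anonmm */

        page_add_anon_rmap(&page, &anonmm_stub);
        printf("mapcount %d anon %d\n", page.mapcount, page.anon);
        page_remove_rmap(&page);
        printf("mapcount %d anon %d\n", page.mapcount, page.anon);
        return 0;
}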

fs/exec.c | 28 --
include/linux/mm.h | 20 -
include/linux/page-flags.h | 11
include/linux/rmap.h | 33 +-
init/main.c | 2
mm/fremap.c | 17 -
mm/memory.c | 112 +-------
mm/mremap.c | 48 +--
mm/nommu.c | 4
mm/page_alloc.c | 9
mm/rmap.c | 598 +++++++--------------------------------------
mm/swapfile.c | 41 ---
mm/vmscan.c | 22 -
13 files changed, 207 insertions(+), 738 deletions(-)

--- anobjrmap3/fs/exec.c 2004-03-18 21:26:52.270066848 +0000
+++ anobjrmap4/fs/exec.c 2004-03-18 21:27:15.341559448 +0000
@@ -293,54 +293,46 @@ EXPORT_SYMBOL(copy_strings_kernel);
* This routine is used to map in a page into an address space: needed by
* execve() for the initial stack and environment pages.
*
- * tsk->mmap_sem is held for writing.
+ * tsk->mm->mmap_sem is held for writing.
*/
void put_dirty_page(struct task_struct *tsk, struct page *page,
unsigned long address, pgprot_t prot)
{
+ struct mm_struct *mm = tsk->mm;
pgd_t * pgd;
pmd_t * pmd;
pte_t * pte;
- struct pte_chain *pte_chain;

if (page_count(page) != 1)
printk(KERN_ERR "mem_map disagrees with %p at %08lx\n",
page, address);

- pgd = pgd_offset(tsk->mm, address);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto out_sig;
- spin_lock(&tsk->mm->page_table_lock);
- pmd = pmd_alloc(tsk->mm, pgd, address);
+ pgd = pgd_offset(mm, address);
+ spin_lock(&mm->page_table_lock);
+ pmd = pmd_alloc(mm, pgd, address);
if (!pmd)
goto out;
- pte = pte_alloc_map(tsk->mm, pmd, address);
+ pte = pte_alloc_map(mm, pmd, address);
if (!pte)
goto out;
if (!pte_none(*pte)) {
pte_unmap(pte);
goto out;
}
+ mm->rss++;
lru_cache_add_active(page);
flush_dcache_page(page);
- SetPageAnon(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
- pte_chain = page_add_rmap(page, pte, pte_chain);
+ page_add_anon_rmap(page, mm, address);
pte_unmap(pte);
- tsk->mm->rss++;
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);

/* no need for flush_tlb */
- pte_chain_free(pte_chain);
return;
out:
- spin_unlock(&tsk->mm->page_table_lock);
-out_sig:
+ spin_unlock(&mm->page_table_lock);
__free_page(page);
force_sig(SIGKILL, tsk);
- pte_chain_free(pte_chain);
- return;
}

int setup_arg_pages(struct linux_binprm *bprm)
--- anobjrmap3/include/linux/mm.h 2004-03-18 21:27:03.810312464 +0000
+++ anobjrmap4/include/linux/mm.h 2004-03-18 21:27:15.342559296 +0000
@@ -147,8 +147,6 @@ struct vm_operations_struct {
int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
};

-/* forward declaration; pte_chain is meant to be internal to rmap.c */
-struct pte_chain;
struct mmu_gather;
struct inode;

@@ -171,17 +169,12 @@ struct page {
unsigned long flags; /* atomic flags, some possibly
updated asynchronously */
atomic_t count; /* Usage count, see below. */
+ int mapcount; /* rmap counts ptes mapped in mms */
struct list_head list; /* ->mapping has some page lists. */
struct address_space *mapping; /* The inode (or ...) we belong to. */
unsigned long index; /* Our offset within mapping. */
struct list_head lru; /* Pageout list, eg. active_list;
protected by zone->lru_lock !! */
- union {
- struct pte_chain *chain;/* Reverse pte mapping pointer.
- * protected by PG_chainlock */
- pte_addr_t direct;
- int mapcount;
- } pte;
unsigned long private; /* mapping-private opaque data */

/*
@@ -404,16 +397,7 @@ void page_address_init(void);
* refers to user virtual address space into which the page is mapped.
*/
#define page_mapping(page) (PageAnon(page)? NULL: (page)->mapping)
-
-/*
- * Return true if this page is mapped into pagetables. Subtle: test pte.direct
- * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain
- * is only 32-bit.
- */
-static inline int page_mapped(struct page *page)
-{
- return page->pte.direct != 0;
-}
+#define page_mapped(page) ((page)->mapcount != 0)

/*
* Error return values for the *_nopage functions
--- anobjrmap3/include/linux/page-flags.h 2004-03-18 21:27:03.812312160 +0000
+++ anobjrmap4/include/linux/page-flags.h 2004-03-18 21:27:15.344558992 +0000
@@ -69,15 +69,14 @@
#define PG_private 12 /* Has something at ->private */
#define PG_writeback 13 /* Page is under writeback */
#define PG_nosave 14 /* Used for system suspend/resume */
-#define PG_chainlock 15 /* lock bit for ->pte_chain */
+#define PG_rmaplock 15 /* Lock bit for reversing to ptes */

-#define PG_direct 16 /* ->pte_chain points directly at pte */
+#define PG_swapcache 16 /* Swap page: swp_entry_t in private */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */

#define PG_anon 20 /* Anonymous page: anonmm in mapping */
-#define PG_swapcache 21 /* Swap page: swp_entry_t in private */

/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -284,12 +283,6 @@ extern void get_full_page_state(struct p
#define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags)
#define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags)

-#define PageDirect(page) test_bit(PG_direct, &(page)->flags)
-#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags)
-#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags)
-#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags)
-#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags)
-
#define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags)
#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
--- anobjrmap3/include/linux/rmap.h 2004-03-18 21:27:03.813312008 +0000
+++ anobjrmap4/include/linux/rmap.h 2004-03-18 21:27:15.345558840 +0000
@@ -8,25 +8,32 @@
#include <linux/config.h>
#include <linux/linkage.h>

-#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &(page)->flags)
-#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &(page)->flags)
+#define rmap_lock(page) bit_spin_lock(PG_rmaplock, &(page)->flags)
+#define rmap_unlock(page) bit_spin_unlock(PG_rmaplock, &(page)->flags)

#ifdef CONFIG_MMU

-struct pte_chain;
-struct pte_chain *pte_chain_alloc(int gfp_flags);
-void __pte_chain_free(struct pte_chain *pte_chain);
-
-static inline void pte_chain_free(struct pte_chain *pte_chain)
+void fastcall page_add_anon_rmap(struct page *,
+ struct mm_struct *, unsigned long addr);
+void fastcall page_update_anon_rmap(struct page *,
+ struct mm_struct *, unsigned long addr);
+void fastcall page_add_obj_rmap(struct page *);
+void fastcall page_remove_rmap(struct page *);
+
+/**
+ * page_dup_rmap - duplicate pte mapping to a page
+ * @page: the page to add the mapping to
+ *
+ * For copy_page_range only: minimal extract from page_add_rmap,
+ * avoiding unnecessary tests (already checked) so it's quicker.
+ */
+static inline void page_dup_rmap(struct page *page)
{
- if (pte_chain)
- __pte_chain_free(pte_chain);
+ rmap_lock(page);
+ page->mapcount++;
+ rmap_unlock(page);
}

-struct pte_chain * fastcall
- page_add_rmap(struct page *, pte_t *, struct pte_chain *);
-void fastcall page_remove_rmap(struct page *, pte_t *);
-
/*
* Called from mm/vmscan.c to handle paging out
*/
--- anobjrmap3/init/main.c 2004-03-16 07:00:20.000000000 +0000
+++ anobjrmap4/init/main.c 2004-03-18 21:27:15.346558688 +0000
@@ -84,7 +84,6 @@ extern void signals_init(void);
extern void buffer_init(void);
extern void pidhash_init(void);
extern void pidmap_init(void);
-extern void pte_chain_init(void);
extern void radix_tree_init(void);
extern void free_initmem(void);
extern void populate_rootfs(void);
@@ -457,7 +456,6 @@ asmlinkage void __init start_kernel(void
calibrate_delay();
pidmap_init();
pgtable_cache_init();
- pte_chain_init();
#ifdef CONFIG_X86
if (efi_enabled)
efi_enter_virtual_mode();
--- anobjrmap3/mm/fremap.c 2004-03-18 21:27:03.817311400 +0000
+++ anobjrmap4/mm/fremap.c 2004-03-18 21:27:15.347558536 +0000
@@ -36,7 +36,7 @@ static inline void zap_pte(struct mm_str
if (!PageReserved(page)) {
if (pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page, ptep);
+ page_remove_rmap(page);
page_cache_release(page);
mm->rss--;
}
@@ -49,7 +49,7 @@ static inline void zap_pte(struct mm_str
}

/*
- * Install a page to a given virtual memory address, release any
+ * Install a file page to a given virtual memory address, release any
* previously existing mapping.
*/
int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -60,11 +60,12 @@ int install_page(struct mm_struct *mm, s
pgd_t *pgd;
pmd_t *pmd;
pte_t pte_val;
- struct pte_chain *pte_chain;

- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto err;
+ /*
+ * We use page_add_obj_rmap below: if install_page is
+ * ever extended to anonymous pages, this will warn us.
+ */
+ BUG_ON(!page_mapping(page));

pgd = pgd_offset(mm, addr);
spin_lock(&mm->page_table_lock);
@@ -82,7 +83,7 @@ int install_page(struct mm_struct *mm, s
mm->rss++;
flush_icache_page(vma, page);
set_pte(pte, mk_pte(page, prot));
- pte_chain = page_add_rmap(page, pte, pte_chain);
+ page_add_obj_rmap(page);
pte_val = *pte;
pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
@@ -90,8 +91,6 @@ int install_page(struct mm_struct *mm, s
err = 0;
err_unlock:
spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
-err:
return err;
}
EXPORT_SYMBOL(install_page);
--- anobjrmap3/mm/memory.c 2004-03-18 21:27:03.820310944 +0000
+++ anobjrmap4/mm/memory.c 2004-03-18 21:27:15.351557928 +0000
@@ -217,20 +217,10 @@ int copy_page_range(struct mm_struct *ds
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long cow;
- struct pte_chain *pte_chain = NULL;

if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst, src, vma);

- pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN);
- if (!pte_chain) {
- spin_unlock(&dst->page_table_lock);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- spin_lock(&dst->page_table_lock);
- if (!pte_chain)
- goto nomem;
- }
-
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -329,32 +319,8 @@ skip_copy_pte_range:
pte = pte_mkold(pte);
get_page(page);
dst->rss++;
-
set_pte(dst_pte, pte);
- pte_chain = page_add_rmap(page, dst_pte,
- pte_chain);
- if (pte_chain)
- goto cont_copy_pte_range_noset;
- pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN);
- if (pte_chain)
- goto cont_copy_pte_range_noset;
-
- /*
- * pte_chain allocation failed, and we need to
- * run page reclaim.
- */
- pte_unmap_nested(src_pte);
- pte_unmap(dst_pte);
- spin_unlock(&src->page_table_lock);
- spin_unlock(&dst->page_table_lock);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- spin_lock(&dst->page_table_lock);
- if (!pte_chain)
- goto nomem;
- spin_lock(&src->page_table_lock);
- dst_pte = pte_offset_map(dst_pmd, address);
- src_pte = pte_offset_map_nested(src_pmd,
- address);
+ page_dup_rmap(page);
cont_copy_pte_range_noset:
address += PAGE_SIZE;
if (address >= end) {
@@ -377,10 +343,8 @@ cont_copy_pmd_range:
out_unlock:
spin_unlock(&src->page_table_lock);
out:
- pte_chain_free(pte_chain);
return 0;
nomem:
- pte_chain_free(pte_chain);
return -ENOMEM;
}

@@ -421,7 +385,7 @@ zap_pte_range(struct mmu_gather *tlb, pm
page_mapping(page))
mark_page_accessed(page);
tlb->freed++;
- page_remove_rmap(page, ptep);
+ page_remove_rmap(page);
tlb_remove_page(tlb, page);
}
}
@@ -1014,7 +978,6 @@ static int do_wp_page(struct mm_struct *
{
struct page *old_page, *new_page;
unsigned long pfn = pte_pfn(pte);
- struct pte_chain *pte_chain;
pte_t entry;

if (unlikely(!pfn_valid(pfn))) {
@@ -1039,6 +1002,14 @@ static int do_wp_page(struct mm_struct *
entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
vma);
ptep_establish(vma, address, page_table, entry);
+ if (PageAnon(old_page)) {
+ /*
+ * Optimization: the page may have been
+ * registered under a long defunct mm:
+ * now we know it belongs only to this.
+ */
+ page_update_anon_rmap(old_page, mm, address);
+ }
update_mmu_cache(vma, address, entry);
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
@@ -1053,9 +1024,6 @@ static int do_wp_page(struct mm_struct *
page_cache_get(old_page);
spin_unlock(&mm->page_table_lock);

- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto no_pte_chain;
new_page = alloc_page(GFP_HIGHUSER);
if (!new_page)
goto no_new_page;
@@ -1069,11 +1037,11 @@ static int do_wp_page(struct mm_struct *
if (pte_same(*page_table, pte)) {
if (PageReserved(old_page))
++mm->rss;
- page_remove_rmap(old_page, page_table);
+ else
+ page_remove_rmap(old_page);
break_cow(vma, new_page, address, page_table);
- SetPageAnon(new_page);
- pte_chain = page_add_rmap(new_page, page_table, pte_chain);
lru_cache_add_active(new_page);
+ page_add_anon_rmap(new_page, mm, address);

/* Free the old page.. */
new_page = old_page;
@@ -1082,12 +1050,9 @@ static int do_wp_page(struct mm_struct *
page_cache_release(new_page);
page_cache_release(old_page);
spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
return VM_FAULT_MINOR;

no_new_page:
- pte_chain_free(pte_chain);
-no_pte_chain:
page_cache_release(old_page);
return VM_FAULT_OOM;
}
@@ -1245,7 +1210,6 @@ static int do_swap_page(struct mm_struct
swp_entry_t entry = pte_to_swp_entry(orig_pte);
pte_t pte;
int ret = VM_FAULT_MINOR;
- struct pte_chain *pte_chain = NULL;

pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
@@ -1275,11 +1239,6 @@ static int do_swap_page(struct mm_struct
}

mark_page_accessed(page);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain) {
- ret = VM_FAULT_OOM;
- goto out;
- }
lock_page(page);

/*
@@ -1311,15 +1270,13 @@ static int do_swap_page(struct mm_struct

flush_icache_page(vma, page);
set_pte(page_table, pte);
- SetPageAnon(page);
- pte_chain = page_add_rmap(page, page_table, pte_chain);
+ page_add_anon_rmap(page, mm, address);

/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
out:
- pte_chain_free(pte_chain);
return ret;
}

@@ -1335,20 +1292,7 @@ do_anonymous_page(struct mm_struct *mm,
{
pte_t entry;
struct page * page = ZERO_PAGE(addr);
- struct pte_chain *pte_chain;
- int ret;

- pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN);
- if (!pte_chain) {
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto no_mem;
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, addr);
- }
-
/* Read-only mapping of ZERO_PAGE. */
entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));

@@ -1370,7 +1314,6 @@ do_anonymous_page(struct mm_struct *mm,
pte_unmap(page_table);
page_cache_release(page);
spin_unlock(&mm->page_table_lock);
- ret = VM_FAULT_MINOR;
goto out;
}
mm->rss++;
@@ -1379,25 +1322,19 @@ do_anonymous_page(struct mm_struct *mm,
vma);
lru_cache_add_active(page);
mark_page_accessed(page);
- SetPageAnon(page);
+ page_add_anon_rmap(page, mm, addr);
}

set_pte(page_table, entry);
- /* ignores ZERO_PAGE */
- pte_chain = page_add_rmap(page, page_table, pte_chain);
pte_unmap(page_table);

/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
spin_unlock(&mm->page_table_lock);
- ret = VM_FAULT_MINOR;
- goto out;
-
-no_mem:
- ret = VM_FAULT_OOM;
out:
- pte_chain_free(pte_chain);
- return ret;
+ return VM_FAULT_MINOR;
+no_mem:
+ return VM_FAULT_OOM;
}

/*
@@ -1419,7 +1356,6 @@ do_no_page(struct mm_struct *mm, struct
struct page * new_page;
struct address_space *mapping = NULL;
pte_t entry;
- struct pte_chain *pte_chain;
int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
@@ -1444,10 +1380,6 @@ retry:
if (new_page == NOPAGE_OOM)
return VM_FAULT_OOM;

- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto oom;
-
/*
* Should we do an early C-O-W break?
*/
@@ -1472,7 +1404,6 @@ retry:
sequence = atomic_read(&mapping->truncate_count);
spin_unlock(&mm->page_table_lock);
page_cache_release(new_page);
- pte_chain_free(pte_chain);
goto retry;
}
page_table = pte_offset_map(pmd, address);
@@ -1497,10 +1428,10 @@ retry:
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte(page_table, entry);
if (anon) {
- SetPageAnon(new_page);
lru_cache_add_active(new_page);
- }
- pte_chain = page_add_rmap(new_page, page_table, pte_chain);
+ page_add_anon_rmap(new_page, mm, address);
+ } else
+ page_add_obj_rmap(new_page);
pte_unmap(page_table);
} else {
/* One of our sibling threads was faster, back out. */
@@ -1518,7 +1449,6 @@ oom:
page_cache_release(new_page);
ret = VM_FAULT_OOM;
out:
- pte_chain_free(pte_chain);
return ret;
}

--- anobjrmap3/mm/mremap.c 2004-03-18 21:26:52.286064416 +0000
+++ anobjrmap4/mm/mremap.c 2004-03-18 21:27:15.353557624 +0000
@@ -79,31 +79,29 @@ static inline pte_t *alloc_one_pte_map(s
return pte;
}

-static int
+static void
copy_one_pte(struct vm_area_struct *vma, unsigned long old_addr,
- pte_t *src, pte_t *dst, struct pte_chain **pte_chainp)
+ unsigned long new_addr, pte_t *src, pte_t *dst)
{
- int error = 0;
pte_t pte;
- struct page *page = NULL;

- if (pte_present(*src))
- page = pte_page(*src);
+ pte = ptep_clear_flush(vma, old_addr, src);
+ set_pte(dst, pte);

- if (!pte_none(*src)) {
- if (page)
- page_remove_rmap(page, src);
- pte = ptep_clear_flush(vma, old_addr, src);
- if (!dst) {
- /* No dest? We must put it back. */
- dst = src;
- error++;
+ /*
+ * This block handles a common case, but is grossly inadequate
+ * for the general case: what if the anon page is shared with
+ * parent or child? what if it's currently swapped out?
+ * Return to handle mremap moving rmap in a later patch.
+ */
+ if (pte_present(pte)) {
+ unsigned long pfn = pte_pfn(pte);
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+ if (PageAnon(page))
+ page_update_anon_rmap(page, vma->vm_mm, new_addr);
}
- set_pte(dst, pte);
- if (page)
- *pte_chainp = page_add_rmap(page, dst, *pte_chainp);
}
- return error;
}

static int
@@ -113,13 +111,7 @@ move_one_page(struct vm_area_struct *vma
struct mm_struct *mm = vma->vm_mm;
int error = 0;
pte_t *src, *dst;
- struct pte_chain *pte_chain;

- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain) {
- error = -ENOMEM;
- goto out;
- }
spin_lock(&mm->page_table_lock);
src = get_one_pte_map_nested(mm, old_addr);
if (src) {
@@ -140,15 +132,15 @@ move_one_page(struct vm_area_struct *vma
* page_table_lock, we should re-check the src entry...
*/
if (src) {
- error = copy_one_pte(vma, old_addr, src,
- dst, &pte_chain);
+ if (dst)
+ copy_one_pte(vma, old_addr, new_addr, src, dst);
+ else
+ error = -ENOMEM;
pte_unmap_nested(src);
}
pte_unmap(dst);
}
spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
-out:
return error;
}

--- anobjrmap3/mm/nommu.c 2004-02-04 02:45:41.000000000 +0000
+++ anobjrmap4/mm/nommu.c 2004-03-18 21:27:15.354557472 +0000
@@ -567,7 +567,3 @@ unsigned long get_unmapped_area(struct f
{
return -ENOMEM;
}
-
-void pte_chain_init(void)
-{
-}
--- anobjrmap3/mm/page_alloc.c 2004-03-18 21:27:03.824310336 +0000
+++ anobjrmap4/mm/page_alloc.c 2004-03-18 21:27:15.356557168 +0000
@@ -83,8 +83,7 @@ static void bad_page(const char *functio
1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
- 1 << PG_chainlock |
- 1 << PG_direct |
+ 1 << PG_rmaplock |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback);
@@ -224,8 +223,7 @@ static inline void free_pages_check(cons
1 << PG_active |
1 << PG_reclaim |
1 << PG_slab |
- 1 << PG_chainlock |
- 1 << PG_direct |
+ 1 << PG_rmaplock |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback )))
@@ -335,8 +333,7 @@ static void prep_new_page(struct page *p
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
- 1 << PG_chainlock |
- 1 << PG_direct |
+ 1 << PG_rmaplock |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback )))
--- anobjrmap3/mm/rmap.c 2004-03-18 21:27:03.828309728 +0000
+++ anobjrmap4/mm/rmap.c 2004-03-18 21:27:15.362556256 +0000
@@ -4,17 +4,14 @@
* Copyright 2001, Rik van Riel <[email protected]>
* Released under the General Public License (GPL).
*
- *
- * Simple, low overhead pte-based reverse mapping scheme.
- * This is kept modular because we may want to experiment
- * with object-based reverse mapping schemes. Please try
- * to keep this thing as modular as possible.
+ * Simple, low overhead reverse mapping scheme.
+ * Please try to keep this thing as modular as possible.
*/

/*
* Locking:
- * - the page->pte.chain is protected by the PG_chainlock bit,
- * which nests within the the mm->page_table_lock,
+ * - the page->mapcount field is protected by the PG_rmaplock bit,
+ * which nests within the mm->page_table_lock,
* which nests within the page lock.
* - because swapout locking is opposite to the locking order
* in the page fault path, the swapout path uses trylocks
@@ -27,87 +24,13 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
-#include <linux/cache.h>
-#include <linux/percpu.h>
-
-#include <asm/pgalloc.h>
-#include <asm/rmap.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-
-/*
- * Something oopsable to put for now in the page->mapping
- * of an anonymous page, to test that it is ignored.
- */
-#define ANON_MAPPING_DEBUG ((struct address_space *) 1)

static inline void clear_page_anon(struct page *page)
{
- BUG_ON(page->mapping != ANON_MAPPING_DEBUG);
page->mapping = NULL;
ClearPageAnon(page);
}

-/*
- * Shared pages have a chain of pte_chain structures, used to locate
- * all the mappings to this page. We only need a pointer to the pte
- * here, the page struct for the page table page contains the process
- * it belongs to and the offset within that process.
- *
- * We use an array of pte pointers in this structure to minimise cache misses
- * while traversing reverse maps.
- */
-#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
-
-/*
- * next_and_idx encodes both the address of the next pte_chain and the
- * offset of the highest-index used pte in ptes[].
- */
-struct pte_chain {
- unsigned long next_and_idx;
- pte_addr_t ptes[NRPTE];
-} ____cacheline_aligned;
-
-kmem_cache_t *pte_chain_cache;
-
-static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
-{
- return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
-}
-
-static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
-{
- return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
-}
-
-static inline int pte_chain_idx(struct pte_chain *pte_chain)
-{
- return pte_chain->next_and_idx & NRPTE;
-}
-
-static inline unsigned long
-pte_chain_encode(struct pte_chain *pte_chain, int idx)
-{
- return (unsigned long)pte_chain | idx;
-}
-
-/*
- * pte_chain list management policy:
- *
- * - If a page has a pte_chain list then it is shared by at least two processes,
- * because a single sharing uses PageDirect. (Well, this isn't true yet,
- * coz this code doesn't collapse singletons back to PageDirect on the remove
- * path).
- * - A pte_chain list has free space only in the head member - all succeeding
- * members are 100% full.
- * - If the head element has free space, it occurs in its leading slots.
- * - All free space in the pte_chain is at the start of the head member.
- * - Insertion into the pte_chain puts a pte pointer in the last free slot of
- * the head member.
- * - Removal from a pte chain moves the head pte of the head member onto the
- * victim pte and frees the head member if it became empty.
- */
-
/**
** VM stuff below this comment
**/
@@ -198,6 +121,11 @@ page_referenced_obj_one(struct vm_area_s
return referenced;
}

+static inline int page_referenced_anon(struct page *page)
+{
+ return 0; /* until next patch */
+}
+
/**
* page_referenced_obj - referenced check for object-based rmap
* @page: the page we're checking references on.
@@ -219,15 +147,6 @@ page_referenced_obj(struct page *page)
struct vm_area_struct *vma;
int referenced = 0;

- if (!page->pte.mapcount)
- return 0;
-
- if (!mapping)
- return 0;
-
- if (PageSwapCache(page))
- BUG();
-
if (down_trylock(&mapping->i_shared_sem))
return 1;

@@ -248,14 +167,10 @@ page_referenced_obj(struct page *page)
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of processes which referenced the page.
- * Caller needs to hold the pte_chain_lock.
- *
- * If the page has a single-entry pte_chain, collapse that back to a PageDirect
- * representation. This way, it's only done under memory pressure.
+ * Caller needs to hold the rmap_lock.
*/
int fastcall page_referenced(struct page * page)
{
- struct pte_chain *pc;
int referenced = 0;

if (page_test_and_clear_young(page))
@@ -264,194 +179,104 @@ int fastcall page_referenced(struct page
if (TestClearPageReferenced(page))
referenced++;

- if (!PageAnon(page)) {
- referenced += page_referenced_obj(page);
- goto out;
- }
- if (PageDirect(page)) {
- pte_t *pte = rmap_ptep_map(page->pte.direct);
- if (ptep_test_and_clear_young(pte))
- referenced++;
- rmap_ptep_unmap(pte);
- } else {
- int nr_chains = 0;
-
- /* Check all the page tables mapping this page. */
- for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
- int i;
-
- for (i = pte_chain_idx(pc); i < NRPTE; i++) {
- pte_addr_t pte_paddr = pc->ptes[i];
- pte_t *p;
-
- p = rmap_ptep_map(pte_paddr);
- if (ptep_test_and_clear_young(p))
- referenced++;
- rmap_ptep_unmap(p);
- nr_chains++;
- }
- }
- if (nr_chains == 1) {
- pc = page->pte.chain;
- page->pte.direct = pc->ptes[NRPTE-1];
- SetPageDirect(page);
- pc->ptes[NRPTE-1] = 0;
- __pte_chain_free(pc);
- }
+ if (page->mapcount && page->mapping) {
+ if (PageAnon(page))
+ referenced += page_referenced_anon(page);
+ else
+ referenced += page_referenced_obj(page);
}
-out:
return referenced;
}

/**
- * page_add_rmap - add reverse mapping entry to a page
- * @page: the page to add the mapping to
- * @ptep: the page table entry mapping this page
+ * page_add_anon_rmap - add pte mapping to an anonymous page
+ * @page: the page to add the mapping to
+ * @mm: the mm in which the mapping is added
+ * @address: the user virtual address mapped
*
- * Add a new pte reverse mapping to a page.
* The caller needs to hold the mm->page_table_lock.
*/
-struct pte_chain * fastcall
-page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
+void fastcall page_add_anon_rmap(struct page *page,
+ struct mm_struct *mm, unsigned long address)
{
- pte_addr_t pte_paddr = ptep_to_paddr(ptep);
- struct pte_chain *cur_pte_chain;
-
- if (PageReserved(page))
- return pte_chain;
-
- pte_chain_lock(page);
-
- /*
- * If this is an object-based page, just count it. We can
- * find the mappings by walking the object vma chain for that object.
- */
- if (!PageAnon(page)) {
- if (PageSwapCache(page))
- BUG();
- if (!page->pte.mapcount)
- inc_page_state(nr_mapped);
- page->pte.mapcount++;
- goto out;
- }
-
- page->mapping = ANON_MAPPING_DEBUG;
+ BUG_ON(PageReserved(page));
+ BUG_ON(page_mapping(page));

- if (page->pte.direct == 0) {
- page->pte.direct = pte_paddr;
- SetPageDirect(page);
+ rmap_lock(page);
+ if (!page->mapcount) {
+ SetPageAnon(page);
+ page->index = address & PAGE_MASK;
+ page->mapping = (void *) mm; /* until next patch */
inc_page_state(nr_mapped);
- goto out;
}
+ page->mapcount++;
+ rmap_unlock(page);
+}

- if (PageDirect(page)) {
- /* Convert a direct pointer into a pte_chain */
- ClearPageDirect(page);
- pte_chain->ptes[NRPTE-1] = page->pte.direct;
- pte_chain->ptes[NRPTE-2] = pte_paddr;
- pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2);
- page->pte.direct = 0;
- page->pte.chain = pte_chain;
- pte_chain = NULL; /* We consumed it */
- goto out;
- }
+/**
+ * page_update_anon_rmap - move pte mapping of an anonymous page
+ * @page: the page to update the mapping of
+ * @mm: the new mm in which the mapping is found
+ * @address: the new user virtual address mapped
+ *
+ * The caller needs to hold the mm->page_table_lock.
+ *
+ * For do_wp_page: to update mapping to the one remaining mm.
+ * For copy_one_pte: to update address when vma is mremapped.
+ */
+void fastcall page_update_anon_rmap(struct page *page,
+ struct mm_struct *mm, unsigned long address)
+{
+ BUG_ON(!PageAnon(page));
+ if (page->mapcount != 1)
+ return;

- cur_pte_chain = page->pte.chain;
- if (cur_pte_chain->ptes[0]) { /* It's full */
- pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain,
- NRPTE - 1);
- page->pte.chain = pte_chain;
- pte_chain->ptes[NRPTE-1] = pte_paddr;
- pte_chain = NULL; /* We consumed it */
- goto out;
- }
- cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
- cur_pte_chain->next_and_idx--;
-out:
- pte_chain_unlock(page);
- return pte_chain;
+ rmap_lock(page);
+ page->index = address & PAGE_MASK;
+ page->mapping = (void *) mm; /* until next patch */
+ rmap_unlock(page);
}

/**
- * page_remove_rmap - take down reverse mapping to a page
- * @page: page to remove mapping from
- * @ptep: page table entry to remove
+ * page_add_obj_rmap - add pte mapping to a file page
+ * @page: the page to add the mapping to
*
- * Removes the reverse mapping from the pte_chain of the page,
- * after that the caller can clear the page table entry and free
- * the page.
- * Caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the mm->page_table_lock.
*/
-void fastcall page_remove_rmap(struct page *page, pte_t *ptep)
+void fastcall page_add_obj_rmap(struct page *page)
{
- pte_addr_t pte_paddr = ptep_to_paddr(ptep);
- struct pte_chain *pc;
-
+ BUG_ON(PageAnon(page));
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return;

- pte_chain_lock(page);
+ rmap_lock(page);
+ if (!page->mapcount)
+ inc_page_state(nr_mapped);
+ page->mapcount++;
+ rmap_unlock(page);
+}

- if (!page_mapped(page))
- goto out_unlock;
+/**
+ * page_remove_rmap - take down pte mapping from a page
+ * @page: page to remove mapping from
+ *
+ * Caller needs to hold the mm->page_table_lock.
+ */
+void fastcall page_remove_rmap(struct page *page)
+{
+ BUG_ON(PageReserved(page));
+ BUG_ON(!page->mapcount);

- /*
- * If this is an object-based page, just uncount it. We can
- * find the mappings by walking the object vma chain for that object.
- */
- if (!PageAnon(page)) {
- if (PageSwapCache(page))
- BUG();
- page->pte.mapcount--;
- goto out;
- }
-
- if (PageDirect(page)) {
- if (page->pte.direct == pte_paddr) {
- page->pte.direct = 0;
- ClearPageDirect(page);
- goto out;
- }
- } else {
- struct pte_chain *start = page->pte.chain;
- struct pte_chain *next;
- int victim_i = pte_chain_idx(start);
-
- for (pc = start; pc; pc = next) {
- int i;
-
- next = pte_chain_next(pc);
- if (next)
- prefetch(next);
- for (i = pte_chain_idx(pc); i < NRPTE; i++) {
- pte_addr_t pa = pc->ptes[i];
-
- if (pa != pte_paddr)
- continue;
- pc->ptes[i] = start->ptes[victim_i];
- start->ptes[victim_i] = 0;
- if (victim_i == NRPTE-1) {
- /* Emptied a pte_chain */
- page->pte.chain = pte_chain_next(start);
- __pte_chain_free(start);
- } else {
- start->next_and_idx++;
- }
- goto out;
- }
- }
- }
-out:
- if (!page_mapped(page)) {
+ rmap_lock(page);
+ page->mapcount--;
+ if (!page->mapcount) {
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
if (PageAnon(page))
clear_page_anon(page);
dec_page_state(nr_mapped);
}
-out_unlock:
- pte_chain_unlock(page);
- return;
+ rmap_unlock(page);
}

/**
@@ -489,11 +314,9 @@ try_to_unmap_obj_one(struct vm_area_stru
if (pte_dirty(pteval))
set_page_dirty(page);

- if (!page->pte.mapcount)
- BUG();
-
mm->rss--;
- page->pte.mapcount--;
+ BUG_ON(!page->mapcount);
+ page->mapcount--;
page_cache_release(page);

out_unmap:
@@ -504,6 +327,11 @@ out:
return ret;
}

+static inline int try_to_unmap_anon(struct page *page)
+{
+ return SWAP_FAIL; /* until next patch */
+}
+
/**
* try_to_unmap_obj - unmap a page using the object-based rmap method
* @page: the page to unmap
@@ -523,24 +351,18 @@ try_to_unmap_obj(struct page *page)
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;

- if (!mapping)
- BUG();
-
- if (PageSwapCache(page))
- BUG();
-
if (down_trylock(&mapping->i_shared_sem))
return ret;

list_for_each_entry(vma, &mapping->i_mmap, shared) {
ret = try_to_unmap_obj_one(vma, page);
- if (ret == SWAP_FAIL || !page->pte.mapcount)
+ if (ret == SWAP_FAIL || !page->mapcount)
goto out;
}

list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
ret = try_to_unmap_obj_one(vma, page);
- if (ret == SWAP_FAIL || !page->pte.mapcount)
+ if (ret == SWAP_FAIL || !page->mapcount)
goto out;
}

@@ -550,103 +372,12 @@ out:
}

/**
- * try_to_unmap_one - worker function for try_to_unmap
- * @page: page to unmap
- * @ptep: page table entry to unmap from page
- *
- * Internal helper function for try_to_unmap, called for each page
- * table entry mapping a page. Because locking order here is opposite
- * to the locking order used by the page fault path, we use trylocks.
- * Locking:
- * page lock shrink_list(), trylock
- * pte_chain_lock shrink_list()
- * mm->page_table_lock try_to_unmap_one(), trylock
- */
-static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
-{
- pte_t *ptep = rmap_ptep_map(paddr);
- unsigned long address = ptep_to_address(ptep);
- struct mm_struct * mm = ptep_to_mm(ptep);
- struct vm_area_struct * vma;
- pte_t pte;
- int ret;
-
- if (!mm)
- BUG();
-
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- if (!spin_trylock(&mm->page_table_lock)) {
- rmap_ptep_unmap(ptep);
- return SWAP_AGAIN;
- }
-
-
- /* During mremap, it's possible pages are not in a VMA. */
- vma = find_vma(mm, address);
- if (!vma) {
- ret = SWAP_FAIL;
- goto out_unlock;
- }
-
- /* The page is mlock()d, we cannot swap it out. */
- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
- ret = SWAP_FAIL;
- goto out_unlock;
- }
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address);
- pte = ptep_clear_flush(vma, address, ptep);
-
- if (PageAnon(page)) {
- swp_entry_t entry = { .val = page->private };
- /*
- * Store the swap location in the pte.
- * See handle_pte_fault() ...
- */
- BUG_ON(!PageSwapCache(page));
- swap_duplicate(entry);
- set_pte(ptep, swp_entry_to_pte(entry));
- BUG_ON(pte_file(*ptep));
- } else {
- unsigned long pgidx;
- /*
- * If a nonlinear mapping then store the file page offset
- * in the pte.
- */
- pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
- pgidx += vma->vm_pgoff;
- pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
- if (page->index != pgidx) {
- set_pte(ptep, pgoff_to_pte(page->index));
- BUG_ON(!pte_file(*ptep));
- }
- }
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pte))
- set_page_dirty(page);
-
- mm->rss--;
- page_cache_release(page);
- ret = SWAP_SUCCESS;
-
-out_unlock:
- rmap_ptep_unmap(ptep);
- spin_unlock(&mm->page_table_lock);
- return ret;
-}
-
-/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock
- * and its pte chain lock. Return values are:
+ * and its rmap_lock. Return values are:
*
* SWAP_SUCCESS - we succeeded in removing all mappings
* SWAP_AGAIN - we missed a trylock, try again later
@@ -654,80 +385,18 @@ out_unlock:
*/
int fastcall try_to_unmap(struct page * page)
{
- struct pte_chain *pc, *next_pc, *start;
- int ret = SWAP_SUCCESS;
- int victim_i;
-
- /* This page should not be on the pageout lists. */
- if (PageReserved(page))
- BUG();
- if (!PageLocked(page))
- BUG();
- /* We need backing store to swap out a page. */
- if (!page_mapping(page) && !PageSwapCache(page))
- BUG();
-
- /*
- * If it's an object-based page, use the object vma chain to find all
- * the mappings.
- */
- if (!PageAnon(page)) {
- ret = try_to_unmap_obj(page);
- goto out;
- }
+ int ret;

- if (PageDirect(page)) {
- ret = try_to_unmap_one(page, page->pte.direct);
- if (ret == SWAP_SUCCESS) {
- page->pte.direct = 0;
- ClearPageDirect(page);
- }
- goto out;
- }
+ BUG_ON(PageReserved(page));
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!page->mapcount);
+
+ if (PageAnon(page))
+ ret = try_to_unmap_anon(page);
+ else
+ ret = try_to_unmap_obj(page);

- start = page->pte.chain;
- victim_i = pte_chain_idx(start);
- for (pc = start; pc; pc = next_pc) {
- int i;
-
- next_pc = pte_chain_next(pc);
- if (next_pc)
- prefetch(next_pc);
- for (i = pte_chain_idx(pc); i < NRPTE; i++) {
- pte_addr_t pte_paddr = pc->ptes[i];
-
- switch (try_to_unmap_one(page, pte_paddr)) {
- case SWAP_SUCCESS:
- /*
- * Release a slot. If we're releasing the
- * first pte in the first pte_chain then
- * pc->ptes[i] and start->ptes[victim_i] both
- * refer to the same thing. It works out.
- */
- pc->ptes[i] = start->ptes[victim_i];
- start->ptes[victim_i] = 0;
- victim_i++;
- if (victim_i == NRPTE) {
- page->pte.chain = pte_chain_next(start);
- __pte_chain_free(start);
- start = page->pte.chain;
- victim_i = 0;
- } else {
- start->next_and_idx++;
- }
- break;
- case SWAP_AGAIN:
- /* Skip this pte, remembering status. */
- ret = SWAP_AGAIN;
- continue;
- case SWAP_FAIL:
- ret = SWAP_FAIL;
- goto out;
- }
- }
- }
-out:
- if (!page_mapped(page)) {
+ if (!page->mapcount) {
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
if (PageAnon(page))
@@ -737,76 +406,3 @@ out:
}
return ret;
}
-
-/**
- ** No more VM stuff below this comment, only pte_chain helper
- ** functions.
- **/
-
-static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
-{
- struct pte_chain *pc = p;
-
- memset(pc, 0, sizeof(*pc));
-}
-
-DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0;
-
-/**
- * __pte_chain_free - free pte_chain structure
- * @pte_chain: pte_chain struct to free
- */
-void __pte_chain_free(struct pte_chain *pte_chain)
-{
- struct pte_chain **pte_chainp;
-
- pte_chainp = &get_cpu_var(local_pte_chain);
- if (pte_chain->next_and_idx)
- pte_chain->next_and_idx = 0;
- if (*pte_chainp)
- kmem_cache_free(pte_chain_cache, *pte_chainp);
- *pte_chainp = pte_chain;
- put_cpu_var(local_pte_chain);
-}
-
-/*
- * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
- *
- * The caller of page_add_rmap() must perform the allocation because
- * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap()
- * will not actually use the pte_chain, because there is space available in one
- * of the existing pte_chains which are attached to the page. So the case of
- * allocating and then freeing a single pte_chain is specially optimised here,
- * with a one-deep per-cpu cache.
- */
-struct pte_chain *pte_chain_alloc(int gfp_flags)
-{
- struct pte_chain *ret;
- struct pte_chain **pte_chainp;
-
- might_sleep_if(gfp_flags & __GFP_WAIT);
-
- pte_chainp = &get_cpu_var(local_pte_chain);
- if (*pte_chainp) {
- ret = *pte_chainp;
- *pte_chainp = NULL;
- put_cpu_var(local_pte_chain);
- } else {
- put_cpu_var(local_pte_chain);
- ret = kmem_cache_alloc(pte_chain_cache, gfp_flags);
- }
- return ret;
-}
-
-void __init pte_chain_init(void)
-{
- pte_chain_cache = kmem_cache_create( "pte_chain",
- sizeof(struct pte_chain),
- 0,
- SLAB_MUST_HWCACHE_ALIGN,
- pte_chain_ctor,
- NULL);
-
- if (!pte_chain_cache)
- panic("failed to create pte_chain cache!\n");
-}
--- anobjrmap3/mm/swapfile.c 2004-03-18 21:27:03.832309120 +0000
+++ anobjrmap4/mm/swapfile.c 2004-03-18 21:27:15.365555800 +0000
@@ -391,20 +391,19 @@ void free_swap_and_cache(swp_entry_t ent
/* vma->vm_mm->page_table_lock is held */
static void
unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
- swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+ swp_entry_t entry, struct page *page)
{
vma->vm_mm->rss++;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
- SetPageAnon(page);
- *pte_chainp = page_add_rmap(page, dir, *pte_chainp);
+ page_add_anon_rmap(page, vma->vm_mm, address);
swap_free(entry);
}

/* vma->vm_mm->page_table_lock is held */
static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
unsigned long address, unsigned long size, unsigned long offset,
- swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+ swp_entry_t entry, struct page *page)
{
pte_t * pte;
unsigned long end;
@@ -429,8 +428,7 @@ static int unuse_pmd(struct vm_area_stru
* Test inline before going to call unuse_pte.
*/
if (unlikely(pte_same(*pte, swp_pte))) {
- unuse_pte(vma, offset + address, pte,
- entry, page, pte_chainp);
+ unuse_pte(vma, offset + address, pte, entry, page);
pte_unmap(pte);
return 1;
}
@@ -444,7 +442,7 @@ static int unuse_pmd(struct vm_area_stru
/* vma->vm_mm->page_table_lock is held */
static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
unsigned long address, unsigned long size,
- swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+ swp_entry_t entry, struct page *page)
{
pmd_t * pmd;
unsigned long offset, end;
@@ -466,7 +464,7 @@ static int unuse_pgd(struct vm_area_stru
BUG();
do {
if (unuse_pmd(vma, pmd, address, end - address,
- offset, entry, page, pte_chainp))
+ offset, entry, page))
return 1;
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
@@ -476,15 +474,14 @@ static int unuse_pgd(struct vm_area_stru

/* vma->vm_mm->page_table_lock is held */
static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
- swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+ swp_entry_t entry, struct page *page)
{
unsigned long start = vma->vm_start, end = vma->vm_end;

if (start >= end)
BUG();
do {
- if (unuse_pgd(vma, pgdir, start, end - start,
- entry, page, pte_chainp))
+ if (unuse_pgd(vma, pgdir, start, end - start, entry, page))
return 1;
start = (start + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
@@ -492,15 +489,10 @@ static int unuse_vma(struct vm_area_stru
return 0;
}

-static int unuse_process(struct mm_struct * mm,
+static void unuse_process(struct mm_struct * mm,
swp_entry_t entry, struct page* page)
{
struct vm_area_struct* vma;
- struct pte_chain *pte_chain;
-
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- return -ENOMEM;

/*
* Go through process' page directory.
@@ -508,12 +500,10 @@ static int unuse_process(struct mm_struc
spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
- if (unuse_vma(vma, pgd, entry, page, &pte_chain))
+ if (unuse_vma(vma, pgd, entry, page))
break;
}
spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
- return 0;
}

/*
@@ -661,7 +651,7 @@ static int try_to_unuse(unsigned int typ
if (start_mm == &init_mm)
shmem = shmem_unuse(entry, page);
else
- retval = unuse_process(start_mm, entry, page);
+ unuse_process(start_mm, entry, page);
}
if (*swap_map > 1) {
int set_start_mm = (*swap_map >= swcount);
@@ -673,7 +663,7 @@ static int try_to_unuse(unsigned int typ
atomic_inc(&new_start_mm->mm_users);
atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (*swap_map > 1 && !retval &&
+ while (*swap_map > 1 &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
atomic_inc(&mm->mm_users);
@@ -690,7 +680,7 @@ static int try_to_unuse(unsigned int typ
set_start_mm = 1;
shmem = shmem_unuse(entry, page);
} else
- retval = unuse_process(mm, entry, page);
+ unuse_process(mm, entry, page);
if (set_start_mm && *swap_map < swcount) {
mmput(new_start_mm);
atomic_inc(&mm->mm_users);
@@ -704,11 +694,6 @@ static int try_to_unuse(unsigned int typ
mmput(start_mm);
start_mm = new_start_mm;
}
- if (retval) {
- unlock_page(page);
- page_cache_release(page);
- break;
- }

/*
* How could swap count reach 0x7fff when the maximum
--- anobjrmap3/mm/vmscan.c 2004-03-18 21:27:03.835308664 +0000
+++ anobjrmap4/mm/vmscan.c 2004-03-18 21:27:15.367555496 +0000
@@ -171,7 +171,7 @@ static int shrink_slab(unsigned long sca
return 0;
}

-/* Must be called with page's pte_chain_lock held. */
+/* Must be called with page's rmap_lock held. */
static inline int page_mapping_inuse(struct page *page)
{
struct address_space *mapping;
@@ -275,11 +275,11 @@ shrink_list(struct list_head *page_list,
if (PageWriteback(page))
goto keep_locked;

- pte_chain_lock(page);
+ rmap_lock(page);
referenced = page_referenced(page);
if (referenced && page_mapping_inuse(page)) {
/* In active use or really unfreeable. Activate it. */
- pte_chain_unlock(page);
+ rmap_unlock(page);
goto activate_locked;
}

@@ -295,10 +295,10 @@ shrink_list(struct list_head *page_list,
if (PageSwapCache(page))
mapping = &swapper_space;
else if (PageAnon(page)) {
- pte_chain_unlock(page);
+ rmap_unlock(page);
if (!add_to_swap(page))
goto activate_locked;
- pte_chain_lock(page);
+ rmap_lock(page);
mapping = &swapper_space;
}
#endif /* CONFIG_SWAP */
@@ -313,16 +313,16 @@ shrink_list(struct list_head *page_list,
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page)) {
case SWAP_FAIL:
- pte_chain_unlock(page);
+ rmap_unlock(page);
goto activate_locked;
case SWAP_AGAIN:
- pte_chain_unlock(page);
+ rmap_unlock(page);
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
- pte_chain_unlock(page);
+ rmap_unlock(page);

/*
* If the page is dirty, only perform writeback if that write
@@ -660,13 +660,13 @@ refill_inactive_zone(struct zone *zone,
list_add(&page->lru, &l_active);
continue;
}
- pte_chain_lock(page);
+ rmap_lock(page);
if (page_referenced(page)) {
- pte_chain_unlock(page);
+ rmap_unlock(page);
list_add(&page->lru, &l_active);
continue;
}
- pte_chain_unlock(page);
+ rmap_unlock(page);
}
/*
* FIXME: need to consider page_count(page) here if/when we
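
The mm/rmap.c hunks above replace the pte_chain machinery with a plain
per-page mapcount maintained under rmap_lock, with nr_mapped adjusted on
the 0<->1 transitions. A rough stand-alone model of just that accounting
(illustrative only, build with -pthread: struct page, rmap_lock and
nr_mapped are stand-ins here, and the real functions also manage PageAnon,
page->mapping and page->index):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct page {
        pthread_mutex_t rmap_lock;      /* stands in for the per-page rmap_lock */
        int mapcount;                   /* number of ptes mapping this page */
};

static long nr_mapped;                  /* stands in for the global page state */

static void page_add_rmap(struct page *page)
{
        pthread_mutex_lock(&page->rmap_lock);
        if (!page->mapcount)
                nr_mapped++;            /* first mapping: page becomes "mapped" */
        page->mapcount++;
        pthread_mutex_unlock(&page->rmap_lock);
}

static void page_remove_rmap(struct page *page)
{
        pthread_mutex_lock(&page->rmap_lock);
        assert(page->mapcount > 0);
        page->mapcount--;
        if (!page->mapcount)
                nr_mapped--;            /* last mapping gone */
        pthread_mutex_unlock(&page->rmap_lock);
}

int main(void)
{
        struct page page = { PTHREAD_MUTEX_INITIALIZER, 0 };

        page_add_rmap(&page);           /* first pte added */
        page_add_rmap(&page);           /* second mapping of the same page */
        page_remove_rmap(&page);
        page_remove_rmap(&page);
        printf("mapcount=%d nr_mapped=%ld\n", page.mapcount, nr_mapped);
        return 0;
}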

2004-03-18 23:52:04

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH] anobjrmap 6/6 cleanup

anobjrmap 6/6 remove remnants of pte_chain rmap

Cleanup, mostly of arch headers: only bother with this patch
if you're sure pte_chain rmap is finished. No special page table
initialization is needed for full objrmap (though it can still be
helpful in debugging). But ppc and ppc64 have come to rely on it for
their ptep_test_and_clear_young, so reinstate there. Delete pte_addr_t
from asm/pgtable.h, KM_PTE2 from asm/kmap_types.h; delete asm/rmap.h.

arch/arm/mm/mm-armv.c | 3 -
arch/ppc/mm/pgtable.c | 28 ++++++++----
arch/ppc64/mm/hugetlbpage.c | 3 -
arch/ppc64/mm/tlb.c | 4 -
include/asm-alpha/pgtable.h | 2
include/asm-alpha/rmap.h | 7 ---
include/asm-arm/kmap_types.h | 1
include/asm-arm/pgtable.h | 2
include/asm-arm/rmap.h | 6 --
include/asm-arm26/pgtable.h | 2
include/asm-arm26/rmap.h | 66 -----------------------------
include/asm-cris/pgtable.h | 2
include/asm-cris/rmap.h | 7 ---
include/asm-generic/rmap.h | 90 ----------------------------------------
include/asm-h8300/pgtable.h | 2
include/asm-i386/kmap_types.h | 1
include/asm-i386/pgtable.h | 12 -----
include/asm-i386/rmap.h | 21 ---------
include/asm-ia64/pgtable.h | 2
include/asm-ia64/rmap.h | 7 ---
include/asm-m68k/pgtable.h | 2
include/asm-m68k/rmap.h | 7 ---
include/asm-m68knommu/pgtable.h | 2
include/asm-m68knommu/rmap.h | 2
include/asm-mips/kmap_types.h | 1
include/asm-mips/pgtable-32.h | 6 --
include/asm-mips/pgtable-64.h | 2
include/asm-mips/rmap.h | 7 ---
include/asm-parisc/pgtable.h | 2
include/asm-parisc/rmap.h | 7 ---
include/asm-ppc/pgtable.h | 2
include/asm-ppc/rmap.h | 9 ----
include/asm-ppc64/pgalloc.h | 31 ++++++++++---
include/asm-ppc64/pgtable.h | 2
include/asm-ppc64/rmap.h | 9 ----
include/asm-s390/pgtable.h | 2
include/asm-s390/rmap.h | 7 ---
include/asm-sh/pgtable.h | 2
include/asm-sh/rmap.h | 7 ---
include/asm-sparc/kmap_types.h | 1
include/asm-sparc/pgtable.h | 2
include/asm-sparc/rmap.h | 7 ---
include/asm-sparc64/pgtable.h | 2
include/asm-sparc64/rmap.h | 7 ---
include/asm-um/pgtable.h | 12 -----
include/asm-um/rmap.h | 6 --
include/asm-v850/pgtable.h | 2
include/asm-v850/rmap.h | 1
include/asm-x86_64/pgtable.h | 2
include/asm-x86_64/rmap.h | 7 ---
mm/memory.c | 6 --
51 files changed, 48 insertions(+), 384 deletions(-)

--- anobjrmap5/arch/arm/mm/mm-armv.c 2004-03-11 01:56:10.000000000 +0000
+++ anobjrmap6/arch/arm/mm/mm-armv.c 2004-03-18 21:27:38.330064664 +0000
@@ -19,7 +19,6 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
-#include <asm/rmap.h>
#include <asm/io.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
@@ -232,7 +231,7 @@ void free_pgd_slow(pgd_t *pgd)

pte = pmd_page(*pmd);
pmd_clear(pmd);
- pgtable_remove_rmap(pte);
+ dec_page_state(nr_page_table_pages);
pte_free(pte);
pmd_free(pmd);
free:
--- anobjrmap5/arch/ppc/mm/pgtable.c 2004-02-18 03:00:06.000000000 +0000
+++ anobjrmap6/arch/ppc/mm/pgtable.c 2004-03-18 21:27:38.331064512 +0000
@@ -86,9 +86,14 @@ pte_t *pte_alloc_one_kernel(struct mm_st
extern int mem_init_done;
extern void *early_get_page(void);

- if (mem_init_done)
+ if (mem_init_done) {
pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- else
+ if (pte) {
+ struct page *ptepage = virt_to_page(pte);
+ ptepage->mapping = (void *) mm;
+ ptepage->index = address & PMD_MASK;
+ }
+ } else
pte = (pte_t *)early_get_page();
if (pte)
clear_page(pte);
@@ -97,7 +102,7 @@ pte_t *pte_alloc_one_kernel(struct mm_st

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- struct page *pte;
+ struct page *ptepage;

#ifdef CONFIG_HIGHPTE
int flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT;
@@ -105,10 +110,13 @@ struct page *pte_alloc_one(struct mm_str
int flags = GFP_KERNEL | __GFP_REPEAT;
#endif

- pte = alloc_pages(flags, 0);
- if (pte)
- clear_highpage(pte);
- return pte;
+ ptepage = alloc_pages(flags, 0);
+ if (ptepage) {
+ ptepage->mapping = (void *) mm;
+ ptepage->index = address & PMD_MASK;
+ clear_highpage(ptepage);
+ }
+ return ptepage;
}

void pte_free_kernel(pte_t *pte)
@@ -116,15 +124,17 @@ void pte_free_kernel(pte_t *pte)
#ifdef CONFIG_SMP
hash_page_sync();
#endif
+ virt_to_page(pte)->mapping = NULL;
free_page((unsigned long)pte);
}

-void pte_free(struct page *pte)
+void pte_free(struct page *ptepage)
{
#ifdef CONFIG_SMP
hash_page_sync();
#endif
- __free_page(pte);
+ ptepage->mapping = NULL;
+ __free_page(ptepage);
}

#ifndef CONFIG_44x
--- anobjrmap5/arch/ppc64/mm/hugetlbpage.c 2004-03-11 01:56:09.000000000 +0000
+++ anobjrmap6/arch/ppc64/mm/hugetlbpage.c 2004-03-18 21:27:38.333064208 +0000
@@ -25,7 +25,6 @@
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/tlb.h>
-#include <asm/rmap.h>

#include <linux/sysctl.h>

@@ -273,7 +272,7 @@ static int open_32bit_htlbpage_range(str
}

pmd_clear(pmd);
- pgtable_remove_rmap(page);
+ dec_page_state(nr_page_table_pages);
pte_free(page);
}
}
--- anobjrmap5/arch/ppc64/mm/tlb.c 2004-03-11 01:56:13.000000000 +0000
+++ anobjrmap6/arch/ppc64/mm/tlb.c 2004-03-18 21:27:38.334064056 +0000
@@ -31,7 +31,6 @@
#include <asm/tlb.h>
#include <asm/hardirq.h>
#include <linux/highmem.h>
-#include <asm/rmap.h>

DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);

@@ -59,7 +58,8 @@ void hpte_update(pte_t *ptep, unsigned l

ptepage = virt_to_page(ptep);
mm = (struct mm_struct *) ptepage->mapping;
- addr = ptep_to_address(ptep);
+ addr = ptepage->index +
+ (((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE);

if (REGION_ID(addr) == USER_REGION_ID)
context = mm->context.id;
--- anobjrmap5/include/asm-alpha/pgtable.h 2003-10-08 20:24:56.000000000 +0100
+++ anobjrmap6/include/asm-alpha/pgtable.h 2004-03-18 21:27:38.335063904 +0000
@@ -349,6 +349,4 @@ extern void paging_init(void);
/* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */
#define HAVE_ARCH_UNMAPPED_AREA

-typedef pte_t *pte_addr_t;
-
#endif /* _ALPHA_PGTABLE_H */
--- anobjrmap5/include/asm-alpha/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-alpha/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _ALPHA_RMAP_H
-#define _ALPHA_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-arm/kmap_types.h 2004-01-09 06:00:24.000000000 +0000
+++ anobjrmap6/include/asm-arm/kmap_types.h 2004-03-18 21:27:38.345062384 +0000
@@ -14,7 +14,6 @@ enum km_type {
KM_BIO_DST_IRQ,
KM_PTE0,
KM_PTE1,
- KM_PTE2,
KM_IRQ0,
KM_IRQ1,
KM_SOFTIRQ0,
--- anobjrmap5/include/asm-arm/pgtable.h 2004-01-09 06:00:23.000000000 +0000
+++ anobjrmap6/include/asm-arm/pgtable.h 2004-03-18 21:27:38.346062232 +0000
@@ -353,8 +353,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD
#define io_remap_page_range(vma,from,phys,size,prot) \
remap_page_range(vma,from,phys,size,prot)

-typedef pte_t *pte_addr_t;
-
#define pgtable_cache_init() do { } while (0)

#endif /* !__ASSEMBLY__ */
--- anobjrmap5/include/asm-arm/rmap.h 2002-08-01 22:17:36.000000000 +0100
+++ anobjrmap6/include/asm-arm/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,6 +0,0 @@
-#ifndef _ARM_RMAP_H
-#define _ARM_RMAP_H
-
-#include <asm-generic/rmap.h>
-
-#endif /* _ARM_RMAP_H */
--- anobjrmap5/include/asm-arm26/pgtable.h 2003-10-08 20:24:55.000000000 +0100
+++ anobjrmap6/include/asm-arm26/pgtable.h 2004-03-18 21:27:38.348061928 +0000
@@ -290,8 +290,6 @@ static inline pte_t mk_pte_phys(unsigned
#define io_remap_page_range(vma,from,phys,size,prot) \
remap_page_range(vma,from,phys,size,prot)

-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */

#endif /* _ASMARM_PGTABLE_H */
--- anobjrmap5/include/asm-arm26/rmap.h 2003-06-14 20:18:58.000000000 +0100
+++ anobjrmap6/include/asm-arm26/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,66 +0,0 @@
-#ifndef _ARM_RMAP_H
-#define _ARM_RMAP_H
-
-/*
- * linux/include/asm-arm26/proc-armv/rmap.h
- *
- * Architecture dependant parts of the reverse mapping code,
- *
- * ARM is different since hardware page tables are smaller than
- * the page size and Linux uses a "duplicate" one with extra info.
- * For rmap this means that the first 2 kB of a page are the hardware
- * page tables and the last 2 kB are the software page tables.
- */
-
-static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address)
-{
- page->mapping = (void *)mm;
- page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
- inc_page_state(nr_page_table_pages);
-}
-
-static inline void pgtable_remove_rmap(struct page *page)
-{
- page->mapping = NULL;
- page->index = 0;
- dec_page_state(nr_page_table_pages);
-}
-
-static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
-{
- struct page * page = virt_to_page(ptep);
- return (struct mm_struct *)page->mapping;
-}
-
-/* The page table takes half of the page */
-#define PTE_MASK ((PAGE_SIZE / 2) - 1)
-
-static inline unsigned long ptep_to_address(pte_t * ptep)
-{
- struct page * page = virt_to_page(ptep);
- unsigned long low_bits;
-
- low_bits = ((unsigned long)ptep & PTE_MASK) * PTRS_PER_PTE;
- return page->index + low_bits;
-}
-
-//FIXME!!! IS these correct?
-static inline pte_addr_t ptep_to_paddr(pte_t *ptep)
-{
- return (pte_addr_t)ptep;
-}
-
-static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr)
-{
- return (pte_t *)pte_paddr;
-}
-
-static inline void rmap_ptep_unmap(pte_t *pte)
-{
- return;
-}
-
-
-//#include <asm-generic/rmap.h>
-
-#endif /* _ARM_RMAP_H */
--- anobjrmap5/include/asm-cris/pgtable.h 2003-07-10 21:16:26.000000000 +0100
+++ anobjrmap6/include/asm-cris/pgtable.h 2004-03-18 21:27:38.350061624 +0000
@@ -337,6 +337,4 @@ extern inline void update_mmu_cache(stru
#define pte_to_pgoff(x) (pte_val(x) >> 6)
#define pgoff_to_pte(x) __pte(((x) << 6) | _PAGE_FILE)

-typedef pte_t *pte_addr_t;
-
#endif /* _CRIS_PGTABLE_H */
--- anobjrmap5/include/asm-cris/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-cris/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _CRIS_RMAP_H
-#define _CRIS_RMAP_H
-
-/* nothing to see, move along :) */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-generic/rmap.h 2003-05-27 02:01:29.000000000 +0100
+++ anobjrmap6/include/asm-generic/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,90 +0,0 @@
-#ifndef _GENERIC_RMAP_H
-#define _GENERIC_RMAP_H
-/*
- * linux/include/asm-generic/rmap.h
- *
- * Architecture dependent parts of the reverse mapping code,
- * this version should work for most architectures with a
- * 'normal' page table layout.
- *
- * We use the struct page of the page table page to find out
- * the process and full address of a page table entry:
- * - page->mapping points to the process' mm_struct
- * - page->index has the high bits of the address
- * - the lower bits of the address are calculated from the
- * offset of the page table entry within the page table page
- *
- * For CONFIG_HIGHPTE, we need to represent the address of a pte in a
- * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE
- * bits and is then ORed with the byte offset of the pte within its page.
- *
- * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for
- * the offset.
- *
- * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for
- * the offset.
- */
-#include <linux/mm.h>
-
-static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address)
-{
-#ifdef BROKEN_PPC_PTE_ALLOC_ONE
- /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
- extern int mem_init_done;
-
- if (!mem_init_done)
- return;
-#endif
- page->mapping = (void *)mm;
- page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
- inc_page_state(nr_page_table_pages);
-}
-
-static inline void pgtable_remove_rmap(struct page * page)
-{
- page->mapping = NULL;
- page->index = 0;
- dec_page_state(nr_page_table_pages);
-}
-
-static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
-{
- struct page * page = kmap_atomic_to_page(ptep);
- return (struct mm_struct *) page->mapping;
-}
-
-static inline unsigned long ptep_to_address(pte_t * ptep)
-{
- struct page * page = kmap_atomic_to_page(ptep);
- unsigned long low_bits;
- low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
- return page->index + low_bits;
-}
-
-#ifdef CONFIG_HIGHPTE
-static inline pte_addr_t ptep_to_paddr(pte_t *ptep)
-{
- pte_addr_t paddr;
- paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT;
- return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK);
-}
-#else
-static inline pte_addr_t ptep_to_paddr(pte_t *ptep)
-{
- return (pte_addr_t)ptep;
-}
-#endif
-
-#ifndef CONFIG_HIGHPTE
-static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr)
-{
- return (pte_t *)pte_paddr;
-}
-
-static inline void rmap_ptep_unmap(pte_t *pte)
-{
- return;
-}
-#endif
-
-#endif /* _GENERIC_RMAP_H */
--- anobjrmap5/include/asm-h8300/pgtable.h 2003-08-09 05:44:10.000000000 +0100
+++ anobjrmap6/include/asm-h8300/pgtable.h 2004-03-18 21:27:38.355060864 +0000
@@ -7,8 +7,6 @@
#include <asm/page.h>
#include <asm/io.h>

-typedef pte_t *pte_addr_t;
-
#define pgd_present(pgd) (1) /* pages are always present on NO_MM */
#define pgd_none(pgd) (0)
#define pgd_bad(pgd) (0)
--- anobjrmap5/include/asm-i386/kmap_types.h 2003-05-27 02:01:30.000000000 +0100
+++ anobjrmap6/include/asm-i386/kmap_types.h 2004-03-18 21:27:38.356060712 +0000
@@ -19,7 +19,6 @@ D(5) KM_BIO_SRC_IRQ,
D(6) KM_BIO_DST_IRQ,
D(7) KM_PTE0,
D(8) KM_PTE1,
-D(9) KM_PTE2,
D(10) KM_IRQ0,
D(11) KM_IRQ1,
D(12) KM_SOFTIRQ0,
--- anobjrmap5/include/asm-i386/pgtable.h 2004-03-16 07:00:20.000000000 +0000
+++ anobjrmap6/include/asm-i386/pgtable.h 2004-03-18 21:27:38.357060560 +0000
@@ -308,18 +308,6 @@ static inline pte_t pte_modify(pte_t pte
#define pte_unmap_nested(pte) do { } while (0)
#endif

-#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G)
-typedef u32 pte_addr_t;
-#endif
-
-#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G)
-typedef u64 pte_addr_t;
-#endif
-
-#if !defined(CONFIG_HIGHPTE)
-typedef pte_t *pte_addr_t;
-#endif
-
/*
* The i386 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
--- anobjrmap5/include/asm-i386/rmap.h 2002-09-16 03:19:56.000000000 +0100
+++ anobjrmap6/include/asm-i386/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,21 +0,0 @@
-#ifndef _I386_RMAP_H
-#define _I386_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#ifdef CONFIG_HIGHPTE
-static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr)
-{
- unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT);
- unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK;
- return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off);
-}
-
-static inline void rmap_ptep_unmap(pte_t *pte)
-{
- kunmap_atomic(pte, KM_PTE2);
-}
-#endif
-
-#endif
--- anobjrmap5/include/asm-ia64/pgtable.h 2004-02-04 02:45:17.000000000 +0000
+++ anobjrmap6/include/asm-ia64/pgtable.h 2004-03-18 21:27:38.359060256 +0000
@@ -468,8 +468,6 @@ extern void hugetlb_free_pgtables(struct
struct vm_area_struct * prev, unsigned long start, unsigned long end);
#endif

-typedef pte_t *pte_addr_t;
-
/*
* IA-64 doesn't have any external MMU info: the page tables contain all the necessary
* information. However, we use this routine to take care of any (delayed) i-cache
--- anobjrmap5/include/asm-ia64/rmap.h 2002-08-27 20:28:05.000000000 +0100
+++ anobjrmap6/include/asm-ia64/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _ASM_IA64_RMAP_H
-#define _ASM_IA64_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif /* _ASM_IA64_RMAP_H */
--- anobjrmap5/include/asm-m68k/pgtable.h 2004-02-04 02:45:41.000000000 +0000
+++ anobjrmap6/include/asm-m68k/pgtable.h 2004-03-18 21:27:38.361059952 +0000
@@ -168,8 +168,6 @@ static inline void update_mmu_cache(stru
? (__pgprot((pgprot_val(prot) & _CACHEMASK040) | _PAGE_NOCACHE_S)) \
: (prot)))

-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */

/*
--- anobjrmap5/include/asm-m68k/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-m68k/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _M68K_RMAP_H
-#define _M68K_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-m68knommu/pgtable.h 2003-05-27 02:01:29.000000000 +0100
+++ anobjrmap6/include/asm-m68knommu/pgtable.h 2004-03-18 21:27:38.362059800 +0000
@@ -11,8 +11,6 @@
#include <asm/page.h>
#include <asm/io.h>

-typedef pte_t *pte_addr_t;
-
/*
* Trivial page table functions.
*/
--- anobjrmap5/include/asm-m68knommu/rmap.h 2002-11-04 21:31:04.000000000 +0000
+++ anobjrmap6/include/asm-m68knommu/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,2 +0,0 @@
-/* Do not need anything here */
-
--- anobjrmap5/include/asm-mips/kmap_types.h 2003-07-02 22:00:48.000000000 +0100
+++ anobjrmap6/include/asm-mips/kmap_types.h 2004-03-18 21:27:38.364059496 +0000
@@ -19,7 +19,6 @@ D(5) KM_BIO_SRC_IRQ,
D(6) KM_BIO_DST_IRQ,
D(7) KM_PTE0,
D(8) KM_PTE1,
-D(9) KM_PTE2,
D(10) KM_IRQ0,
D(11) KM_IRQ1,
D(12) KM_SOFTIRQ0,
--- anobjrmap5/include/asm-mips/pgtable-32.h 2004-03-11 01:56:09.000000000 +0000
+++ anobjrmap6/include/asm-mips/pgtable-32.h 2004-03-18 21:27:38.365059344 +0000
@@ -216,10 +216,4 @@ static inline pmd_t *pmd_offset(pgd_t *d
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })

-#ifdef CONFIG_64BIT_PHYS_ADDR
-typedef u64 pte_addr_t;
-#else
-typedef pte_t *pte_addr_t;
-#endif
-
#endif /* _ASM_PGTABLE_32_H */
--- anobjrmap5/include/asm-mips/pgtable-64.h 2004-03-11 01:56:07.000000000 +0000
+++ anobjrmap6/include/asm-mips/pgtable-64.h 2004-03-18 21:27:38.365059344 +0000
@@ -214,6 +214,4 @@ static inline pte_t mk_swap_pte(unsigned
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })

-typedef pte_t *pte_addr_t;
-
#endif /* _ASM_PGTABLE_64_H */
--- anobjrmap5/include/asm-mips/rmap.h 2003-07-02 22:00:11.000000000 +0100
+++ anobjrmap6/include/asm-mips/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef __ASM_RMAP_H
-#define __ASM_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif /* __ASM_RMAP_H */
--- anobjrmap5/include/asm-parisc/pgtable.h 2004-02-04 02:45:42.000000000 +0000
+++ anobjrmap6/include/asm-parisc/pgtable.h 2004-03-18 21:27:38.367059040 +0000
@@ -450,8 +450,6 @@ static inline void ptep_mkdirty(pte_t *p

#define pte_same(A,B) (pte_val(A) == pte_val(B))

-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */

#define io_remap_page_range remap_page_range
--- anobjrmap5/include/asm-parisc/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-parisc/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _PARISC_RMAP_H
-#define _PARISC_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-ppc/pgtable.h 2004-02-18 03:00:06.000000000 +0000
+++ anobjrmap6/include/asm-ppc/pgtable.h 2004-03-18 21:27:38.369058736 +0000
@@ -670,8 +670,6 @@ extern void kernel_set_cachemode (unsign
*/
#define pgtable_cache_init() do { } while (0)

-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
--- anobjrmap5/include/asm-ppc/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-ppc/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,9 +0,0 @@
-#ifndef _PPC_RMAP_H
-#define _PPC_RMAP_H
-
-/* PPC calls pte_alloc() before mem_map[] is setup ... */
-#define BROKEN_PPC_PTE_ALLOC_ONE
-
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-ppc64/pgalloc.h 2004-02-04 02:45:16.000000000 +0000
+++ anobjrmap6/include/asm-ppc64/pgalloc.h 2004-03-18 21:27:38.371058432 +0000
@@ -48,28 +48,43 @@ pmd_free(pmd_t *pmd)
pmd_populate_kernel(mm, pmd, page_address(pte_page))

static inline pte_t *
-pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr)
+pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+ pte_t *pte;
+ pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+ if (pte) {
+ struct page *ptepage = virt_to_page(pte);
+ ptepage->mapping = (void *) mm;
+ ptepage->index = address & PMD_MASK;
+ }
+ return pte;
}

static inline struct page *
pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = pte_alloc_one_kernel(mm, address);
-
- if (pte)
- return virt_to_page(pte);
-
+ pte_t *pte;
+ pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+ if (pte) {
+ struct page *ptepage = virt_to_page(pte);
+ ptepage->mapping = (void *) mm;
+ ptepage->index = address & PMD_MASK;
+ return ptepage;
+ }
return NULL;
}

static inline void pte_free_kernel(pte_t *pte)
{
+ virt_to_page(pte)->mapping = NULL;
kmem_cache_free(zero_cache, pte);
}

-#define pte_free(pte_page) pte_free_kernel(page_address(pte_page))
+static inline void pte_free(struct page *ptepage)
+{
+ ptepage->mapping = NULL;
+ kmem_cache_free(zero_cache, page_address(ptepage));
+}

struct pte_freelist_batch
{
--- anobjrmap5/include/asm-ppc64/pgtable.h 2004-03-11 01:56:12.000000000 +0000
+++ anobjrmap6/include/asm-ppc64/pgtable.h 2004-03-18 21:27:38.372058280 +0000
@@ -488,8 +488,6 @@ extern struct vm_struct * im_get_area(un
int region_type);
unsigned long im_free(void *addr);

-typedef pte_t *pte_addr_t;
-
long pSeries_lpar_hpte_insert(unsigned long hpte_group,
unsigned long va, unsigned long prpn,
int secondary, unsigned long hpteflags,
--- anobjrmap5/include/asm-ppc64/rmap.h 2002-07-24 22:03:39.000000000 +0100
+++ anobjrmap6/include/asm-ppc64/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,9 +0,0 @@
-#ifndef _PPC64_RMAP_H
-#define _PPC64_RMAP_H
-
-/* PPC64 calls pte_alloc() before mem_map[] is setup ... */
-#define BROKEN_PPC_PTE_ALLOC_ONE
-
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-s390/pgtable.h 2004-02-04 02:45:37.000000000 +0000
+++ anobjrmap6/include/asm-s390/pgtable.h 2004-03-18 21:27:38.375057824 +0000
@@ -764,8 +764,6 @@ extern inline pte_t mk_swap_pte(unsigned
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })

-typedef pte_t *pte_addr_t;
-
#ifndef __s390x__
# define PTE_FILE_MAX_BITS 26
#else /* __s390x__ */
--- anobjrmap5/include/asm-s390/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-s390/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _S390_RMAP_H
-#define _S390_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-sh/pgtable.h 2004-02-04 02:45:43.000000000 +0000
+++ anobjrmap6/include/asm-sh/pgtable.h 2004-03-18 21:27:38.376057672 +0000
@@ -263,8 +263,6 @@ extern void update_mmu_cache(struct vm_a

#define pte_same(A,B) (pte_val(A) == pte_val(B))

-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */

#define kern_addr_valid(addr) (1)
--- anobjrmap5/include/asm-sh/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-sh/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _SH_RMAP_H
-#define _SH_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-sparc/kmap_types.h 2004-01-09 06:00:23.000000000 +0000
+++ anobjrmap6/include/asm-sparc/kmap_types.h 2004-03-18 21:27:38.378057368 +0000
@@ -11,7 +11,6 @@ enum km_type {
KM_BIO_DST_IRQ,
KM_PTE0,
KM_PTE1,
- KM_PTE2,
KM_IRQ0,
KM_IRQ1,
KM_SOFTIRQ0,
--- anobjrmap5/include/asm-sparc/pgtable.h 2004-02-18 03:00:07.000000000 +0000
+++ anobjrmap6/include/asm-sparc/pgtable.h 2004-03-18 21:27:38.379057216 +0000
@@ -490,8 +490,6 @@ extern int io_remap_page_range(struct vm

#include <asm-generic/pgtable.h>

-typedef pte_t *pte_addr_t;
-
#endif /* !(__ASSEMBLY__) */

/* We provide our own get_unmapped_area to cope with VA holes for userland */
--- anobjrmap5/include/asm-sparc/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-sparc/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _SPARC_RMAP_H
-#define _SPARC_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-sparc64/pgtable.h 2004-01-09 06:00:23.000000000 +0000
+++ anobjrmap6/include/asm-sparc64/pgtable.h 2004-03-18 21:27:38.380057064 +0000
@@ -384,8 +384,6 @@ extern unsigned long get_fb_unmapped_are

extern void check_pgt_cache(void);

-typedef pte_t *pte_addr_t;
-
#endif /* !(__ASSEMBLY__) */

#endif /* !(_SPARC64_PGTABLE_H) */
--- anobjrmap5/include/asm-sparc64/rmap.h 2002-07-20 20:12:35.000000000 +0100
+++ anobjrmap6/include/asm-sparc64/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _SPARC64_RMAP_H
-#define _SPARC64_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-um/pgtable.h 2003-10-08 20:24:57.000000000 +0100
+++ anobjrmap6/include/asm-um/pgtable.h 2004-03-18 21:27:38.382056760 +0000
@@ -384,18 +384,6 @@ static inline pmd_t * pmd_offset(pgd_t *
#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)

-#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G)
-typedef u32 pte_addr_t;
-#endif
-
-#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G)
-typedef u64 pte_addr_t;
-#endif
-
-#if !defined(CONFIG_HIGHPTE)
-typedef pte_t *pte_addr_t;
-#endif
-
#define update_mmu_cache(vma,address,pte) do ; while (0)

/* Encode and de-code a swap entry */
--- anobjrmap5/include/asm-um/rmap.h 2002-09-16 03:20:25.000000000 +0100
+++ anobjrmap6/include/asm-um/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,6 +0,0 @@
-#ifndef __UM_RMAP_H
-#define __UM_RMAP_H
-
-#include "asm/arch/rmap.h"
-
-#endif
--- anobjrmap5/include/asm-v850/pgtable.h 2002-11-04 21:31:04.000000000 +0000
+++ anobjrmap6/include/asm-v850/pgtable.h 2004-03-18 21:27:38.384056456 +0000
@@ -5,8 +5,6 @@
#include <asm/page.h>


-typedef pte_t *pte_addr_t;
-
#define pgd_present(pgd) (1) /* pages are always present on NO_MM */
#define pgd_none(pgd) (0)
#define pgd_bad(pgd) (0)
--- anobjrmap5/include/asm-v850/rmap.h 2002-11-04 21:31:04.000000000 +0000
+++ anobjrmap6/include/asm-v850/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1 +0,0 @@
-/* Do not need anything here */
--- anobjrmap5/include/asm-x86_64/pgtable.h 2004-03-11 01:56:11.000000000 +0000
+++ anobjrmap6/include/asm-x86_64/pgtable.h 2004-03-18 21:27:38.386056152 +0000
@@ -390,8 +390,6 @@ extern inline pte_t pte_modify(pte_t pte
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })

-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */

extern int kern_addr_valid(unsigned long addr);
--- anobjrmap5/include/asm-x86_64/rmap.h 2002-10-16 04:29:25.000000000 +0100
+++ anobjrmap6/include/asm-x86_64/rmap.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-#ifndef _X8664_RMAP_H
-#define _X8664_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/mm/memory.c 2004-03-18 21:27:15.351557928 +0000
+++ anobjrmap6/mm/memory.c 2004-03-18 21:27:38.388055848 +0000
@@ -48,7 +48,6 @@
#include <linux/init.h>

#include <asm/pgalloc.h>
-#include <asm/rmap.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
@@ -105,7 +104,7 @@ static inline void free_one_pmd(struct m
}
page = pmd_page(*dir);
pmd_clear(dir);
- pgtable_remove_rmap(page);
+ dec_page_state(nr_page_table_pages);
pte_free_tlb(tlb, page);
}

@@ -164,7 +163,7 @@ pte_t fastcall * pte_alloc_map(struct mm
pte_free(new);
goto out;
}
- pgtable_add_rmap(new, mm, address);
+ inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
}
out:
@@ -190,7 +189,6 @@ pte_t fastcall * pte_alloc_kernel(struct
pte_free_kernel(new);
goto out;
}
- pgtable_add_rmap(virt_to_page(new), mm, address);
pmd_populate_kernel(mm, pmd, new);
}
out:

2004-03-19 02:42:04

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Thu, Mar 18, 2004 at 11:21:07PM +0000, Hugh Dickins wrote:
> + if (!spin_trylock(&mm->page_table_lock))
> + return 1;
> +
[..]
> + if (down_trylock(&mapping->i_shared_sem))
> + return 1;
> +

those two will hang your kernel in the workload I posted to the list a
few days ago.

With previous kernels the above didn't matter, but starting with
2.6.5-rc1 it does matter: if we cannot know whether it's referenced or
not, we must assume it's not and return 0, or it livelocks hard with
all tasks stuck and one must click reboot.

I recommend you share my objrmap patch; the objrmap should be exactly
the same for both of us. It took me a while to figure out the above
issue and fix it in the objrmap patch, since it was hard to suspect that
a change in 2.6.5-rc1 broke objrmap (there were no rejects, and objrmap
had been pretty much unchanged since the 2.5.x days for a year).
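
The livelock Andrea describes comes from the shrink_list() logic visible
in the vmscan.c hunks of the earlier patch: any page that page_referenced()
reports as referenced goes straight back to the active list. A tiny
stand-alone model of that feedback (purely illustrative, the names and
numbers are made up, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Model of page_referenced() when the lock it needs is always contended:
 * "fail_means_referenced" selects what a trylock failure reports. */
static int page_referenced(bool lock_contended, bool fail_means_referenced)
{
        if (lock_contended)
                return fail_means_referenced ? 1 : 0;
        return 0;                       /* lock taken, pte not young */
}

int main(void)
{
        for (int fail_means_referenced = 1; fail_means_referenced >= 0;
             fail_means_referenced--) {
                int reclaimed = 0;

                /* Shared memory swap load: the lock is contended every pass. */
                for (int pass = 0; pass < 1000; pass++) {
                        if (page_referenced(true, fail_means_referenced))
                                continue;       /* shrink_list(): activate, never reclaim */
                        reclaimed++;            /* page goes on to try_to_unmap() */
                }
                printf("trylock failure reported as %d: reclaimed %d of 1000\n",
                       fail_means_referenced, reclaimed);
        }
        return 0;
}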

2004-03-19 07:08:26

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Fri, 19 Mar 2004, Andrea Arcangeli wrote:
> On Thu, Mar 18, 2004 at 11:21:07PM +0000, Hugh Dickins wrote:
> > + if (!spin_trylock(&mm->page_table_lock))
> > + return 1;
> > +
> [..]
> > + if (down_trylock(&mapping->i_shared_sem))
> > + return 1;
> > +
>
> those two will hang your kernel in the workload I posted to the list a
> few days ago.

I missed the actual workload, will search the archives later.
Fear I won't reproduce it exactly, and more anxious to plug
the mremap-move and non-linear holes.

> With previous kernels the above didn't matter, but starting with
> 2.6.5-rc1 it does matter: if we cannot know whether it's referenced or
> not, we must assume it's not and return 0, or it livelocks hard with
> all tasks stuck and one must click reboot.

I don't much care whether we return 1 or 0 in that case; I'd be happy to
make the change if we understand _why_ it's suddenly become necessary.
I don't remember seeing an explanation from you (and fair enough, you
didn't want to get stuck on that detail) or anyone else.

> I recommend you share my objrmap patch; the objrmap should be exactly
> the same for both of us.

I can't take its mm/mmap.c (and if Martin keeps that page_table_lock
avoidance in his tree, then I think he shouldn't have followed your
advice to skip Dave's mmap_sem in unuse_process). But of course,
I could have started from exactly yours and then a patch to change
those back. Just so long as we're aware they're not identical.

Hmm, where's page_test_and_clear_dirty gone in your final objrmap.c?

There's a lot that could be shared between the two approaches.
Nice if we kept to the same struct page layout: I put int mapcount
after atomic_t count because almost all arches have atomic_t as an
int, so won't that placing save us 4 bytes on the 64-bit arches?

Hugh
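
Hugh's layout point is easy to check with a stand-alone program. The
structs below are illustrative stand-ins, not the real struct page: on an
LP64 arch, an int placed right after an int-sized atomic_t shares its
8-byte slot, whereas placing it after a pointer leaves padding holes.

#include <stdio.h>

/* Stand-ins only: shaped like the start of struct page, not the real thing. */
struct layout_paired {          /* int mapcount right after the int-sized count */
        unsigned long flags;
        int count;              /* atomic_t is an int on almost all arches */
        int mapcount;
        void *mapping;
        unsigned long index;
};

struct layout_split {           /* same fields, mapcount placed after a pointer */
        unsigned long flags;
        int count;
        void *mapping;
        int mapcount;
        unsigned long index;
};

int main(void)
{
        /* Typical LP64 result: 32 bytes vs 40 bytes. */
        printf("paired: %zu bytes\n", sizeof(struct layout_paired));
        printf("split:  %zu bytes\n", sizeof(struct layout_split));
        return 0;
}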

2004-03-19 14:38:13

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Thu, Mar 18, 2004 at 11:21:07PM +0000, Hugh Dickins wrote:
> First of six patches implementing full object-based rmap over 2.6.5-rc1,
> reviving my anonmm method to compare against Andrea's anon_vma method.
> I've not yet implemented Linus' early-COW solution to the mremap move
> issue, that will follow; handling of non-linear obj vmas also to follow.
> Sorry, not yet checked against wli's tree, he may have some fixes to it.

It would actually take serious rereading to verify that the issues I'd
fixed weren't ones I introduced myself. In that set of patches,
anobjrmap appeared alongside a page allocator rewrite, a top-down vma
allocation policy for i386, an arch/i386/mm/pgtable.c rewrite, wrapping
every modification to userspace ptes to track statistics wanted by
/proc/, highpmd, something that RCU'd inode->i_mmap{,_shared} missing
the needed smp_read_barrier_depends() calls, and using wrappers around
rwlocks to allow mapping->page_lock to be configured as an rwlock or
spinlock at compile-time thrown in for good measure, so there isn't
much of a way to rule out my own hacks. There was even experimental
junk at some point, e.g. to remove files_lock, in addition to a fair
number of other questionable/buggy patches I dumped instead of debugging.

The story of that tree is too tortuous and sad to tell. I'll put up a
new tree with a substantially different emphasis, comprised of
completely different patches, when I have enough material to warrant it.


-- wli

2004-03-19 16:15:43

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 5/6 anonmm

On Thu, 18 Mar 2004, Hugh Dickins wrote:

> + (*mapcount)--;

> + if (anonmm->mm && anonmm->mm->rss) {
> + referenced += page_referenced_one(
> + page, anonmm->mm, page->index, mapcount);
> + if (!*mapcount)
> + goto out;
> }

Brilliant little optimisation over what I thought Linus
proposed at first. This certainly removes the biggest
disadvantage I (and presumably Andrea) thought the mm-based
reverse mapping would have!

I like this code a lot...

--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

2004-03-19 17:11:48

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

Well, I'm pleased to say not only is your code stable in my tests, it's
also faster than partial objrmap (not by that much, but definitely
measurable). And of course, the code's cleaner. Kernbench & SDET are both
heavy on fork/exec/exit, so this should give these paths a heavy workout.
(this was on 16-way NUMA-Q).

Andrea, are you still working on your code at the moment, or is it ready
for others to play with? I'll make a run at that as well if you say it's
ready, though I think I might have lost track of the latest version ;-)

M.

Kernbench: (make -j N vmlinux, where N = 2 x num_cpus)
Elapsed System User CPU
2.6.5-rc1 45.75 102.49 577.39 1486.00
2.6.5-rc1-partial 44.84 85.75 576.63 1476.67
2.6.5-rc1-hugh 44.79 83.85 576.71 1474.67

Kernbench: (make -j N vmlinux, where N = 16 x num_cpus)
Elapsed System User CPU
2.6.5-rc1 46.99 121.95 580.82 1495.33
2.6.5-rc1-partial 45.09 97.16 579.59 1501.00
2.6.5-rc1-hugh 45.00 95.45 579.05 1498.67

Kernbench: (make -j vmlinux, maximal tasks)
Elapsed System User CPU
2.6.5-rc1 46.96 122.43 580.65 1495.00
2.6.5-rc1-partial 45.18 93.60 579.10 1488.33
2.6.5-rc1-hugh 44.89 91.04 578.49 1490.33


DISCLAIMER: SPEC(tm) and the benchmark name SDET(tm) are registered
trademarks of the Standard Performance Evaluation Corporation. This
benchmarking was performed for research purposes only, and the run results
are non-compliant and not-comparable with any published results.

Results are shown as percentages of the first set displayed

SDET 1 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 3.0%
2.6.5-rc1-partial 101.4% 1.3%
2.6.5-rc1-hugh 100.0% 2.9%

SDET 2 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 1.3%
2.6.5-rc1-partial 107.7% 1.0%
2.6.5-rc1-hugh 108.7% 1.5%

SDET 4 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.7%
2.6.5-rc1-partial 110.5% 0.6%
2.6.5-rc1-hugh 114.6% 1.3%

SDET 8 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.9%
2.6.5-rc1-partial 119.4% 0.5%
2.6.5-rc1-hugh 120.2% 1.1%

SDET 16 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.1%
2.6.5-rc1-partial 118.1% 0.2%
2.6.5-rc1-hugh 119.8% 0.4%

SDET 32 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.2%
2.6.5-rc1-partial 119.2% 1.0%
2.6.5-rc1-hugh 120.4% 0.4%

SDET 64 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.3%
2.6.5-rc1-partial 122.1% 0.5%
2.6.5-rc1-hugh 123.5% 0.4%

SDET 128 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.2%
2.6.5-rc1-partial 123.1% 0.4%
2.6.5-rc1-hugh 124.7% 0.7%


diffprofile from virgin to partial (kernbench)

520 30.0% do_no_page
450 9.6% __copy_from_user_ll
307 5.1% __copy_to_user_ll
93 9.2% __wake_up
89 4.8% schedule
...
-147 -44.8% free_pages_and_swap_cache
-224 -1.6% do_anonymous_page
-319 -9.3% zap_pte_range
-352 -8.9% find_get_page
-388 -30.3% release_pages
-448 -75.4% __pte_chain_free
-555 -1.1% default_idle
-907 -50.4% kmem_cache_free
-4647 -69.4% page_add_rmap
-21876 -90.4% page_remove_rmap
-29063 -16.8% total

And from partial to full:

773 0.0% page_add_anon_rmap
656 0.0% page_add_obj_rmap
367 0.0% set_page_dirty
263 7.3% find_get_page
76 3.4% do_page_fault
64 13.5% .text.lock.file_table
58 2.5% atomic_dec_and_lock
...
-104 -2.0% __copy_from_user_ll
-126 -8.7% free_hot_cold_page
-135 -100.0% pte_chain_alloc
-146 -100.0% __pte_chain_free
-164 -1.9% __d_lookup
-282 -100.0% __set_page_dirty_buffers
-345 -2.4% do_anonymous_page
-549 -23.7% page_remove_rmap
-574 -25.4% do_no_page
-852 -1.7% default_idle
-2046 -100.0% page_add_rmap
-3336 -2.3% total


2004-03-20 12:29:19

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Fri, Mar 19, 2004 at 09:11:23AM -0800, Martin J. Bligh wrote:
> Well, I'm pleased to say not only is your code stable in my tests, it's
> also faster than partial objrmap (not by that much, but definitely
> measurable). And of course, the code's cleaner. Kernbench & SDET are both
> heavy on fork/exec/exit, so this should give these paths a heavy workout.
> (this was on 16-way NUMA-Q).
>
> Andrea, are you still working on your code at the moment, or is it ready
> for others to play with? I'll make a run at that as well if you say it's
> ready, though I think I might have lost track of the latest version ;-)

I'm working on my code, yes; I think my code is finished. I prefer my
design for the various reasons explained in the other emails (you don't
swap, so you can't appreciate the benefits; you only have to check that
it performs as well as Hugh's code).

Hugh's and your code is unstable in objrmap; you can find the details in
the email I sent to Hugh. Mine is stable (it has been running such a
simulation just fine for a few days on a 4-way Xeon; without my objrmap
fixes it livelocks as soon as it hits swap).

You'll find my anon_vma in 2.6.5-rc1aa2; it's rock solid. Just apply the
whole patch and compare it with your other results below. Thanks.

2004-03-20 12:45:50

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Fri, Mar 19, 2004 at 07:08:26AM +0000, Hugh Dickins wrote:
> On Fri, 19 Mar 2004, Andrea Arcangeli wrote:
> > On Thu, Mar 18, 2004 at 11:21:07PM +0000, Hugh Dickins wrote:
> > > + if (!spin_trylock(&mm->page_table_lock))
> > > + return 1;
> > > +
> > [..]
> > > + if (down_trylock(&mapping->i_shared_sem))
> > > + return 1;
> > > +
> >
> > those two will hang your kernel in the workload I posted to the list a
> > few days ago.
>
> I missed the actual workload, will search the archives later.
> Fear I won't reproduce it exactly, and more anxious to plug
> the mremap-move and non-linear holes.
>
> > With previous kernels the above didn't matter, but starting with
> > 2.6.5-rc1 it does matter: if we cannot know whether it's referenced or
> > not, we must assume it's not and return 0, or it livelocks hard with
> > all tasks stuck and one must click reboot.
>
> I don't much care whether we return 1 or 0 in that case, be happy to
> make the change if we understand _why_ it's suddenly become necessary.
> I don't remember seeing an explanation from you (and fair enough, you
> didn't want to get stuck on that detail) or anyone else.

it's the changes in the 2.6.5-rc1 page_referenced usage that require us
to return 0; Andrew may want to elaborate on those details.

If you don't fix it, your set of patches will hang the box hard as soon
as you hit swap with a shared-memory swap load.
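
To make the point concrete, a minimal sketch of the distinction
(illustrative only, made-up function name, not the actual patch code):

/*
 * Sketch: what a per-mm page_referenced helper should report when it
 * cannot take the locks it needs.  Returning 1 ("referenced") on
 * trylock failure keeps a contended page permanently off the reclaim
 * path, which is what livelocks the shared-memory swap load;
 * returning 0 ("not referenced") lets reclaim make progress.
 */
static int page_referenced_one_sketch(struct page *page, struct mm_struct *mm)
{
	int referenced = 0;

	if (!spin_trylock(&mm->page_table_lock))
		return 0;	/* cannot tell: report not referenced */

	/* ... locate the pte for page and test-and-clear its young bit ... */

	spin_unlock(&mm->page_table_lock);
	return referenced;
}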

> > I recommend you to share my objrmap patch, the objrmap should be exactly
> > the same for both of us.
>
> I can't take its mm/mmap.c (and if Martin keeps that page_table_lock
> avoidance in his tree, then I think he shouldn't have followed your
> advice to skip Dave's mmap_sem in unuse_process). But of course,
> I could have started from exactly yours and then a patch to change
> those back. Just so long as we're aware they're not identical.
>
> Hmm, where's page_test_and_clear_dirty gone in your final objrmap.c?

There's no such thing in Dave's objrmap patch.

>
> There's a lot that could be shared between the two approaches.
> Nice if we kept to the same struct page layout: I put int mapcount
> after atomic_t count because almost all arches have atomic_t as an
> int, so won't that placing save us 4 bytes on the 64-bit arches?

my mapcount is an unsigned long, so it doesn't matter, but I think I
can make it an unsigned int; that sounds like a good idea, since I
doubt anybody will ever fork >4G processes with 2.6. Only after making
it an unsigned int will it matter to position it near the atomic_t on
64-bit.
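
To illustrate the layout point, a rough sketch (most fields omitted,
not the real struct page):

/* Rough sketch only: atomic_t is a 32-bit int on most 64-bit arches,
 * so a 32-bit mapcount placed right after it packs into the same
 * 8-byte slot, while an unsigned long mapcount costs a full 8 bytes
 * plus 4 bytes of padding after the atomic_t. */
struct sketch_packed {
	unsigned long flags;		/*  8 bytes */
	atomic_t count;			/*  4 bytes */
	unsigned int mapcount;		/*  4 bytes, same slot as count */
	unsigned long private;		/*  8 bytes */
};					/* 24 bytes on 64-bit */

struct sketch_unpacked {
	unsigned long flags;		/*  8 bytes */
	atomic_t count;			/*  4 bytes + 4 bytes padding */
	unsigned long mapcount;		/*  8 bytes */
	unsigned long private;		/*  8 bytes */
};					/* 32 bytes on 64-bit */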

2004-03-20 14:03:51

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Sat, Mar 20, 2004 at 01:30:09PM +0100, Andrea Arcangeli wrote:
> I'm working on my code yes, I think my code is finished, I prefer my
> design for the various reasons explained in the other emails (you don't
> swap so you can't appreciate the benefits, you only have to check that
> performs as well as Hugh's code).
> Hugh's and your code is unstable in objrmap, you can find the details in
> the email I sent to Hugh, mine is stable (running such simulation for a
> few days just fine on 4-way xeon, without my objrmap fixes it live locks
> as soon as it hits swap).
> You find my anon_vma in 2.6.5-rc1aa2, it's rock solid, just apply the
> whole patch and compare it with your other below results. thanks.

There's an outstanding issue that's biting people on ppc64, which is
that arch/ppc64/mm/tlb.c uses the mm pointer and virtual addresses that
used to be put into page->mapping and page->index respectively for
pagetable pages to assist updates to the inverted pagetable. Without
leaving that assignment and invalidation of page->mapping and page->index
in place or converting ppc64 to other methods of carrying out its
inverted pagetable updates, ppc64 (e.g. G5 Macs) support is broken.
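
Roughly the kind of bookkeeping involved, as an illustrative sketch
only (helper names are invented; this is not the actual arch/ppc64
code):

/* Sketch: the inverted-pagetable flush wants to know which mm and
 * virtual address range a pagetable page serves, and the historical
 * trick is to stash them in that page's ->mapping and ->index. */
static inline void ptepage_set_rmap(struct page *ptepage,
				    struct mm_struct *mm, unsigned long vaddr)
{
	ptepage->mapping = (struct address_space *) mm;
	ptepage->index = vaddr & PMD_MASK;
}

static inline void ptepage_clear_rmap(struct page *ptepage)
{
	ptepage->mapping = NULL;
}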


-- wli

2004-03-20 14:28:14

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Sat, Mar 20, 2004 at 06:03:41AM -0800, William Lee Irwin III wrote:
> On Sat, Mar 20, 2004 at 01:30:09PM +0100, Andrea Arcangeli wrote:
> > I'm working on my code yes, I think my code is finished, I prefer my
> > design for the various reasons explained in the other emails (you don't
> > swap so you can't appreciate the benefits, you only have to check that
> > performs as well as Hugh's code).
> > Hugh's and your code is unstable in objrmap, you can find the details in
> > the email I sent to Hugh, mine is stable (running such simulation for a
> > few days just fine on 4-way xeon, without my objrmap fixes it live locks
> > as soon as it hits swap).
> > You find my anon_vma in 2.6.5-rc1aa2, it's rock solid, just apply the
> > whole patch and compare it with your other below results. thanks.
>
> There's an outstanding issue that's biting people on ppc64, which is
> that arch/ppc64/mm/tlb.c uses the mm pointer and virtual addresses that
> used to be put into page->mapping and page->index respectively for
> pagetable pages to assist updates to the inverted pagetable. Without
> leaving that assignment and invalidation of page->mapping and page->index
> in place or converting ppc64 to other methods of carrying out its
> inverted pagetable updates, ppc64 (e.g. G5 Macs) support is broken.

agreed, I was talking about all archs but ppc of course when I said
rock solid (it's not yet applied to ppc at this time). After reading
Hugh's and Paul's comments it should be easily fixable (those pages are
not the ones mapped to userspace, so their page->mapping and
page->index can be reused). I'll try to find a cross compiler soon to
fix it. The other archs seem to be working fine already.

2004-03-20 15:56:38

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

> I'm working on my code yes, I think my code is finished, I prefer my
> design for the various reasons explained in the other emails (you don't
> swap so you can't appreciate the benefits, you only have to check that
> performs as well as Hugh's code).
>
> Hugh's and your code is unstable in objrmap, you can find the details in
> the email I sent to Hugh, mine is stable (running such simulation for a
> few days just fine on 4-way xeon, without my objrmap fixes it live locks
> as soon as it hits swap).
>
> You find my anon_vma in 2.6.5-rc1aa2, it's rock solid, just apply the
> whole patch and compare it with your other below results. thanks.

Mmmm, if you have a broken-out patch, it'd be preferable. If I were to
apply the whole of -mjb, I'd get a damned sight better results than
any of them, but that's not really a fair comparison ;-) I can at
least check it's stable for me that way, though.

I did find your broken-out anon-vma patch, but it's against something
else, maybe half-way up your tree or something, and I didn't bother
trying to fix it ;-)

M.

2004-03-20 16:18:17

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Sat, Mar 20, 2004 at 07:56:37AM -0800, Martin J. Bligh wrote:
> > I'm working on my code yes, I think my code is finished, I prefer my
> > design for the various reasons explained in the other emails (you don't
> > swap so you can't appreciate the benefits, you only have to check that
> > performs as well as Hugh's code).
> >
> > Hugh's and your code is unstable in objrmap, you can find the details in
> > the email I sent to Hugh, mine is stable (running such simulation for a
> > few days just fine on 4-way xeon, without my objrmap fixes it live locks
> > as soon as it hits swap).
> >
> > You find my anon_vma in 2.6.5-rc1aa2, it's rock solid, just apply the
> > whole patch and compare it with your other below results. thanks.
>
> Mmmm, if you have a broken out patch, it'd be preferable. If I were to
> apply the whole of -mjb, I'll get a damned sight better results than
> any of them, but that's not really a fair comparison ;-) I'll can at
> least check it's stable for me that way though.
>
> I did find your broken-out anon-vma patch, but it's against something
> else, maybe half-way up your tree or something, and I didn't bother
> trying to fix it ;-)

this one is against mainline, but you must use my objrmap patch too
which is fixed so it doesn't crash in 2.6.5-rc1.

http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.6/2.6.5-rc1-aa2/00100_objrmap-core-1.gz
http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.6/2.6.5-rc1-aa2/00101_anon_vma-2.gz

just backout your objrmap and apply the above two, it should apply
pretty well.

2004-03-20 16:40:34

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

OK, first I did the whole of -aa2, it boots OK, but panics as soon as I try
to connect with ssh. I'll try the broken out bits next.

M.

Unable to handle kernel NULL pointer dereference at virtual address 00000003
printing eip:
c013f504
*pde = 2e820001
*pte = 00000000
Oops: 0000 [#1]
SMP
CPU: 15
EIP: 0060:[<c013f504>] Not tainted
EFLAGS: 00010292 (2.6.5-rc1-aa2)
EIP is at do_no_page+0xc4/0x45c
eax: 00000000 ebx: 00000000 ecx: 00000000 edx: 00000000
esi: ee5eea60 edi: ee5a3ec8 ebp: ee9c52a0 esp: ee5a3e94
ds: 007b es: 007b ss: 0068
Process sshd (pid: 19069, threadinfo=ee5a2000 task=ee69a870)
Stack: 00000000 eeb43340 00000001 ee9c52a0 c3ba2820 00000000 40268000 ee5a3ec8
ee9c52a0 ee69a870 ee5eea60 00000000 ef6a6134 00000001 c013fa0f ee9c52a0
ee5eea60 40268000 00000001 eeb43340 eeea8008 ee9c52a0 ee69a870 ee5eea60
Call Trace:
[<c013fa0f>] handle_mm_fault+0xc7/0x190
[<c0114fc3>] do_page_fault+0x13b/0x540
[<c0114e88>] do_page_fault+0x0/0x540
[<c0140feb>] do_mmap_pgoff+0x4b7/0x5fc
[<c010d24b>] sys_mmap2+0x67/0x98
[<c01075f9>] error_code+0x2d/0x38

Code: 0f b6 41 03 8b 14 85 80 a9 35 c0 89 c8 2b 82 84 0b 00 00 69

2004-03-20 16:54:23

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Sat, Mar 20, 2004 at 08:40:27AM -0800, Martin J. Bligh wrote:
> OK, first I did the whole of -aa2, it boots OK, but panics as soon as I try
> to connect with ssh. I'll try the broken out bits next.
>
> M.
>
> Unable to handle kernel NULL pointer dereference at virtual address 00000003
> printing eip:
> c013f504
> *pde = 2e820001
> *pte = 00000000
> Oops: 0000 [#1]
> SMP
> CPU: 15
> EIP: 0060:[<c013f504>] Not tainted
> EFLAGS: 00010292 (2.6.5-rc1-aa2)
> EIP is at do_no_page+0xc4/0x45c
> eax: 00000000 ebx: 00000000 ecx: 00000000 edx: 00000000
> esi: ee5eea60 edi: ee5a3ec8 ebp: ee9c52a0 esp: ee5a3e94
> ds: 007b es: 007b ss: 0068
> Process sshd (pid: 19069, threadinfo=ee5a2000 task=ee69a870)
> Stack: 00000000 eeb43340 00000001 ee9c52a0 c3ba2820 00000000 40268000 ee5a3ec8
> ee9c52a0 ee69a870 ee5eea60 00000000 ef6a6134 00000001 c013fa0f ee9c52a0
> ee5eea60 40268000 00000001 eeb43340 eeea8008 ee9c52a0 ee69a870 ee5eea60
> Call Trace:
> [<c013fa0f>] handle_mm_fault+0xc7/0x190
> [<c0114fc3>] do_page_fault+0x13b/0x540
> [<c0114e88>] do_page_fault+0x0/0x540
> [<c0140feb>] do_mmap_pgoff+0x4b7/0x5fc
> [<c010d24b>] sys_mmap2+0x67/0x98
> [<c01075f9>] error_code+0x2d/0x38
>
> Code: 0f b6 41 03 8b 14 85 80 a9 35 c0 89 c8 2b 82 84 0b 00 00 69

this looks strange:

Code; c013f504 <filp_open+24/70>
00000000 <_EIP>:
Code; c013f504 <filp_open+24/70> <=====
0: 0f b6 41 03 movzbl 0x3(%ecx),%eax <=====
Code; c013f508 <filp_open+28/70>
4: 8b 14 85 80 a9 35 c0 mov 0xc035a980(,%eax,4),%edx
Code; c013f50f <filp_open+2f/70>
b: 89 c8 mov %ecx,%eax
Code; c013f511 <filp_open+31/70>
d: 2b 82 84 0b 00 00 sub 0xb84(%edx),%eax
Code; c013f517 <filp_open+37/70>
13: 69 00 00 00 00 00 imul $0x0,(%eax),%eax

%ecx is zero; I'm not sure what could actually look at a 3-byte offset
in do_no_page (in fact it looks like it was miscompiled). Can you send
me your vmlinux privately (or just the whole assembler for do_no_page)?

Looking at my vmlinux asm (the do_no_page C source is the same for
both of us), there's not a single movzbl instruction in the whole
function, and I can't see any dereference at a 0x3 offset either, nor
can I see what could generate that from the C code.

my compiler is:

Reading specs from /usr/lib/gcc-lib/i586-suse-linux/3.3.1/specs
Configured with: ../configure --enable-threads=posix --prefix=/usr
--with-local-prefix=/usr/local --infodir=/usr/share/info
--mandir=/usr/share/man --libdir=/usr/lib
--enable-languages=c,c++,f77,objc,java,ada --disable-checking
--enable-libgcj --with-gxx-include-dir=/usr/include/g++
--with-slibdir=/lib --with-system-zlib --enable-shared
--enable-__cxa_atexit i586-suse-linux
Thread model: posix
gcc version 3.3.1 (SuSE Linux)

Can you give it a try with that one, just in case? I made further use
of explicit regparm, and old compilers miscompile regparm.

2004-03-20 17:34:22

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

>> Unable to handle kernel NULL pointer dereference at virtual address 00000003
>> printing eip:
>> c013f504
>> *pde = 2e820001
>> *pte = 00000000
>> Oops: 0000 [#1]
>> SMP
>> CPU: 15
>> EIP: 0060:[<c013f504>] Not tainted
>> EFLAGS: 00010292 (2.6.5-rc1-aa2)
>> EIP is at do_no_page+0xc4/0x45c
>> eax: 00000000 ebx: 00000000 ecx: 00000000 edx: 00000000
>> esi: ee5eea60 edi: ee5a3ec8 ebp: ee9c52a0 esp: ee5a3e94
>> ds: 007b es: 007b ss: 0068
>> Process sshd (pid: 19069, threadinfo=ee5a2000 task=ee69a870)
>> Stack: 00000000 eeb43340 00000001 ee9c52a0 c3ba2820 00000000 40268000 ee5a3ec8
>> ee9c52a0 ee69a870 ee5eea60 00000000 ef6a6134 00000001 c013fa0f ee9c52a0
>> ee5eea60 40268000 00000001 eeb43340 eeea8008 ee9c52a0 ee69a870 ee5eea60
>> Call Trace:
>> [<c013fa0f>] handle_mm_fault+0xc7/0x190
>> [<c0114fc3>] do_page_fault+0x13b/0x540
>> [<c0114e88>] do_page_fault+0x0/0x540
>> [<c0140feb>] do_mmap_pgoff+0x4b7/0x5fc
>> [<c010d24b>] sys_mmap2+0x67/0x98
>> [<c01075f9>] error_code+0x2d/0x38
>>
>> Code: 0f b6 41 03 8b 14 85 80 a9 35 c0 89 c8 2b 82 84 0b 00 00 69
>
> this looks strange:
>
> Code; c013f504 <filp_open+24/70>
> 00000000 <_EIP>:
> Code; c013f504 <filp_open+24/70> <=====
> 0: 0f b6 41 03 movzbl 0x3(%ecx),%eax <=====
> Code; c013f508 <filp_open+28/70>
> 4: 8b 14 85 80 a9 35 c0 mov 0xc035a980(,%eax,4),%edx
> Code; c013f50f <filp_open+2f/70>
> b: 89 c8 mov %ecx,%eax
> Code; c013f511 <filp_open+31/70>
> d: 2b 82 84 0b 00 00 sub 0xb84(%edx),%eax
> Code; c013f517 <filp_open+37/70>
> 13: 69 00 00 00 00 00 imul $0x0,(%eax),%eax
>
> %ecx is zero, not sure what can actually look at a 3 byte offset in
> do_no_page (infact it looks like it was miscompiled), can you send me
> your vmlinux privately (or just the the whole assembler for do_no_page)?

I dumped stuff here:
ftp://ftp.kernel.org/pub/linux/kernel/people/mbligh/andrea
but it may take a few minutes to replicate. I'm using gcc 2.95.4; if
you're doing funny gcc 3.x-only stuff, that might explain it ;-) I'll
try a 3.3 build in a bit to confirm.

Dump of assembler code for function do_no_page:
0xc013f440 <do_no_page+0>: sub $0x28,%esp
0xc013f443 <do_no_page+3>: push %ebp
0xc013f444 <do_no_page+4>: push %edi
0xc013f445 <do_no_page+5>: push %esi
0xc013f446 <do_no_page+6>: push %ebx
0xc013f447 <do_no_page+7>: mov 0x4c(%esp,1),%edi
0xc013f44b <do_no_page+11>: mov 0x40(%esp,1),%edx
0xc013f44f <do_no_page+15>: movl $0x0,0x30(%esp,1)
0xc013f457 <do_no_page+23>: movl $0x0,0x2c(%esp,1)
0xc013f45f <do_no_page+31>: movl $0x1,0x34(%esp,1)
0xc013f467 <do_no_page+39>: mov 0x3c(%edx),%eax
0xc013f46a <do_no_page+42>: test %eax,%eax
0xc013f46c <do_no_page+44>: je 0xc013f474 <do_no_page+52>
0xc013f46e <do_no_page+46>: cmpl $0x0,0x8(%eax)
0xc013f472 <do_no_page+50>: jne 0xc013f4a0 <do_no_page+96>
0xc013f474 <do_no_page+52>: mov 0x44(%esp,1),%ecx
0xc013f478 <do_no_page+56>: push %ecx
0xc013f479 <do_no_page+57>: mov 0x4c(%esp,1),%esi
0xc013f47d <do_no_page+61>: push %esi
0xc013f47e <do_no_page+62>: mov 0x58(%esp,1),%eax
0xc013f482 <do_no_page+66>: push %eax
0xc013f483 <do_no_page+67>: push %edi
0xc013f484 <do_no_page+68>: mov 0x50(%esp,1),%edx
0xc013f488 <do_no_page+72>: push %edx
0xc013f489 <do_no_page+73>: mov 0x50(%esp,1),%ecx
0xc013f48d <do_no_page+77>: push %ecx
0xc013f48e <do_no_page+78>: call 0xc013f1d4 <do_anonymous_page>
0xc013f493 <do_no_page+83>: add $0x18,%esp
0xc013f496 <do_no_page+86>: jmp 0xc013f892 <do_no_page+1106>
0xc013f49b <do_no_page+91>: nop
0xc013f49c <do_no_page+92>: lea 0x0(%esi,1),%esi
0xc013f4a0 <do_no_page+96>: mov 0x3c(%esp,1),%esi
0xc013f4a4 <do_no_page+100>: movb $0x1,0x30(%esi)
0xc013f4a8 <do_no_page+104>: mov 0x40(%esp,1),%edi
0xc013f4ac <do_no_page+108>: mov 0x44(%edi),%eax
0xc013f4af <do_no_page+111>: test %eax,%eax
0xc013f4b1 <do_no_page+113>: je 0xc013f4c4 <do_no_page+132>
0xc013f4b3 <do_no_page+115>: mov 0x90(%eax),%eax
0xc013f4b9 <do_no_page+121>: mov %eax,0x30(%esp,1)
0xc013f4bd <do_no_page+125>: mov 0x60(%eax),%eax
0xc013f4c0 <do_no_page+128>: mov %eax,0x2c(%esp,1)
0xc013f4c4 <do_no_page+132>: lock addl $0x0,0x0(%esp,1)
0xc013f4ca <do_no_page+138>: mov 0x44(%esp,1),%ecx
0xc013f4ce <do_no_page+142>: lea 0x34(%esp,1),%edx
0xc013f4d2 <do_no_page+146>: mov %edx,0x1c(%esp,1)
0xc013f4d6 <do_no_page+150>: mov %ecx,0x18(%esp,1)
0xc013f4da <do_no_page+154>: andl $0xfffff000,0x18(%esp,1)
0xc013f4e2 <do_no_page+162>: mov 0x40(%esp,1),%esi
0xc013f4e6 <do_no_page+166>: mov 0x1c(%esp,1),%edi
0xc013f4ea <do_no_page+170>: mov 0x3c(%esi),%eax
0xc013f4ed <do_no_page+173>: push %edi
0xc013f4ee <do_no_page+174>: mov 0x1c(%esp,1),%edx
0xc013f4f2 <do_no_page+178>: push %edx
0xc013f4f3 <do_no_page+179>: push %esi
0xc013f4f4 <do_no_page+180>: mov 0x8(%eax),%eax
0xc013f4f7 <do_no_page+183>: call *%eax
0xc013f4f9 <do_no_page+185>: mov %eax,0x20(%esp,1)
0xc013f4fd <do_no_page+189>: add $0xc,%esp
0xc013f500 <do_no_page+192>: mov 0x14(%esp,1),%ecx
0xc013f504 <do_no_page+196>: movzbl 0x3(%ecx),%eax
0xc013f508 <do_no_page+200>: mov 0xc035a980(,%eax,4),%edx
0xc013f50f <do_no_page+207>: mov %ecx,%eax
0xc013f511 <do_no_page+209>: sub 0xb84(%edx),%eax
0xc013f517 <do_no_page+215>: imul $0xcccccccd,%eax,%eax
0xc013f51d <do_no_page+221>: sar $0x3,%eax
0xc013f520 <do_no_page+224>: add 0xb88(%edx),%eax
0xc013f526 <do_no_page+230>: cmp 0xc035e360,%eax
0xc013f52c <do_no_page+236>: jb 0xc013f536 <do_no_page+246>
0xc013f52e <do_no_page+238>: ud2a
0xc013f530 <do_no_page+240>: test %al,0xc0268680
0xc013f536 <do_no_page+246>: mov 0x14(%esp,1),%esi
0xc013f53a <do_no_page+250>: mov (%esi),%eax
0xc013f53c <do_no_page+252>: test $0x200000,%eax
0xc013f541 <do_no_page+257>: je 0xc013f550 <do_no_page+272>
0xc013f543 <do_no_page+259>: ud2a
0xc013f545 <do_no_page+261>: xchg %eax,0xc0268680
0xc013f54b <do_no_page+267>: nop
0xc013f54c <do_no_page+268>: lea 0x0(%esi,1),%esi
0xc013f550 <do_no_page+272>: mov 0x14(%esp,1),%edi
0xc013f554 <do_no_page+276>: mov (%edi),%eax
0xc013f556 <do_no_page+278>: test $0x100000,%eax
0xc013f55b <do_no_page+283>: je 0xc013f565 <do_no_page+293>
0xc013f55d <do_no_page+285>: ud2a
0xc013f55f <do_no_page+287>: mov %eax,0xc0268680
0xc013f565 <do_no_page+293>: mov 0x40(%esp,1),%edx
0xc013f569 <do_no_page+297>: mov 0x14(%edx),%eax
0xc013f56c <do_no_page+300>: mov %eax,0x28(%esp,1)
0xc013f570 <do_no_page+304>: andl $0x80000,0x28(%esp,1)
0xc013f578 <do_no_page+312>: mov %eax,%edx
0xc013f57a <do_no_page+314>: jne 0xc013f598 <do_no_page+344>
0xc013f57c <do_no_page+316>: mov 0x14(%esp,1),%ecx
0xc013f580 <do_no_page+320>: cmpl $0x0,0x1c(%ecx)
0xc013f584 <do_no_page+324>: je 0xc013f590 <do_no_page+336>
0xc013f586 <do_no_page+326>: mov (%ecx),%eax
0xc013f588 <do_no_page+328>: test $0x8,%ah
0xc013f58b <do_no_page+331>: je 0xc013f598 <do_no_page+344>
0xc013f58d <do_no_page+333>: lea 0x0(%esi),%esi
0xc013f590 <do_no_page+336>: ud2a
0xc013f592 <do_no_page+338>: xchg %eax,%ebx
0xc013f593 <do_no_page+339>: add $0xc0268680,%eax
0xc013f598 <do_no_page+344>: cmpl $0x0,0x14(%esp,1)
0xc013f59d <do_no_page+349>: jne 0xc013f5a6 <do_no_page+358>
0xc013f59f <do_no_page+351>: xor %eax,%eax
0xc013f5a1 <do_no_page+353>: jmp 0xc013f892 <do_no_page+1106>
0xc013f5a6 <do_no_page+358>: cmpl $0xffffffff,0x14(%esp,1)
0xc013f5ab <do_no_page+363>: jne 0xc013f5b6 <do_no_page+374>
0xc013f5ad <do_no_page+365>: mov 0x14(%esp,1),%eax
0xc013f5b1 <do_no_page+369>: jmp 0xc013f892 <do_no_page+1106>
0xc013f5b6 <do_no_page+374>: xor %ebp,%ebp
0xc013f5b8 <do_no_page+376>: cmpl $0x0,0x48(%esp,1)
0xc013f5bd <do_no_page+381>: je 0xc013f698 <do_no_page+600>
0xc013f5c3 <do_no_page+387>: test $0x8,%dl
0xc013f5c6 <do_no_page+390>: jne 0xc013f698 <do_no_page+600>
0xc013f5cc <do_no_page+396>: mov 0x40(%esp,1),%eax
0xc013f5d0 <do_no_page+400>: call 0xc0143a84 <anon_vma_prepare>
0xc013f5d5 <do_no_page+405>: test %eax,%eax
0xc013f5d7 <do_no_page+407>: jne 0xc013f856 <do_no_page+1046>
0xc013f5dd <do_no_page+413>: mov $0xffffe000,%eax
0xc013f5e2 <do_no_page+418>: and %esp,%eax
0xc013f5e4 <do_no_page+420>: mov 0x10(%eax),%eax
0xc013f5e7 <do_no_page+423>: shl $0x2,%eax
0xc013f5ea <do_no_page+426>: mov 0xc02b2cc0(%eax),%eax
0xc013f5f0 <do_no_page+432>: shl $0x2,%eax
0xc013f5f3 <do_no_page+435>: mov 0xc0347760(%eax),%ecx
0xc013f5f9 <do_no_page+441>: add $0x2588,%ecx
0xc013f5ff <do_no_page+447>: xor %edx,%edx
0xc013f601 <do_no_page+449>: mov $0xd2,%eax
0xc013f606 <do_no_page+454>: call 0xc0134eb0 <__alloc_pages>
0xc013f60b <do_no_page+459>: mov %eax,%ebp
0xc013f60d <do_no_page+461>: test %ebp,%ebp
0xc013f60f <do_no_page+463>: je 0xc013f856 <do_no_page+1046>
0xc013f615 <do_no_page+469>: push $0x3
0xc013f617 <do_no_page+471>: mov 0x18(%esp,1),%esi
0xc013f61b <do_no_page+475>: push %esi
0xc013f61c <do_no_page+476>: call 0xc0116028 <kmap_atomic>
0xc013f621 <do_no_page+481>: mov %eax,%ebx
0xc013f623 <do_no_page+483>: push $0x4
0xc013f625 <do_no_page+485>: push %ebp
0xc013f626 <do_no_page+486>: call 0xc0116028 <kmap_atomic>
0xc013f62b <do_no_page+491>: mov %eax,0x20(%esp,1)
0xc013f62f <do_no_page+495>: add $0x10,%esp
0xc013f632 <do_no_page+498>: mov 0x10(%esp,1),%edi
0xc013f636 <do_no_page+502>: mov $0x400,%ecx
0xc013f63b <do_no_page+507>: mov %ebx,%esi
0xc013f63d <do_no_page+509>: repz movsl %ds:(%esi),%es:(%edi)
0xc013f63f <do_no_page+511>: push $0x3
0xc013f641 <do_no_page+513>: push %ebx
0xc013f642 <do_no_page+514>: call 0xc01160b4 <kunmap_atomic>
0xc013f647 <do_no_page+519>: push $0x4
0xc013f649 <do_no_page+521>: mov 0x1c(%esp,1),%eax
0xc013f64d <do_no_page+525>: push %eax
0xc013f64e <do_no_page+526>: call 0xc01160b4 <kunmap_atomic>
0xc013f653 <do_no_page+531>: add $0x10,%esp
0xc013f656 <do_no_page+534>: mov 0x14(%esp,1),%edx
0xc013f65a <do_no_page+538>: mov (%edx),%eax
0xc013f65c <do_no_page+540>: test $0x8,%ah
0xc013f65f <do_no_page+543>: jne 0xc013f688 <do_no_page+584>
0xc013f661 <do_no_page+545>: mov 0x4(%edx),%eax
0xc013f664 <do_no_page+548>: test %eax,%eax
0xc013f666 <do_no_page+550>: jne 0xc013f670 <do_no_page+560>
0xc013f668 <do_no_page+552>: ud2a
0xc013f66a <do_no_page+554>: inc %ecx
0xc013f66b <do_no_page+555>: add %esp,0x8bc02686
0xc013f671 <do_no_page+561>: dec %esp
0xc013f672 <do_no_page+562>: and $0x14,%al
0xc013f674 <do_no_page+564>: lock decl 0x4(%ecx)
0xc013f678 <do_no_page+568>: sete %al
0xc013f67b <do_no_page+571>: test %al,%al
0xc013f67d <do_no_page+573>: je 0xc013f688 <do_no_page+584>
0xc013f67f <do_no_page+575>: mov 0x14(%esp,1),%eax
0xc013f683 <do_no_page+579>: call 0xc01399a0 <__page_cache_release>
0xc013f688 <do_no_page+584>: mov %ebp,%eax
0xc013f68a <do_no_page+586>: call 0xc0139914 <lru_cache_add_active>
0xc013f68f <do_no_page+591>: mov %ebp,0x14(%esp,1)
0xc013f693 <do_no_page+595>: mov $0x1,%ebp
0xc013f698 <do_no_page+600>: mov 0x3c(%esp,1),%esi
0xc013f69c <do_no_page+604>: lock decb 0x30(%esi)
0xc013f6a0 <do_no_page+608>: js 0xc013fd44 <.text.lock.memory+215>
0xc013f6a6 <do_no_page+614>: cmpl $0x0,0x30(%esp,1)
0xc013f6ab <do_no_page+619>: je 0xc013f704 <do_no_page+708>
0xc013f6ad <do_no_page+621>: mov 0x30(%esp,1),%edi
0xc013f6b1 <do_no_page+625>: mov 0x60(%edi),%eax
0xc013f6b4 <do_no_page+628>: cmp %eax,0x2c(%esp,1)
0xc013f6b8 <do_no_page+632>: je 0xc013f704 <do_no_page+708>
0xc013f6ba <do_no_page+634>: mov 0x60(%edi),%eax
0xc013f6bd <do_no_page+637>: mov %eax,0x2c(%esp,1)
0xc013f6c1 <do_no_page+641>: movb $0x1,0x30(%esi)
0xc013f6c5 <do_no_page+645>: mov 0x14(%esp,1),%edx
0xc013f6c9 <do_no_page+649>: mov (%edx),%eax
0xc013f6cb <do_no_page+651>: test $0x8,%ah
0xc013f6ce <do_no_page+654>: jne 0xc013f4e2 <do_no_page+162>
0xc013f6d4 <do_no_page+660>: mov 0x4(%edx),%eax
0xc013f6d7 <do_no_page+663>: test %eax,%eax
0xc013f6d9 <do_no_page+665>: jne 0xc013f6e3 <do_no_page+675>
0xc013f6db <do_no_page+667>: ud2a
0xc013f6dd <do_no_page+669>: inc %ecx
0xc013f6de <do_no_page+670>: add %esp,0x8bc02686
0xc013f6e4 <do_no_page+676>: dec %esp
0xc013f6e5 <do_no_page+677>: and $0x14,%al
0xc013f6e7 <do_no_page+679>: lock decl 0x4(%ecx)
0xc013f6eb <do_no_page+683>: sete %al
0xc013f6ee <do_no_page+686>: test %al,%al
0xc013f6f0 <do_no_page+688>: je 0xc013f4e2 <do_no_page+162>
0xc013f6f6 <do_no_page+694>: mov 0x14(%esp,1),%eax
0xc013f6fa <do_no_page+698>: call 0xc01399a0 <__page_cache_release>
0xc013f6ff <do_no_page+703>: jmp 0xc013f4e2 <do_no_page+162>
0xc013f704 <do_no_page+708>: mov 0x50(%esp,1),%esi
0xc013f708 <do_no_page+712>: mov (%esi),%eax
0xc013f70a <do_no_page+714>: mov 0x4(%esi),%edx
0xc013f70d <do_no_page+717>: shrd $0xc,%edx,%eax
0xc013f711 <do_no_page+721>: shr $0xc,%edx
0xc013f714 <do_no_page+724>: mov %eax,%edx
0xc013f716 <do_no_page+726>: shr $0x10,%eax
0xc013f719 <do_no_page+729>: movzbl 0xc02b3400(%eax),%eax
0xc013f720 <do_no_page+736>: mov 0xc0347760(,%eax,4),%eax
0xc013f727 <do_no_page+743>: sub 0x2658(%eax),%edx
0xc013f72d <do_no_page+749>: mov 0x2650(%eax),%eax
0xc013f733 <do_no_page+755>: lea (%edx,%edx,4),%edx
0xc013f736 <do_no_page+758>: lea (%eax,%edx,8),%edx
0xc013f739 <do_no_page+761>: push %edx
0xc013f73a <do_no_page+762>: call 0xc013cb8c <page_address>
0xc013f73f <do_no_page+767>: mov 0x48(%esp,1),%ecx
0xc013f743 <do_no_page+771>: shr $0x9,%ecx
0xc013f746 <do_no_page+774>: and $0xff8,%ecx
0xc013f74c <do_no_page+780>: lea (%ecx,%eax,1),%edi
0xc013f74f <do_no_page+783>: mov (%edi),%eax
0xc013f751 <do_no_page+785>: mov 0x4(%edi),%edx
0xc013f754 <do_no_page+788>: add $0x4,%esp
0xc013f757 <do_no_page+791>: xor %ecx,%ecx
0xc013f759 <do_no_page+793>: test %eax,%eax
0xc013f75b <do_no_page+795>: jne 0xc013f767 <do_no_page+807>
0xc013f75d <do_no_page+797>: mov $0x1,%eax
0xc013f762 <do_no_page+802>: test %edx,%edx
0xc013f764 <do_no_page+804>: cmove %eax,%ecx
0xc013f767 <do_no_page+807>: test %ecx,%ecx
0xc013f769 <do_no_page+809>: je 0xc013f806 <do_no_page+966>
0xc013f76f <do_no_page+815>: mov 0x14(%esp,1),%edx
0xc013f773 <do_no_page+819>: mov (%edx),%eax
0xc013f775 <do_no_page+821>: test $0x8,%ah
0xc013f778 <do_no_page+824>: jne 0xc013f781 <do_no_page+833>
0xc013f77a <do_no_page+826>: mov 0x3c(%esp,1),%ecx
0xc013f77e <do_no_page+830>: incl 0x68(%ecx)
0xc013f781 <do_no_page+833>: mov 0x14(%esp,1),%esi
0xc013f785 <do_no_page+837>: movzbl 0x3(%esi),%eax
0xc013f789 <do_no_page+841>: mov 0xc035a980(,%eax,4),%edx
0xc013f790 <do_no_page+848>: mov 0x40(%esp,1),%ecx
0xc013f794 <do_no_page+852>: mov %esi,%eax
0xc013f796 <do_no_page+854>: sub 0xb84(%edx),%eax
0xc013f79c <do_no_page+860>: imul $0xcccccccd,%eax,%eax
0xc013f7a2 <do_no_page+866>: sar $0x3,%eax
0xc013f7a5 <do_no_page+869>: add 0xb88(%edx),%eax
0xc013f7ab <do_no_page+875>: mov %eax,%edx
0xc013f7ad <do_no_page+877>: shr $0x14,%edx
0xc013f7b0 <do_no_page+880>: mov %edx,0x24(%esp,1)
0xc013f7b4 <do_no_page+884>: shl $0xc,%eax
0xc013f7b7 <do_no_page+887>: or 0x10(%ecx),%eax
0xc013f7ba <do_no_page+890>: mov %eax,0x20(%esp,1)
0xc013f7be <do_no_page+894>: mov 0x20(%esp,1),%ecx
0xc013f7c2 <do_no_page+898>: mov 0x24(%esp,1),%ebx
0xc013f7c6 <do_no_page+902>: cmpl $0x0,0x48(%esp,1)
0xc013f7cb <do_no_page+907>: je 0xc013f7e3 <do_no_page+931>
0xc013f7cd <do_no_page+909>: mov 0x40(%esp,1),%esi
0xc013f7d1 <do_no_page+913>: mov %ebx,%edx
0xc013f7d3 <do_no_page+915>: mov %ecx,%eax
0xc013f7d5 <do_no_page+917>: or $0x40,%al
0xc013f7d7 <do_no_page+919>: mov %eax,%ecx
0xc013f7d9 <do_no_page+921>: testb $0x2,0x14(%esi)
0xc013f7dd <do_no_page+925>: je 0xc013f7e3 <do_no_page+931>
0xc013f7df <do_no_page+927>: or $0x2,%al
0xc013f7e1 <do_no_page+929>: mov %eax,%ecx
0xc013f7e3 <do_no_page+931>: mov %ebx,0x4(%edi)
0xc013f7e6 <do_no_page+934>: mov %ecx,(%edi)
0xc013f7e8 <do_no_page+936>: cmpl $0x0,0x28(%esp,1)
0xc013f7ed <do_no_page+941>: jne 0xc013f842 <do_no_page+1026>
0xc013f7ef <do_no_page+943>: push %ebp
0xc013f7f0 <do_no_page+944>: mov 0x48(%esp,1),%ecx
0xc013f7f4 <do_no_page+948>: mov 0x44(%esp,1),%edx
0xc013f7f8 <do_no_page+952>: mov 0x18(%esp,1),%eax
0xc013f7fc <do_no_page+956>: call 0xc0143398 <page_add_rmap>
0xc013f801 <do_no_page+961>: add $0x4,%esp
0xc013f804 <do_no_page+964>: jmp 0xc013f842 <do_no_page+1026>
0xc013f806 <do_no_page+966>: mov 0x14(%esp,1),%edi
0xc013f80a <do_no_page+970>: mov (%edi),%eax
0xc013f80c <do_no_page+972>: test $0x8,%ah
0xc013f80f <do_no_page+975>: jne 0xc013f838 <do_no_page+1016>
0xc013f811 <do_no_page+977>: mov 0x4(%edi),%eax
0xc013f814 <do_no_page+980>: test %eax,%eax
0xc013f816 <do_no_page+982>: jne 0xc013f820 <do_no_page+992>
0xc013f818 <do_no_page+984>: ud2a
0xc013f81a <do_no_page+986>: inc %ecx
0xc013f81b <do_no_page+987>: add %esp,0x8bc02686
0xc013f821 <do_no_page+993>: push %esp
0xc013f822 <do_no_page+994>: and $0x14,%al
0xc013f824 <do_no_page+996>: lock decl 0x4(%edx)
0xc013f828 <do_no_page+1000>: sete %al
0xc013f82b <do_no_page+1003>: test %al,%al
0xc013f82d <do_no_page+1005>: je 0xc013f838 <do_no_page+1016>
0xc013f82f <do_no_page+1007>: mov 0x14(%esp,1),%eax
0xc013f833 <do_no_page+1011>: call 0xc01399a0 <__page_cache_release>
0xc013f838 <do_no_page+1016>: mov 0x3c(%esp,1),%ecx
0xc013f83c <do_no_page+1020>: movb $0x1,0x30(%ecx)
0xc013f840 <do_no_page+1024>: jmp 0xc013f850 <do_no_page+1040>
0xc013f842 <do_no_page+1026>: mov 0x3c(%esp,1),%esi
0xc013f846 <do_no_page+1030>: movb $0x1,0x30(%esi)
0xc013f84a <do_no_page+1034>: lea 0x0(%esi),%esi
0xc013f850 <do_no_page+1040>: mov 0x34(%esp,1),%eax
0xc013f854 <do_no_page+1044>: jmp 0xc013f892 <do_no_page+1106>
0xc013f856 <do_no_page+1046>: mov 0x14(%esp,1),%edi
0xc013f85a <do_no_page+1050>: mov (%edi),%eax
0xc013f85c <do_no_page+1052>: test $0x8,%ah
0xc013f85f <do_no_page+1055>: jne 0xc013f888 <do_no_page+1096>
0xc013f861 <do_no_page+1057>: mov 0x4(%edi),%eax
0xc013f864 <do_no_page+1060>: test %eax,%eax
0xc013f866 <do_no_page+1062>: jne 0xc013f870 <do_no_page+1072>
0xc013f868 <do_no_page+1064>: ud2a
0xc013f86a <do_no_page+1066>: inc %ecx
0xc013f86b <do_no_page+1067>: add %esp,0x8bc02686
0xc013f871 <do_no_page+1073>: push %esp
0xc013f872 <do_no_page+1074>: and $0x14,%al
0xc013f874 <do_no_page+1076>: lock decl 0x4(%edx)
0xc013f878 <do_no_page+1080>: sete %al
0xc013f87b <do_no_page+1083>: test %al,%al
0xc013f87d <do_no_page+1085>: je 0xc013f888 <do_no_page+1096>
0xc013f87f <do_no_page+1087>: mov 0x14(%esp,1),%eax
0xc013f883 <do_no_page+1091>: call 0xc01399a0 <__page_cache_release>
0xc013f888 <do_no_page+1096>: movl $0xffffffff,0x34(%esp,1)
0xc013f890 <do_no_page+1104>: jmp 0xc013f850 <do_no_page+1040>
0xc013f892 <do_no_page+1106>: pop %ebx
0xc013f893 <do_no_page+1107>: pop %esi
0xc013f894 <do_no_page+1108>: pop %edi
0xc013f895 <do_no_page+1109>: pop %ebp
0xc013f896 <do_no_page+1110>: add $0x28,%esp
0xc013f899 <do_no_page+1113>: ret
0xc013f89a <do_no_page+1114>: mov %esi,%esi
End of assembler dump.

2004-03-20 18:49:44

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

sorry, it crashed in pfn_valid because NOPAGE_SIGBUS was returned, so
it was my mistake; it will be corrected in -aa3.
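
For reference, a sketch of the guard ordering in mainline do_no_page
(reconstructed from memory of the code of that era, not the -aa code):

	/* The ->nopage() error cookies must be checked before new_page
	 * is ever treated as a struct page (pfn_valid, zone lookup...). */
	if (new_page == NOPAGE_SIGBUS)		/* i.e. NULL */
		return VM_FAULT_SIGBUS;
	if (new_page == NOPAGE_OOM)
		return VM_FAULT_OOM;
	/* only past this point may new_page be dereferenced */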

2004-03-21 16:30:37

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

>> Mmmm, if you have a broken out patch, it'd be preferable. If I were to
>> apply the whole of -mjb, I'll get a damned sight better results than
>> any of them, but that's not really a fair comparison ;-) I'll can at
>> least check it's stable for me that way though.
>>
>> I did find your broken-out anon-vma patch, but it's against something
>> else, maybe half-way up your tree or something, and I didn't bother
>> trying to fix it ;-)
>
> this one is against mainline, but you must use my objrmap patch too
> which is fixed so it doesn't crash in 2.6.5-rc1.
>
> http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.6/2.6.5-rc1-aa2/00100_objrmap-core-1.gz
> http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.6/2.6.5-rc1-aa2/00101_anon_vma-2.gz
>
> just backout your objrmap and apply the above two, it should apply
> pretty well.

I tried the aa3 equiv of the above, just on top of virgin 2.6.5-rc1, but
it doesn't work cleanly. Your whole aa3 tree runs nicely, but I'd prefer
to have the broken out patch before publishing comparisons, as otherwise
it's a bit unfair ;-) I'm not sure if the results come from your anon_vma
approach, or other patches in your tree ...

I'm presuming you shifted the cost of find_get_page into find_trylock_page
and pgd_ctor into pgd_alloc from the profiles below ...

diffprofile from partial objrmap to aa3:

3809 27207.1% find_trylock_page
569 2845.0% pgd_alloc
242 60.7% dentry_open
100 21.5% do_page_cache_readahead
95 0.0% anon_vma_unlink
67 0.0% anon_vma_prepare
...
-118 -8.1% free_hot_cold_page
-119 -6.7% buffered_rmqueue
-120 -0.8% do_anonymous_page
-131 -2.1% __copy_to_user_ll
-135 -100.0% pte_chain_alloc
-143 -13.3% clear_page_tables
-146 -100.0% __pte_chain_free
-149 -10.7% link_path_walk
-221 -100.0% radix_tree_lookup
-275 -100.0% .text.lock.filemap
-372 -12.0% zap_pte_range
-397 -100.0% pgd_ctor
-584 -25.9% do_no_page
-807 -34.8% page_remove_rmap
-1171 -57.2% page_add_rmap
-1664 -3.4% default_idle
-3564 -99.3% find_get_page
-6182 -4.3% total

diffprofile from hugh's to aa3:

3809 27207.1% find_trylock_page
875 0.0% page_add_rmap
568 2704.8% pgd_alloc
264 0.0% __set_page_dirty_buffers
256 66.5% dentry_open
225 1.6% do_anonymous_page
138 1.7% __d_lookup
97 20.7% do_page_cache_readahead
95 0.0% anon_vma_unlink
76 6.5% file_move
67 0.0% anon_vma_prepare
61 1.2% __copy_from_user_ll
...
-51 -32.7% vma_link
-52 -40.0% fd_install
-52 -2.3% do_page_fault
-55 -10.3% kmap_atomic
-64 -11.9% .text.lock.file_table
-74 -3.2% atomic_dec_and_lock
-77 -16.6% copy_page_range
-79 -1.3% __copy_to_user_ll
-89 -4.4% path_lookup
-92 -11.0% pte_alloc_one
-98 -7.3% link_path_walk
-102 -5.8% buffered_rmqueue
-116 -12.7% release_pages
-129 -12.2% clear_page_tables
-230 -100.0% radix_tree_lookup
-258 -14.6% page_remove_rmap
-303 -100.0% .text.lock.filemap
-342 -93.2% set_page_dirty
-394 -12.6% zap_pte_range
-404 -100.0% pgd_ctor
-656 -100.0% page_add_obj_rmap
-773 -100.0% page_add_anon_rmap
-812 -1.7% default_idle
-2846 -2.0% total
-3827 -99.4% find_get_page



2004-03-21 23:51:22

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Sun, Mar 21, 2004 at 08:30:34AM -0800, Martin J. Bligh wrote:
> >> Mmmm, if you have a broken out patch, it'd be preferable. If I were to
> >> apply the whole of -mjb, I'll get a damned sight better results than
> >> any of them, but that's not really a fair comparison ;-) I'll can at
> >> least check it's stable for me that way though.
> >>
> >> I did find your broken-out anon-vma patch, but it's against something
> >> else, maybe half-way up your tree or something, and I didn't bother
> >> trying to fix it ;-)
> >
> > this one is against mainline, but you must use my objrmap patch too
> > which is fixed so it doesn't crash in 2.6.5-rc1.
> >
> > http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.6/2.6.5-rc1-aa2/00100_objrmap-core-1.gz
> > http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.6/2.6.5-rc1-aa2/00101_anon_vma-2.gz
> >
> > just backout your objrmap and apply the above two, it should apply
> > pretty well.
>
> I tried the aa3 equiv of the above, just on top of virgin 2.6.5-rc1, but
> it doesn't work cleanly. Your whole aa3 tree runs nicely, but I'd prefer
> to have the broken out patch before publishing comparisons, as otherwise
> it's a bit unfair ;-) I'm not sure if the results come from your anon_vma
> approach, or other patches in your tree ...
>
> I'm presuming you shifted the cost of find_get_page into find_trylock_page
> and pgd_ctor into pgd_alloc from the profiles below ...

I cannot see how find_trylock_page can be affected by my anon_vma
changes. The only difference I can see is that Andrew's -mm writeback
code adds the _irq to the spinlocks there, and I don't see other
obvious changes in that function. I included all the -mm writeback
changes primarily to avoid maintaining two slightly different versions
of anon_vma, and secondly to nuke the page->list. Other trees I'm
dealing with daily have those applied already. At the very least, the
additional cost that you measured cannot be associated in any way with
the allocation and maintenance of the anon_vma, since that
find_trylock_page cost is a per-page pagecache thing absolutely
unrelated to the anon_vma costs.
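
Roughly, the only change to that function from the -mm writeback work
(sketched from memory, not a verbatim diff):

struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
{
	struct page *page;

	spin_lock_irq(&mapping->page_lock);	/* -mm: was spin_lock() */
	page = radix_tree_lookup(&mapping->page_tree, offset);
	if (page && TestSetPageLocked(page))
		page = NULL;
	spin_unlock_irq(&mapping->page_lock);	/* -mm: was spin_unlock() */
	return page;
}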

It's probably best that I port my version of objrmap (basically the
same as yours but with the shm swapout fixes) + anon_vma to your tree;
it's not a big effort to do the porting once. I applied Andrew's
patches primarily to avoid porting back and forth all the time.

Just tell me which is exactly the codebase I should port against and
I'll send you a patch shortly.

Thanks!

2004-03-22 15:53:07

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

>> I tried the aa3 equiv of the above, just on top of virgin 2.6.5-rc1, but
>> it doesn't work cleanly. Your whole aa3 tree runs nicely, but I'd prefer
>> to have the broken out patch before publishing comparisons, as otherwise
>> it's a bit unfair ;-) I'm not sure if the results come from your anon_vma
>> approach, or other patches in your tree ...
>>
>> I'm presuming you shifted the cost of find_get_page into find_trylock_page
>> and pgd_ctor into pgd_alloc from the profiles below ...
>
> I cannot see how can find_trylock_page be affected by my anon_vma
> changes. The only difference I can see is taht Andrew's -mm writeback
> code is adding the _irq to the spinlocks there and I don't see other
> obvious changes in that function. I included all -mm writeback changes
> primarly to avoid me to maintain two slightly different versions of
> anon_vma and secondly to nuke the page->list. Other trees I'm dealing
> with daily have those applied already. At the very least that
> additional cost that you measured cannot be associated in any way with
> the allocation and maintainace of the anon_vma, since that
> find_trylock_page cost is a per-page pagecache thing absolutely
> unrelated to the anon_vmas costs.
>
> It's probably best that I port my version of objrmap (basically the same
> as yours but with the shm swapout fixes) + anon_vma to your tree, it's
> not a big effort to do the porting once, I applied Andrew's patches
> primarly to avoid porting back and forth all the time.
>
> Just tell me which is exactly the codebase I should port against and
> I'll send you a patch shortly.

Just against 2.6.5-rc1 virgin is easiest - that's what I was doing the
rest of it against ...

Thanks,

M.

2004-03-22 20:38:18

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH] anobjrmap 7/6 mremap moves

anobjrmap 7/6 handle mremap movements

The six anobjrmap patches I posted last week, based on 2.6.5-rc1,
left two issues outstanding: locating anon pages after mremap move
of inherited vma, and locating obj pages in a non-linear vma. I still
haven't done the non-linear, but this handles the mremap move issue,
following Linus' suggestion to just copy pages in the rare case.

It looks a lot more complicated than Linus was imagining, for several
unrelated reasons. The prime reason is that pages might already have
been swapped out, we'd need to copy those too if shared with another.
So... we swap them back in to make it easy to find them later if we
need to swap them out... hmmm, have we got our priorities right?
Maybe: it is in practice a very rare case, we're just making sure
nobody can clog up the system with mysteriously unswappable pages.

The awkward part is identifying when it is that rare case. I doubt
anyone else is going to love my rmap_needs_broken_cow function; but
even if this version of rmap goes nowhere, or ends up doing something
other than copying the pages, I think the alternative will always
involve some penalty in the rare shared anon case, so we'll still want
to identify that case, and the rmap_needs_broken_cow code will still
be needed (under another name). Please examine it with scepticism,
especially that smp_rmb() - a giveaway that I'm out of my depth.

There's also more change than expected because, if we're using up
memory to move the vma, and that's a large vma, then we may well
need to free memory from that vma: but in the past the new copy
of the vma has been hidden from the page stealer until afterwards.
Any good reason? I don't see why, and have changed that.

On previous visits to move_vma I've just changed what I had to and
run away again. This time I've made it much more how I think it
should be: check map_count at the start, let vma_merge decide on
vma merging, apply do_munmap to old if successful or new if failed,
get accounting right on failure. Even if anobjrmap codes ends up
in the bin, there's changes here I'd want to put back in mainline.
But I admit that so far this has had less testing than it needs.

Unrelated, but fixes to a couple of points from Rajesh: we need to
be careful about the ordering of the i_mmap lists in case there's a
racing vmtruncate. There was a good comment in fork.c, but someone
chose exactly the wrong way round when converting to list_add_tail.

One non-issue: I thought there would be a strict commit accounting
difficulty with copying the moved pages in a readonly mapping. But
in fact not: if they're now anon pages (not file or zero), ancestor
must have been writable in the past, VM_ACCOUNT set then and now
inherited, dup_mmap has already made the pessimistic deduction.

--- anobjrmap6/include/linux/mm.h 2004-03-17 22:00:34.000000000 +0000
+++ anobjrmap7/include/linux/mm.h 2004-03-21 21:45:36.741181120 +0000
@@ -511,6 +511,9 @@ extern void si_meminfo_node(struct sysin
extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
struct rb_node **, struct rb_node *);
+extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
+ unsigned long addr, unsigned long len, unsigned long pgoff);
+extern void vma_relink_file(struct vm_area_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
--- anobjrmap6/include/linux/swap.h 2004-03-17 22:00:34.000000000 +0000
+++ anobjrmap7/include/linux/swap.h 2004-03-21 20:18:35.688901712 +0000
@@ -214,6 +214,8 @@ extern void swap_free(swp_entry_t);
extern void free_swap_and_cache(swp_entry_t);
extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
extern struct swap_info_struct *get_swap_info_struct(unsigned);
+extern struct swap_info_struct *swap_info_get(swp_entry_t);
+extern void swap_info_put(struct swap_info_struct *);
extern int can_share_swap_page(struct page *);
extern int remove_exclusive_swap_page(struct page *);

--- anobjrmap6/kernel/fork.c 2004-03-17 22:00:34.000000000 +0000
+++ anobjrmap7/kernel/fork.c 2004-03-21 15:39:59.147202328 +0000
@@ -323,7 +323,7 @@ static inline int dup_mmap(struct mm_str

/* insert tmp into the share list, just after mpnt */
down(&file->f_mapping->i_shared_sem);
- list_add_tail(&tmp->shared, &mpnt->shared);
+ list_add(&tmp->shared, &mpnt->shared);
up(&file->f_mapping->i_shared_sem);
}

--- anobjrmap6/mm/memory.c 2004-03-17 22:32:46.000000000 +0000
+++ anobjrmap7/mm/memory.c 2004-03-21 16:56:15.291522648 +0000
@@ -1256,11 +1256,13 @@ static int do_swap_page(struct mm_struct

/* The page isn't present yet, go ahead with the fault. */

+ mm->rss++;
+ page_add_anon_rmap(page, mm, address);
+
swap_free(entry);
if (vm_swap_full())
remove_exclusive_swap_page(page);

- mm->rss++;
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1268,7 +1270,6 @@ static int do_swap_page(struct mm_struct

flush_icache_page(vma, page);
set_pte(page_table, pte);
- page_add_anon_rmap(page, mm, address);

/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
--- anobjrmap6/mm/mmap.c 2004-03-16 07:00:20.000000000 +0000
+++ anobjrmap7/mm/mmap.c 2004-03-22 18:06:48.622957992 +0000
@@ -383,7 +383,8 @@ can_vma_merge_after(struct vm_area_struc
* whether that can be merged with its predecessor or its successor. Or
* both (it neatly fills a hole).
*/
-static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
+static struct vm_area_struct *vma_merge(struct mm_struct *mm,
+ struct vm_area_struct *prev,
struct rb_node *rb_parent, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct file *file, unsigned long pgoff)
@@ -397,7 +398,7 @@ static int vma_merge(struct mm_struct *m
* vma->vm_flags & VM_SPECIAL, too.
*/
if (vm_flags & VM_SPECIAL)
- return 0;
+ return NULL;

i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;

@@ -410,7 +411,6 @@ static int vma_merge(struct mm_struct *m
* Can it merge with the predecessor?
*/
if (prev->vm_end == addr &&
- is_mergeable_vma(prev, file, vm_flags) &&
can_vma_merge_after(prev, vm_flags, file, pgoff)) {
struct vm_area_struct *next;
int need_up = 0;
@@ -441,12 +441,12 @@ static int vma_merge(struct mm_struct *m

mm->map_count--;
kmem_cache_free(vm_area_cachep, next);
- return 1;
+ return prev;
}
spin_unlock(lock);
if (need_up)
up(i_shared_sem);
- return 1;
+ return prev;
}

/*
@@ -457,7 +457,7 @@ static int vma_merge(struct mm_struct *m
merge_next:
if (!can_vma_merge_before(prev, vm_flags, file,
pgoff, (end - addr) >> PAGE_SHIFT))
- return 0;
+ return NULL;
if (end == prev->vm_start) {
if (file)
down(i_shared_sem);
@@ -467,11 +467,11 @@ static int vma_merge(struct mm_struct *m
spin_unlock(lock);
if (file)
up(i_shared_sem);
- return 1;
+ return prev;
}
}

- return 0;
+ return NULL;
}

/*
@@ -1484,5 +1484,57 @@ void insert_vm_struct(struct mm_struct *
if (__vma && __vma->vm_start < vma->vm_end)
BUG();
vma_link(mm, vma, prev, rb_link, rb_parent);
- validate_mm(mm);
+}
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, unsigned long pgoff)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *new_vma, *prev;
+ struct rb_node **rb_link, *rb_parent;
+
+ find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len,
+ vma->vm_flags, vma->vm_file, pgoff);
+ if (!new_vma) {
+ new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (new_vma) {
+ *new_vma = *vma;
+ INIT_LIST_HEAD(&new_vma->shared);
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
+ if (new_vma->vm_file)
+ get_file(new_vma->vm_file);
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
+ vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ }
+ }
+ return new_vma;
+}
+
+/*
+ * Position vma after prev in shared file list:
+ * for mremap move error recovery racing against vmtruncate.
+ */
+void vma_relink_file(struct vm_area_struct *vma, struct vm_area_struct *prev)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping;
+
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ if (mapping) {
+ down(&mapping->i_shared_sem);
+ spin_lock(&mm->page_table_lock);
+ list_move(&vma->shared, &prev->shared);
+ spin_unlock(&mm->page_table_lock);
+ up(&mapping->i_shared_sem);
+ }
+ }
}
--- anobjrmap6/mm/mremap.c 2004-03-17 22:00:34.000000000 +0000
+++ anobjrmap7/mm/mremap.c 2004-03-22 19:44:32.880455344 +0000
@@ -16,6 +16,8 @@
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/rmap.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
#include <linux/security.h>

#include <asm/uaccess.h>
@@ -79,30 +81,102 @@ static inline pte_t *alloc_one_pte_map(s
return pte;
}

-static void
-copy_one_pte(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pte_t *src, pte_t *dst)
+#ifdef CONFIG_SWAP
+/*
+ * rmap_needs_broken_cow is for mremap MAYMOVE's move_one_page.
+ * The anonmm objrmap can only track anon page movements if the
+ * page (or swap entry) is exclusive to the mm, but we don't
+ * want the waste of early COW break unless it's necessary.
+ * This tells us, with side-effect to update anon rmap if okay.
+ * page_table_lock (and mmap_sem) are held throughout.
+ */
+static int rmap_needs_broken_cow(pte_t *ptep, unsigned long new_addr)
{
- pte_t pte;
-
- pte = ptep_clear_flush(vma, old_addr, src);
- set_pte(dst, pte);
+ pte_t pte = *ptep;
+ unsigned long pfn;
+ struct page *page;
+ swp_entry_t entry;
+ struct swap_info_struct *si;
+ unsigned int mapcount = 0;

- /*
- * This block handles a common case, but is grossly inadequate
- * for the general case: what if the anon page is shared with
- * parent or child? what if it's currently swapped out?
- * Return to handle mremap moving rmap in a later patch.
- */
if (pte_present(pte)) {
- unsigned long pfn = pte_pfn(pte);
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
- if (PageAnon(page))
- page_update_anon_rmap(page, vma->vm_mm, new_addr);
+ pfn = pte_pfn(pte);
+ if (!pfn_valid(pfn))
+ return 0;
+ page = pfn_to_page(pfn);
+ if (!PageAnon(page))
+ return 0;
+ if (pte_write(pte))
+ goto update;
+again:
+ /*
+ * page->private on a PageAnon page is always the
+ * swap entry (if PageSwapCache) or 0 (if not):
+ * so we can peep at page->private without taking
+ * a lock, no need to check PageSwapCache too.
+ */
+ entry.val = page->private;
+ smp_rmb();
+ mapcount = page->mapcount;
+ if (mapcount > 1)
+ return 1;
+ if (!entry.val)
+ goto update;
+ /*
+ * This is tricky: entry can get freed right here,
+ * since we don't hold the page lock (and cannot wait
+ * for it). Use swap_duplicate which, already allows
+ * for that, before the less forgiving swap_info_get.
+ */
+ if (!swap_duplicate(entry))
+ goto again;
+ si = swap_info_get(entry);
+ if (si) {
+ mapcount = si->swap_map[swp_offset(entry)] +
+ page->mapcount - 2;
+ swap_info_put(si);
+ } else
+ mapcount = 0;
+ swap_free(entry);
+ if (entry.val != page->private)
+ goto again;
+ if (mapcount > 1)
+ return 1;
+update:
+ /* Before we forget the struct page, update its rmap */
+ page_update_anon_rmap(page, current->mm, new_addr);
+ return 0;
+ }
+
+ if (!pte_file(pte) && !pte_none(pte)) {
+ entry = pte_to_swp_entry(pte);
+ si = swap_info_get(entry);
+ if (si) {
+ page = NULL;
+ mapcount = si->swap_map[swp_offset(entry)];
+ if (mapcount == 2) {
+ page = lookup_swap_cache(entry);
+ if (page)
+ mapcount = page->mapcount + 1;
+ }
+ swap_info_put(si);
+ if (page)
+ page_cache_release(page);
}
}
+
+ return mapcount > 1;
}
+#else /* !CONFIG_SWAP */
+
+/*
+ * The swap interfaces used above are not available. Actually,
+ * all of the anonymous rmap is just a waste of space-time in this case.
+ * But no enthusiasm for peppering the code with #ifdefs right now.
+ */
+#define rmap_needs_broken_cow(ptep, new_addr) 0
+
+#endif /* CONFIG_SWAP */

static int
move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
@@ -132,10 +206,15 @@ move_one_page(struct vm_area_struct *vma
* page_table_lock, we should re-check the src entry...
*/
if (src) {
- if (dst)
- copy_one_pte(vma, old_addr, new_addr, src, dst);
- else
+ if (!dst)
error = -ENOMEM;
+ else if (rmap_needs_broken_cow(src, new_addr))
+ error = -EAGAIN;
+ else {
+ pte_t pte;
+ pte = ptep_clear_flush(vma, old_addr, src);
+ set_pte(dst, pte);
+ }
pte_unmap_nested(src);
}
pte_unmap(dst);
@@ -147,7 +226,8 @@ move_one_page(struct vm_area_struct *vma
static int move_page_tables(struct vm_area_struct *vma,
unsigned long new_addr, unsigned long old_addr, unsigned long len)
{
- unsigned long offset = len;
+ unsigned long offset = 0;
+ int ret;

flush_cache_range(vma, old_addr, old_addr + len);

@@ -156,137 +236,107 @@ static int move_page_tables(struct vm_ar
* easy way out on the assumption that most remappings will be
* only a few pages.. This also makes error recovery easier.
*/
- while (offset) {
- offset -= PAGE_SIZE;
- if (move_one_page(vma, old_addr + offset, new_addr + offset))
- goto oops_we_failed;
+ while (offset < len) {
+ ret = move_one_page(vma, old_addr+offset, new_addr+offset);
+ if (!ret) {
+ offset += PAGE_SIZE;
+ continue;
+ }
+ if (ret != -EAGAIN)
+ break;
+ /*
+ * The anonmm objrmap can only track anon page movements
+ * if the page (or swap entry) is exclusive to this mm.
+ * In the very unusual case when it's shared, break COW
+ * (take a copy of the page) to make it exclusive. If
+ * the page is shared and on swap, move_one_page will
+ * normally succeed on the third attempt (do_swap_page
+ * does not break COW); but under very great pressure it
+ * could get swapped out again and need more attempts.
+ */
+ ret = handle_mm_fault(vma->vm_mm, vma, old_addr+offset, 1);
+ if (ret != VM_FAULT_MINOR && ret != VM_FAULT_MAJOR)
+ break;
}
- return 0;
-
- /*
- * Ok, the move failed because we didn't have enough pages for
- * the new page table tree. This is unlikely, but we have to
- * take the possibility into account. In that case we just move
- * all the pages back (this will work, because we still have
- * the old page tables)
- */
-oops_we_failed:
- flush_cache_range(vma, new_addr, new_addr + len);
- while ((offset += PAGE_SIZE) < len)
- move_one_page(vma, new_addr + offset, old_addr + offset);
- zap_page_range(vma, new_addr, len);
- return -1;
+ return offset;
}

static unsigned long move_vma(struct vm_area_struct *vma,
- unsigned long addr, unsigned long old_len, unsigned long new_len,
- unsigned long new_addr)
+ unsigned long old_addr, unsigned long old_len,
+ unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *new_vma, *next, *prev;
- int allocated_vma;
+ struct vm_area_struct *new_vma;
+ unsigned long vm_flags = vma->vm_flags;
+ unsigned long new_pgoff;
+ unsigned long moved_len;
+ unsigned long excess = 0;
int split = 0;

- new_vma = NULL;
- next = find_vma_prev(mm, new_addr, &prev);
- if (next) {
- if (prev && prev->vm_end == new_addr &&
- can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
- !(vma->vm_flags & VM_SHARED)) {
- spin_lock(&mm->page_table_lock);
- prev->vm_end = new_addr + new_len;
- spin_unlock(&mm->page_table_lock);
- new_vma = prev;
- if (next != prev->vm_next)
- BUG();
- if (prev->vm_end == next->vm_start &&
- can_vma_merge(next, prev->vm_flags)) {
- spin_lock(&mm->page_table_lock);
- prev->vm_end = next->vm_end;
- __vma_unlink(mm, next, prev);
- spin_unlock(&mm->page_table_lock);
- if (vma == next)
- vma = prev;
- mm->map_count--;
- kmem_cache_free(vm_area_cachep, next);
- }
- } else if (next->vm_start == new_addr + new_len &&
- can_vma_merge(next, vma->vm_flags) &&
- !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
- spin_lock(&mm->page_table_lock);
- next->vm_start = new_addr;
- spin_unlock(&mm->page_table_lock);
- new_vma = next;
- }
- } else {
- prev = find_vma(mm, new_addr-1);
- if (prev && prev->vm_end == new_addr &&
- can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
- !(vma->vm_flags & VM_SHARED)) {
- spin_lock(&mm->page_table_lock);
- prev->vm_end = new_addr + new_len;
- spin_unlock(&mm->page_table_lock);
- new_vma = prev;
- }
- }
+ /*
+ * We'd prefer to avoid failure later on in do_munmap:
+ * which may split one vma into three before unmapping.
+ */
+ if (mm->map_count >= MAX_MAP_COUNT - 3)
+ return -ENOMEM;

- allocated_vma = 0;
- if (!new_vma) {
- new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!new_vma)
- goto out;
- allocated_vma = 1;
- }
-
- if (!move_page_tables(vma, new_addr, addr, old_len)) {
- unsigned long vm_locked = vma->vm_flags & VM_LOCKED;
-
- if (allocated_vma) {
- *new_vma = *vma;
- INIT_LIST_HEAD(&new_vma->shared);
- new_vma->vm_start = new_addr;
- new_vma->vm_end = new_addr+new_len;
- new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT;
- if (new_vma->vm_file)
- get_file(new_vma->vm_file);
- if (new_vma->vm_ops && new_vma->vm_ops->open)
- new_vma->vm_ops->open(new_vma);
- insert_vm_struct(current->mm, new_vma);
- }
+ new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+ new_vma = copy_vma(vma, new_addr, new_len, new_pgoff);
+ if (!new_vma)
+ return -ENOMEM;

- /* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vma->vm_flags & VM_ACCOUNT) {
- vma->vm_flags &= ~VM_ACCOUNT;
- if (addr > vma->vm_start) {
- if (addr + old_len < vma->vm_end)
- split = 1;
- } else if (addr + old_len == vma->vm_end)
- vma = NULL; /* it will be removed */
- } else
- vma = NULL; /* nothing more to do */
+ moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
+ if (moved_len < old_len) {
+ /*
+ * On error, move entries back from new area to old,
+ * which will succeed since page tables still there,
+ * and then proceed to unmap new area instead of old.
+ *
+ * Subtle point from Rajesh Venkatasubramanian: before
+ * moving file-based ptes, move new_vma before old vma
+ * in the i_mmap or i_mmap_shared list, so when racing
+ * against vmtruncate we cannot propagate pages to be
+ * truncated back from new_vma into just cleaned old.
+ */
+ vma_relink_file(vma, new_vma);
+ move_page_tables(new_vma, old_addr, new_addr, moved_len);
+ vma = new_vma;
+ old_len = new_len;
+ old_addr = new_addr;
+ new_addr = -ENOMEM;
+ }

- do_munmap(current->mm, addr, old_len);
+ /* Conceal VM_ACCOUNT so old reservation is not undone */
+ if (vm_flags & VM_ACCOUNT) {
+ vma->vm_flags &= ~VM_ACCOUNT;
+ excess = vma->vm_end - vma->vm_start - old_len;
+ if (old_addr > vma->vm_start &&
+ old_addr + old_len < vma->vm_end)
+ split = 1;
+ }

- /* Restore VM_ACCOUNT if one or two pieces of vma left */
- if (vma) {
- vma->vm_flags |= VM_ACCOUNT;
- if (split)
- vma->vm_next->vm_flags |= VM_ACCOUNT;
- }
+ if (do_munmap(mm, old_addr, old_len) < 0) {
+ /* OOM: unable to split vma, just get accounts right */
+ vm_unacct_memory(excess >> PAGE_SHIFT);
+ excess = 0;
+ }

- current->mm->total_vm += new_len >> PAGE_SHIFT;
- if (vm_locked) {
- current->mm->locked_vm += new_len >> PAGE_SHIFT;
- if (new_len > old_len)
- make_pages_present(new_addr + old_len,
- new_addr + new_len);
- }
- return new_addr;
+ /* Restore VM_ACCOUNT if one or two pieces of vma left */
+ if (excess) {
+ vma->vm_flags |= VM_ACCOUNT;
+ if (split)
+ vma->vm_next->vm_flags |= VM_ACCOUNT;
}
- if (allocated_vma)
- kmem_cache_free(vm_area_cachep, new_vma);
- out:
- return -ENOMEM;
+
+ mm->total_vm += new_len >> PAGE_SHIFT;
+ if (vm_flags & VM_LOCKED) {
+ mm->locked_vm += new_len >> PAGE_SHIFT;
+ if (new_len > old_len)
+ make_pages_present(new_addr + old_len,
+ new_addr + new_len);
+ }
+
+ return new_addr;
}

/*
@@ -430,6 +480,7 @@ unsigned long do_mremap(unsigned long ad
if (flags & MREMAP_MAYMOVE) {
if (!(flags & MREMAP_FIXED)) {
unsigned long map_flags = 0;
+
if (vma->vm_flags & VM_MAYSHARE)
map_flags |= MAP_SHARED;

--- anobjrmap6/mm/rmap.c 2004-03-17 22:00:34.000000000 +0000
+++ anobjrmap7/mm/rmap.c 2004-03-22 18:08:27.851872904 +0000
@@ -25,6 +25,8 @@
#include <linux/init.h>
#include <linux/rmap.h>

+#include <asm/tlbflush.h>
+
/*
* struct anonmm: to track a bundle of anonymous memory mappings.
*
@@ -483,7 +485,6 @@ static int try_to_unmap_one(struct page

/*
* If the page is mlock()d, we cannot swap it out.
- * During mremap, it's possible pages are not in a VMA.
*/
if (!vma)
vma = find_vma(mm, address);
--- anobjrmap6/mm/swap_state.c 2004-03-17 22:00:34.000000000 +0000
+++ anobjrmap7/mm/swap_state.c 2004-03-21 19:14:35.492700272 +0000
@@ -120,6 +120,7 @@ void __delete_from_swap_cache(struct pag
BUG_ON(PageWriteback(page));

radix_tree_delete(&swapper_space.page_tree, page->private);
+ page->private = 0;
ClearPageSwapCache(page);
total_swapcache_pages--;
pagecache_acct(-1);
--- anobjrmap6/mm/swapfile.c 2004-03-17 22:00:34.000000000 +0000
+++ anobjrmap7/mm/swapfile.c 2004-03-21 20:19:30.156621360 +0000
@@ -158,7 +158,7 @@ out:
return entry;
}

-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+struct swap_info_struct * swap_info_get(swp_entry_t entry)
{
struct swap_info_struct * p;
unsigned long offset, type;
@@ -197,7 +197,7 @@ out:
return NULL;
}

-static void swap_info_put(struct swap_info_struct * p)
+void swap_info_put(struct swap_info_struct * p)
{
swap_device_unlock(p);
swap_list_unlock();
@@ -254,7 +254,7 @@ static int exclusive_swap_page(struct pa
if (p->swap_map[swp_offset(entry)] == 1) {
/* Recheck the page count with the pagecache lock held.. */
spin_lock(&swapper_space.page_lock);
- if (page_count(page) - !!PagePrivate(page) == 2)
+ if (page_count(page) == 2)
retval = 1;
spin_unlock(&swapper_space.page_lock);
}

Subject: Re: [PATCH] anobjrmap 7/6 mremap moves


> +struct vm_area_struct *copy_vma(struct vm_area_struct *vma,
> + unsigned long addr, unsigned long len, unsigned long pgoff)
> +{
[snip]

> + find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
> + new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len,
> + vma->vm_flags, vma->vm_file, pgoff);
[snip]
> +}
>
> static unsigned long move_vma(struct vm_area_struct *vma,
> {
[snip]
> + new_vma = copy_vma(vma, new_addr, new_len, new_pgoff);
> + if (!new_vma)
> + return -ENOMEM;
[snip]
> + moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
> + if (moved_len < old_len) {
> + /*
> + * On error, move entries back from new area to old,
> + * which will succeed since page tables still there,
> + * and then proceed to unmap new area instead of old.
> + *
> + * Subtle point from Rajesh Venkatasubramanian: before
> + * moving file-based ptes, move new_vma before old vma
> + * in the i_mmap or i_mmap_shared list, so when racing
> + * against vmtruncate we cannot propagate pages to be
> + * truncated back from new_vma into just cleaned old.
> + */
> + vma_relink_file(vma, new_vma);
> + move_page_tables(new_vma, old_addr, new_addr, moved_len);
> + vma = new_vma;
> + old_len = new_len;
> + old_addr = new_addr;
> + new_addr = -ENOMEM;
> + }
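
(For reference, vma_relink_file itself isn't shown in this excerpt; a
minimal sketch of what such a helper might look like on the 2.6.5-era
list_head based i_mmap lists, assuming i_shared_sem for exclusion and
that both vmas map the same file - illustrative only, not necessarily
the real implementation:)

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>

static void vma_relink_file(struct vm_area_struct *old_vma,
			    struct vm_area_struct *new_vma)
{
	struct address_space *mapping = NULL;

	if (new_vma->vm_file)
		mapping = new_vma->vm_file->f_dentry->d_inode->i_mapping;
	if (!mapping)
		return;

	down(&mapping->i_shared_sem);
	/* Unlink new_vma from whichever i_mmap list it is on... */
	list_del(&new_vma->shared);
	/*
	 * ...and re-insert it immediately before old_vma, so that
	 * vmtruncate, which walks the list in order, always cleans
	 * new_vma before it reaches the old vma.
	 */
	list_add_tail(&new_vma->shared, &old_vma->shared);
	up(&mapping->i_shared_sem);
}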

IF prio_tree gets merged and IF we plan to go by this vma ordering
solution for fixing the vmtruncate vs. mremap race, then the vma_merge
in copy_vma can bite us. The prio_tree only keeps the vmas that
_exactly_ map the same file pages in a list. If two vmas map different
sets of file pages, then the ordering between them can change under
us if we drop i_shared_sem.

If we decide to go with the ordering solution, then my plan is to
set up an identical new_vma (one that exactly maps the same file pages
as vma) at new_addr and move the page tables. If the move fails, then
use vma_relink_file. If the move is successful, then we can do vma_merge.
I think we can think about these changes later. No hurry at this point.

Thanks,
Rajesh

2004-03-24 06:19:07

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Mon, Mar 22, 2004 at 07:53:02AM -0800, Martin J. Bligh wrote:
> Just against 2.6.5-rc1 virgin is easiest - that's what I was doing the
> rest of it against ...

here it is:

http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.5-rc1/anon-vma-2.6.5-rc2-aa2.gz
http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.5-rc1/objrmap-core-2.6.5-rc2-aa2.gz

2004-03-24 15:57:01

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap



--Andrea Arcangeli <[email protected]> wrote (on Wednesday, March 24, 2004 07:19:57 +0100):

> On Mon, Mar 22, 2004 at 07:53:02AM -0800, Martin J. Bligh wrote:
>> Just against 2.6.5-rc1 virgin is easiest - that's what I was doing the
>> rest of it against ...
>
> here it is:
>
> http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.5-rc1/anon-vma-2.6.5-rc2-aa2.gz
> http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.5-rc1/objrmap-core-2.6.5-rc2-aa2.gz
>
>

Yay, that works ;-) Without the rest of your tree, performance of anon_vma
is almost exactly = anon_mm ... of course all this is under no mem pressure,
I'll have to do some more tests on another machine without infinite ram to
see what happens as we start to reclaim ;-)

Kernbench: (make -j N vmlinux, where N = 2 x num_cpus)
Elapsed System User CPU
2.6.5-rc1 45.75 102.49 577.39 1486.00
2.6.5-rc1-partial 44.84 85.75 576.63 1476.67
2.6.5-rc1-hugh 44.79 83.85 576.71 1474.67
2.6.5-rc1-anon_vma 44.66 83.69 577.14 1479.00
2.6.5-rc1-aa3 44.57 81.57 577.45 1477.67

Kernbench: (make -j N vmlinux, where N = 16 x num_cpus)
Elapsed System User CPU
2.6.5-rc1 46.99 121.95 580.82 1495.33
2.6.5-rc1-partial 45.09 97.16 579.59 1501.00
2.6.5-rc1-hugh 45.00 95.45 579.05 1498.67
2.6.5-rc1-anon_vma 44.90 96.17 579.60 1503.67
2.6.5-rc1-aa3 45.03 93.27 579.84 1494.33

Kernbench: (make -j vmlinux, maximal tasks)
Elapsed System User CPU
2.6.5-rc1 46.96 122.43 580.65 1495.00
2.6.5-rc1-partial 45.18 93.60 579.10 1488.33
2.6.5-rc1-hugh 44.89 91.04 578.49 1490.33
2.6.5-rc1-anon_vma 44.92 91.96 578.86 1493.33
2.6.5-rc1-aa3 44.77 89.29 578.61 1491.33


DISCLAIMER: SPEC(tm) and the benchmark name SDET(tm) are registered
trademarks of the Standard Performance Evaluation Corporation. This
benchmarking was performed for research purposes only, and the run results
are non-compliant and not-comparable with any published results.

Results are shown as percentages of the first set displayed

SDET 1 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 3.0%
2.6.5-rc1-partial 101.4% 1.3%
2.6.5-rc1-hugh 100.0% 2.9%
2.6.5-rc1-anon_vma 101.4% 1.9%
2.6.5-rc1-aa3 104.1% 4.0%

SDET 2 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 1.3%
2.6.5-rc1-partial 107.7% 1.0%
2.6.5-rc1-hugh 108.7% 1.5%
2.6.5-rc1-anon_vma 109.5% 0.7%
2.6.5-rc1-aa3 107.4% 1.3%

SDET 4 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.7%
2.6.5-rc1-partial 110.5% 0.6%
2.6.5-rc1-hugh 114.6% 1.3%
2.6.5-rc1-anon_vma 113.3% 0.3%
2.6.5-rc1-aa3 116.1% 1.5%

SDET 8 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.9%
2.6.5-rc1-partial 119.4% 0.5%
2.6.5-rc1-hugh 120.2% 1.1%
2.6.5-rc1-anon_vma 119.6% 0.0%
2.6.5-rc1-aa3 124.4% 0.2%

SDET 16 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.1%
2.6.5-rc1-partial 118.1% 0.2%
2.6.5-rc1-hugh 119.8% 0.4%
2.6.5-rc1-anon_vma 119.9% 0.8%
2.6.5-rc1-aa3 122.1% 1.1%

SDET 32 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.2%
2.6.5-rc1-partial 119.2% 1.0%
2.6.5-rc1-hugh 120.4% 0.4%
2.6.5-rc1-anon_vma 121.8% 0.6%
2.6.5-rc1-aa3 121.1% 0.8%

SDET 64 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.3%
2.6.5-rc1-partial 122.1% 0.5%
2.6.5-rc1-hugh 123.5% 0.4%
2.6.5-rc1-anon_vma 123.3% 0.8%
2.6.5-rc1-aa3 123.0% 0.6%

SDET 128 (see disclaimer)
Throughput Std. Dev
2.6.5-rc1 100.0% 0.2%
2.6.5-rc1-partial 123.1% 0.4%
2.6.5-rc1-hugh 124.7% 0.7%
2.6.5-rc1-anon_vma 123.9% 0.3%
2.6.5-rc1-aa3 124.4% 0.1%


For interest's sake, here's the diffprofile for kernbench from
anon_mm to the whole -aa tree ...

3808 25386.7% find_trylock_page
568 2704.8% pgd_alloc
273 74.2% dentry_open
125 11.2% file_move
106 23.1% do_page_cache_readahead
64 0.5% do_anonymous_page
...
-64 -1.0% __copy_to_user_ll
-66 -12.2% .text.lock.file_table
-72 -0.8% __d_lookup
-78 -3.9% path_lookup
-84 -14.9% kmap_atomic
-92 -11.0% pte_alloc_one
-97 -13.7% generic_file_open
-106 -11.2% kmem_cache_free
-121 -13.2% release_pages
-126 -12.6% page_add_rmap
-137 -12.9% clear_page_tables
-212 -7.2% zap_pte_range
-235 -100.0% radix_tree_lookup
-239 -12.5% buffered_rmqueue
-268 -17.8% link_path_walk
-291 -100.0% .text.lock.filemap
-397 -20.8% page_remove_rmap
-398 -100.0% pgd_ctor
-461 -21.6% do_no_page
-669 -1.4% default_idle
-3508 -2.5% total
-3719 -99.4% find_get_page

zap_pte_range and page_remove_rmap and do_no_page are cheaper ... are we
setting up and tearing down pages less frequently somehow? Would be
curious to know which patch that is ...

M.

2004-03-24 16:20:31

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Wed, Mar 24, 2004 at 07:56:39AM -0800, Martin J. Bligh wrote:
>
>
> --Andrea Arcangeli <[email protected]> wrote (on Wednesday, March 24, 2004 07:19:57 +0100):
>
> > On Mon, Mar 22, 2004 at 07:53:02AM -0800, Martin J. Bligh wrote:
> >> Just against 2.6.5-rc1 virgin is easiest - that's what I was doing the
> >> rest of it against ...
> >
> > here it is:
> >
> > http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.5-rc1/anon-vma-2.6.5-rc2-aa2.gz
> > http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.5-rc1/objrmap-core-2.6.5-rc2-aa2.gz
> >
> >
>
> Yay, that works ;-) Without the rest of your tree, performance of anon_vma
> is almost exactly = anon_mm ... of course all this is under no mem pressure,
> I'll have to do some more tests on another machine without infinite ram to
> see what happens as we start to reclaim ;-)

excellent. under reclaim at least in theory you should see less cpu
utilization with anon_vma since the page links directly to the vmas.

> Kernbench: (make -j N vmlinux, where N = 2 x num_cpus)
> Elapsed System User CPU
> 2.6.5-rc1 45.75 102.49 577.39 1486.00
> 2.6.5-rc1-partial 44.84 85.75 576.63 1476.67
> 2.6.5-rc1-hugh 44.79 83.85 576.71 1474.67
> 2.6.5-rc1-anon_vma 44.66 83.69 577.14 1479.00

anon_vma is the fastest here.

> Kernbench: (make -j N vmlinux, where N = 16 x num_cpus)
> Elapsed System User CPU
> 2.6.5-rc1 46.99 121.95 580.82 1495.33
> 2.6.5-rc1-partial 45.09 97.16 579.59 1501.00
> 2.6.5-rc1-hugh 45.00 95.45 579.05 1498.67
> 2.6.5-rc1-anon_vma 44.90 96.17 579.60 1503.67

here again the fastest.

>
> Kernbench: (make -j vmlinux, maximal tasks)
> Elapsed System User CPU
> 2.6.5-rc1 46.96 122.43 580.65 1495.00
> 2.6.5-rc1-partial 45.18 93.60 579.10 1488.33
> 2.6.5-rc1-hugh 44.89 91.04 578.49 1490.33
> 2.6.5-rc1-anon_vma 44.92 91.96 578.86 1493.33

here it's not the fastest (though a 0.03 difference should be in the
error range with an unlimited -j)

I also left a zillion BUG_ONs enabled (in cpu-bound fast paths:
page_add_rmap/page-faults/pagecache etc.); those can in theory all be
removed.

> SDET 1 (see disclaimer)
> Throughput Std. Dev
> 2.6.5-rc1 100.0% 3.0%
> 2.6.5-rc1-partial 101.4% 1.3%
> 2.6.5-rc1-hugh 100.0% 2.9%
> 2.6.5-rc1-anon_vma 101.4% 1.9%

here it's as fast as plain objrmap.

> SDET 2 (see disclaimer)
> Throughput Std. Dev
> 2.6.5-rc1 100.0% 1.3%
> 2.6.5-rc1-partial 107.7% 1.0%
> 2.6.5-rc1-hugh 108.7% 1.5%
> 2.6.5-rc1-anon_vma 109.5% 0.7%

here it's the fastest.

> SDET 4 (see disclaimer)
> Throughput Std. Dev
> 2.6.5-rc1 100.0% 0.7%
> 2.6.5-rc1-partial 110.5% 0.6%
> 2.6.5-rc1-hugh 114.6% 1.3%
> 2.6.5-rc1-anon_vma 113.3% 0.3%

here it's 1% slower, though anonmm has a 1% standard deviation (higher
than all the others).

> SDET 8 (see disclaimer)
> Throughput Std. Dev
> 2.6.5-rc1 100.0% 0.9%
> 2.6.5-rc1-partial 119.4% 0.5%
> 2.6.5-rc1-hugh 120.2% 1.1%
> 2.6.5-rc1-anon_vma 119.6% 0.0%

here it's 1% slower than anonmm, though anonmm still has a 1% std deviation.

it's interesting that I get 0 standard deviation. Is it possible I get a
lower standard deviation because you run it fewer times? Just wondering.
I'd expect SDET has a default number of passes, so I expect the answer is
no, of course.

> SDET 16 (see disclaimer)
> Throughput Std. Dev
> 2.6.5-rc1 100.0% 0.1%
> 2.6.5-rc1-partial 118.1% 0.2%
> 2.6.5-rc1-hugh 119.8% 0.4%
> 2.6.5-rc1-anon_vma 119.9% 0.8%

here the fastest.

> SDET 32 (see disclaimer)
> Throughput Std. Dev
> 2.6.5-rc1 100.0% 0.2%
> 2.6.5-rc1-partial 119.2% 1.0%
> 2.6.5-rc1-hugh 120.4% 0.4%
> 2.6.5-rc1-anon_vma 121.8% 0.6%

more than 1% faster.

> SDET 64 (see disclaimer)
> Throughput Std. Dev
> 2.6.5-rc1 100.0% 0.3%
> 2.6.5-rc1-partial 122.1% 0.5%
> 2.6.5-rc1-hugh 123.5% 0.4%
> 2.6.5-rc1-anon_vma 123.3% 0.8%

here .2% slower.

> SDET 128 (see disclaimer)
> Throughput Std. Dev
> 2.6.5-rc1 100.0% 0.2%
> 2.6.5-rc1-partial 123.1% 0.4%
> 2.6.5-rc1-hugh 124.7% 0.7%
> 2.6.5-rc1-anon_vma 123.9% 0.3%

around 1% slower here.

overall I think for the fast path we can conclude they're at least
equally fast.

Using Christoph's techniques of splitting out the swapper_space checks
from the pagecache paths I can squeeze some more cpu cycles out of anon_vma
btw (very low prio at this time, much better to keep the patch smaller
and more robust while it's out of the mainline tree).

> For interest's sake, here's the diffprofile for kernbench from
> anon_mm to the whole -aa tree ...
>
> 3808 25386.7% find_trylock_page
> 568 2704.8% pgd_alloc
> 273 74.2% dentry_open
> 125 11.2% file_move
> 106 23.1% do_page_cache_readahead
> 64 0.5% do_anonymous_page
> ...
> -64 -1.0% __copy_to_user_ll
> -66 -12.2% .text.lock.file_table
> -72 -0.8% __d_lookup
> -78 -3.9% path_lookup
> -84 -14.9% kmap_atomic
> -92 -11.0% pte_alloc_one
> -97 -13.7% generic_file_open
> -106 -11.2% kmem_cache_free
> -121 -13.2% release_pages
> -126 -12.6% page_add_rmap
> -137 -12.9% clear_page_tables
> -212 -7.2% zap_pte_range
> -235 -100.0% radix_tree_lookup
> -239 -12.5% buffered_rmqueue
> -268 -17.8% link_path_walk
> -291 -100.0% .text.lock.filemap
> -397 -20.8% page_remove_rmap
> -398 -100.0% pgd_ctor
> -461 -21.6% do_no_page
> -669 -1.4% default_idle
> -3508 -2.5% total
> -3719 -99.4% find_get_page
>
> zap_pte_range and page_remove_rmap and do_no_page are cheaper ... are we
> setting up and tearing down pages less frequently somehow? Would be
> curious to know which patch that is ...

it's one of the -mm patches probably that boosts those bits (the
cost page_add_rmap and the page faults should be the same with both
anon-vma and anonmm). as for the regression, the pgd_alloc slowdown is
the unslabify one from andrew that releases 8 bytes per page in 32bit
archs and 16 bytes per page in 64bit archs.

My current page_t is now 36 bytes (compared to 48bytes of 2.4) in 32bit
archs, and 56bytes on 64bit archs (hope I counted right this time, Hugh
says I'm counting wrong the page_t, methinks we were looking different
source trees instead but maybe I was really counting wrong ;).

2004-03-24 16:35:59

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

> here it's not the fastest (though a 0.03 difference should be in the
> error range with an unlimited -j)
>
> overall I think for the fast path we can conclude they're at least
> equally fast.

Yup, I think they're all within the noise level between anon_mm and anon_vma,
though both are faster than partial by something at least statistically
significant (though maybe not even enough to care about for these
workloads).

> it's interesting I get 0 standard deviation. Is it possible I get lower
> standard deviation because you run it less times? just wondering. I'd
> expect SDET has a default number of passes, so I expect the answer is no
> of course.

Yeah, it does 5 passes, and throws away the fastest and slowest. So it's
only 3 it's calculating off ... I think you just got lucky with a 0.0% ;-)
That's the most stable way I found of getting consistent results.
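
(For illustration of that trimming - not the actual SDET harness, and the
pass values below are made up - a small standalone sketch that drops the
fastest and slowest of five passes and reports the mean and population
standard deviation of the middle three:)

/* Build with: cc -o trim trim.c -lm */
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

static int cmp_double(const void *a, const void *b)
{
	double x = *(const double *)a, y = *(const double *)b;
	return (x > y) - (x < y);
}

static void trimmed_stats(double pass[5], double *mean, double *stddev)
{
	double sum = 0.0, var = 0.0;
	int i;

	qsort(pass, 5, sizeof(double), cmp_double);
	for (i = 1; i <= 3; i++)	/* skip pass[0] and pass[4] */
		sum += pass[i];
	*mean = sum / 3.0;
	for (i = 1; i <= 3; i++)
		var += (pass[i] - *mean) * (pass[i] - *mean);
	*stddev = sqrt(var / 3.0);
}

int main(void)
{
	/* made-up throughput percentages for five passes */
	double pass[5] = { 119.1, 119.6, 119.6, 119.7, 120.3 };
	double mean, stddev;

	trimmed_stats(pass, &mean, &stddev);
	printf("mean %.1f%%  std dev %.1f%%\n", mean, stddev);
	return 0;
}

(With three near-identical middle passes the reported deviation rounds
down to 0.0%, which is how a lucky run can show exactly that.)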

> it's one of the -mm patches probably that boosts those bits (the
> cost page_add_rmap and the page faults should be the same with both
> anon-vma and anonmm). as for the regression, the pgd_alloc slowdown is
> the unslabify one from andrew that releases 8 bytes per page in 32bit
> archs and 16 bytes per page in 64bit archs.

OK, great ... thanks for the info. I think I'd happily pay that cost in
pgd_alloc for the space gain - kernbench & SDET are about as bad as it
gets on pgd_alloc, so that seems like a good tradeoff.

> My current page_t is now 36 bytes (compared to 48bytes of 2.4) in 32bit
> archs, and 56bytes on 64bit archs (hope I counted right this time, Hugh
> says I'm counting wrong the page_t, methinks we were looking different
> source trees instead but maybe I was really counting wrong ;).

IIRC, with PAE etc on, mainline is 44 bytes. So if we saved 8 from Andrew's
change, and 4 from objrmap, I'd be hoping for 32?

M.


2004-03-24 17:07:51

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Wed, Mar 24, 2004 at 08:35:46AM -0800, Martin J. Bligh wrote:
> > here it's not the fastest (though a 0.03 difference should be in the
> > error range with an unlimited -j)
> >
> > overall I think for the fast path we can conclude they're at least
> > equally fast.
>
> Yup, I think they're all within the noise level between anon_mm and anon_vma,
> though both are faster than partial by something at least statistically
> significant (though maybe not even enough to care about for these
> workloads).

they are, and those workloads aren't anonymous-memory intensive; that's
why the difference is not significant (see the benchmark I posted a few
days ago comparing anon_vma with mainline: the boost is huge on a 1G
box, and on a 64bit multi-gigabox the boost will be even bigger, close to
a 100% speedup in fact).

> Yeah, it does 5 passes, and throws away the fastest and slowest. So it's
> only 3 it's calculating off ... I think you just got lucky with a 0.0% ;-)

I guess so ;)

> > it's one of the -mm patches probably that boosts those bits (the
> > cost page_add_rmap and the page faults should be the same with both
> > anon-vma and anonmm). as for the regression, the pgd_alloc slowdown is
> > the unslabify one from andrew that releases 8 bytes per page in 32bit
> > archs and 16 bytes per page in 64bit archs.
>
> OK, great ... thanks for the info. I think I'd happily pay that cost in
> pgd_alloc for the space gain - kernbench & SDET are about as bad as it

that's Andrew's point too and I cannot agree more ;)

> gets on pgd_alloc, so that seems like a good tradeoff.

pgd_alloc is the fork path, so yes it's a very good tradeoff (really I
didn't check if any memory is being wasted by the bigger allocations,
but that is solvable even without the slab and without page->list by
just chaining the pages like poll does).

However Wli suggested that we shouldn't stop using the slab for those
minuscule allocations, and that the slab should stop using the lists
instead. You may want to ask Wli for details. For now I enjoy the
36-byte page_t ;)

> > My current page_t is now 36 bytes (compared to 48bytes of 2.4) in 32bit
> > archs, and 56bytes on 64bit archs (hope I counted right this time, Hugh
> > says I'm counting wrong the page_t, methinks we were looking different
> > source trees instead but maybe I was really counting wrong ;).
>
> IIRC, with PAE etc on, mainline is 44 bytes. So if we saved 8 from Andrew's

nitpick, it's not PAE but highmem that makes it worse (even with PAE off).

> change, and 4 from objrmap, I'd be hoping for 32?

I give up counting; I used the compiler this time, and yes it's 32 bytes
for every page_t of 2.6-aa compared to 48 bytes in 2.4.
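
(For illustration of "using the compiler" rather than counting fields by
hand - not part of any patch in this thread - a throwaway initcall like
the following, dropped anywhere in mm/, prints the figure for the tree
and config actually being built:)

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>

static int __init page_size_report(void)
{
	/* report the compiled size of struct page for this config */
	printk(KERN_INFO "sizeof(struct page) = %d bytes\n",
	       (int) sizeof(struct page));
	return 0;
}
__initcall(page_size_report);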

2004-03-24 20:01:08

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Wed, Mar 24, 2004 at 06:08:41PM +0100, Andrea Arcangeli wrote:
> nitpick, it's not PAE but highmem that makes it worse (even with PAE off).

Please give me a little more credit than that. This is largely over,
but when assessing it, do note:

#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G)
typedef u32 pte_addr_t;
#endif

#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G)
typedef u64 pte_addr_t;
#endif

#if !defined(CONFIG_HIGHPTE)
typedef pte_t *pte_addr_t;
#endif

Yes, I also realized that in principle one could have used PG_direct
only if the pagetable fell into the lower 32GB, or stuffed the 33rd
bit into PG_arch, and so on and so forth wrt the 33rd bit.

-- wli

2004-03-24 20:03:50

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Wed, Mar 24, 2004 at 05:21:16PM +0100, Andrea Arcangeli wrote:
> it's one of the -mm patches probably that boosts those bits (the
> cost page_add_rmap and the page faults should be the same with both
> anon-vma and anonmm). as for the regression, the pgd_alloc slowdown is
> the unslabify one from andrew that releases 8 bytes per page in 32bit
> archs and 16 bytes per page in 64bit archs.
> My current page_t is now 36 bytes (compared to 48bytes of 2.4) in 32bit
> archs, and 56bytes on 64bit archs (hope I counted right this time, Hugh
> says I'm counting wrong the page_t, methinks we were looking different
> source trees instead but maybe I was really counting wrong ;).

Don't confuse unslabify and the ->list removal. The ->list removal went
around insisting the known universe stop using ->lru because of the
relatively arbitrary choice that slab.c use ->lru. The unslabify patch
attempts to update one user of ->lru by backing out the code using it.
Do note that non-list-heads like ->index, ->private, or ->mapping are
also unused on slab pages, and could have saved some pain for this
former user of ->list had they been chosen.


-- wli

2004-03-24 20:17:52

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] anobjrmap 1/6 objrmap

On Wed, Mar 24, 2004 at 12:01:56PM -0800, William Lee Irwin III wrote:
> Don't confuse unslabify and the ->list removal. The ->list removal went
> around insisting the known universe stop using ->lru because of the
> relatively arbitrary choice that slab.c use ->lru. The unslabify patch
> attempts to update one user of ->lru by backing out the code using it.
> Do note that non-list-heads like ->index, ->private, or ->mapping are
> also unused on slab pages, and could have saved some pain for this
> former user of ->list had they been chosen.

Updating the user instead of backing it out would have looked
something like the following (totally untested, not even compile-tested):


-- wli


Index: mm2-2.6.5-rc2-1/arch/i386/mm/pageattr.c
===================================================================
--- mm2-2.6.5-rc2-1.orig/arch/i386/mm/pageattr.c 2004-03-19 16:11:06.000000000 -0800
+++ mm2-2.6.5-rc2-1/arch/i386/mm/pageattr.c 2004-03-21 07:15:40.000000000 -0800
@@ -75,7 +75,7 @@
return;

spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
+ for (page = pgd_list; page; page = (struct page *)page->index) {
pgd_t *pgd;
pmd_t *pmd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
Index: mm2-2.6.5-rc2-1/arch/i386/mm/pgtable.c
===================================================================
--- mm2-2.6.5-rc2-1.orig/arch/i386/mm/pgtable.c 2004-03-21 07:14:22.000000000 -0800
+++ mm2-2.6.5-rc2-1/arch/i386/mm/pgtable.c 2004-03-21 07:18:41.000000000 -0800
@@ -172,7 +172,27 @@
* -- wli
*/
spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
-LIST_HEAD(pgd_list);
+struct page *pgd_list;
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+ page->index = (unsigned long)pgd_list;
+ if (pgd_list)
+ pgd_list->private = (unsigned long)&page->index;
+ pgd_list = page;
+ page->private = (unsigned long)&pgd_list;
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *next, **pprev, *page = virt_to_page(pgd);
+ next = (struct page *)page->index;
+ pprev = (struct page **)page->private;
+ *pprev = next;
+ if (next)
+ next->private = (unsigned long)pprev;
+}

void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
@@ -188,7 +208,7 @@
if (PTRS_PER_PMD > 1)
return;

- list_add(&virt_to_page(pgd)->lru, &pgd_list);
+ pgd_list_add(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
}
@@ -199,7 +219,7 @@
unsigned long flags; /* can be called from interrupt context */

spin_lock_irqsave(&pgd_lock, flags);
- list_del(&virt_to_page(pgd)->lru);
+ pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}

Index: mm2-2.6.5-rc2-1/include/asm-i386/pgtable.h
===================================================================
--- mm2-2.6.5-rc2-1.orig/include/asm-i386/pgtable.h 2004-03-19 16:11:34.000000000 -0800
+++ mm2-2.6.5-rc2-1/include/asm-i386/pgtable.h 2004-03-21 08:27:30.000000000 -0800
@@ -35,7 +35,7 @@
extern kmem_cache_t *pgd_cache;
extern kmem_cache_t *pmd_cache;
extern spinlock_t pgd_lock;
-extern struct list_head pgd_list;
+extern struct page *pgd_list;

void pmd_ctor(void *, kmem_cache_t *, unsigned long);
void pgd_ctor(void *, kmem_cache_t *, unsigned long);

2004-03-26 14:30:33

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH] anobjrmap 8/6 unmap nonlinear

anobjrmap 8/6 unmap nonlinear

The six anobjrmap patches I posted last week, based 2.6.5-rc1,
left two issues outstanding. Patch 7/6 dealt with mremap move
earlier this week, now this 8/6 handles try_to_unmap on nonlinear
vmas (those to which sys_remap_file_pages has been applied).

Less draconian than Andrea's solution, which punished users of
unlocked nonlinear vmas by unmapping every pte of every nonlinear
vma of the file whenever any page of the file reached try_to_unmap
(a later version reprieves ptes with the referenced flag set, but they'll
tend to get unmapped as soon as the next page comes down). If you
find this method works well, Andrea, and you're in a mood to forgive
the users of unlocked nonlinear vmas, please grab it for your tree:
there's nothing specific to anonmm about it.

Ignoring the page requested, try to unmap a cluster of 32 neighbouring
ptes (in the worst case all empty slots) in a nonlinear vma, then move
on to the next vma; stopping when we've unmapped at least as many
maps as the requested page had (a vague guide of how hard to try),
or have reached the end. Use vm_private_data a little like the old
mm->swap_address, as a cursor recording how far we got, so we don't
attack the same ptes next time around (I earlier tried inserting an
empty marker vma in the list, but that got messy).

Existing users of vm_private_data have either VM_RESERVED or
VM_DONTEXPAND set, both of which are in the VM_SPECIAL category
where we never try to merge vmas: so removed vm_private_data
test from is_mergeable_vma, so we can still merge VM_NONLINEARs.
Of course, we could instead add another field to vm_area_struct.

In addition, page_referenced now reports the page as unreferenced if it
cannot get the semaphore or spinlock, so try_to_unmap will have a try
at it (but checks the referenced flag): following Andrea, to avoid the
shm livelock he encountered.

fremap.c | 5 +
mmap.c | 2
rmap.c | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
3 files changed, 219 insertions(+), 38 deletions(-)

--- anobjrmap7/mm/fremap.c 2004-03-17 22:00:34.000000000 +0000
+++ anobjrmap8/mm/fremap.c 2004-03-26 13:33:21.504036832 +0000
@@ -186,9 +186,12 @@ asmlinkage long sys_remap_file_pages(uns
/*
* Make sure the vma is shared, that it supports prefaulting,
* and that the remapped range is valid and fully within
- * the single existing vma:
+ * the single existing vma. vm_private_data is used as a
+ * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
+ * or VM_LOCKED, but VM_LOCKED could be revoked later on).
*/
if (vma && (vma->vm_flags & VM_SHARED) &&
+ (!vma->vm_private_data || (vma->vm_flags & VM_RESERVED)) &&
vma->vm_ops && vma->vm_ops->populate &&
end > start && start >= vma->vm_start &&
end <= vma->vm_end) {
--- anobjrmap7/mm/mmap.c 2004-03-25 21:42:20.698394304 +0000
+++ anobjrmap8/mm/mmap.c 2004-03-26 13:33:21.505036680 +0000
@@ -331,8 +331,6 @@ static inline int is_mergeable_vma(struc
return 0;
if (vma->vm_flags != vm_flags)
return 0;
- if (vma->vm_private_data)
- return 0;
return 1;
}

--- anobjrmap7/mm/rmap.c 2004-03-25 21:42:20.703393544 +0000
+++ anobjrmap8/mm/rmap.c 2004-03-26 13:33:21.509036072 +0000
@@ -182,10 +182,8 @@ static int page_referenced_one(struct pa
pte_t *pte;
int referenced = 0;

- if (!spin_trylock(&mm->page_table_lock)) {
- referenced++;
- goto out;
- }
+ if (!spin_trylock(&mm->page_table_lock))
+ return 0;

pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -212,8 +210,6 @@ out_unmap:

out_unlock:
spin_unlock(&mm->page_table_lock);
-
-out:
return referenced;
}

@@ -267,7 +263,7 @@ out:
* This function is only called from page_referenced for object-based pages.
*
* The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * assume a reference count of 1.
+ * assume a reference count of 0, so try_to_unmap will then have a go.
*/
static inline int page_referenced_obj(struct page *page, int *mapcount)
{
@@ -277,30 +273,39 @@ static inline int page_referenced_obj(st
int referenced = 0;

if (down_trylock(&mapping->i_shared_sem))
- return 1;
+ return 0;

list_for_each_entry(vma, &mapping->i_mmap, shared) {
if (!vma->vm_mm->rss)
continue;
address = vma_address(page, vma);
- if (address != NOADDR) {
- referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
- goto out;
+ if (address == NOADDR)
+ continue;
+ if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) ==
+ (VM_LOCKED|VM_MAYSHARE)) {
+ referenced++;
+ goto out;
}
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
}

list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (!vma->vm_mm->rss)
+ if (!vma->vm_mm->rss || (vma->vm_flags & VM_NONLINEAR))
continue;
address = vma_address(page, vma);
- if (address != NOADDR) {
- referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
- goto out;
+ if (address == NOADDR)
+ continue;
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
+ referenced++;
+ goto out;
}
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
}
out:
up(&mapping->i_shared_sem);
@@ -483,13 +488,21 @@ static int try_to_unmap_one(struct page

(*mapcount)--;

+ if (!vma) {
+ vma = find_vma(mm, address);
+ /* unmap_vmas drops page_table_lock with vma unlinked */
+ if (!vma)
+ goto out_unmap;
+ }
+
/*
* If the page is mlock()d, we cannot swap it out.
+ * If it's recently referenced (perhaps page_referenced
+ * skipped over this mm) then we should reactivate it.
*/
- if (!vma)
- vma = find_vma(mm, address);
- if (!vma || (vma->vm_flags & (VM_LOCKED|VM_RESERVED))) {
- ret = SWAP_FAIL;
+ if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
+ ptep_test_and_clear_young(pte)) {
+ ret = SWAP_FAIL;
goto out_unmap;
}

@@ -528,6 +541,100 @@ out:
return ret;
}

+/*
+ * try_to_unmap_cluster is only used on VM_NONLINEAR shared object vmas,
+ * in which objrmap is unable to predict where a page will be found.
+ */
+#define CLUSTER_SIZE (32 * PAGE_SIZE)
+#if CLUSTER_SIZE > PMD_SIZE
+#undef CLUSTER_SIZE
+#define CLUSTER_SIZE PMD_SIZE
+#endif
+#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
+
+static int try_to_unmap_cluster(struct mm_struct *mm,
+ unsigned long cursor, int *mapcount, struct vm_area_struct *vma)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ pte_t pteval;
+ struct page *page;
+ unsigned long address;
+ unsigned long end;
+ unsigned long pfn;
+ unsigned long pgidx;
+
+ /*
+ * We need the page_table_lock to protect us from page faults,
+ * munmap, fork, etc...
+ */
+ if (!spin_trylock(&mm->page_table_lock))
+ return SWAP_FAIL;
+
+ address = (vma->vm_start + cursor) & CLUSTER_MASK;
+ end = address + CLUSTER_SIZE;
+ if (address < vma->vm_start)
+ address = vma->vm_start;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out_unlock;
+
+ pmd = pmd_offset(pgd, address);
+ if (!pmd_present(*pmd))
+ goto out_unlock;
+
+ for (pte = pte_offset_map(pmd, address);
+ address < end; pte++, address += PAGE_SIZE) {
+
+ if (!pte_present(*pte))
+ continue;
+
+ pfn = pte_pfn(*pte);
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ BUG_ON(PageAnon(page));
+ if (PageReserved(page))
+ continue;
+
+ if (ptep_test_and_clear_young(pte))
+ continue;
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address);
+ pteval = ptep_clear_flush(vma, address, pte);
+
+ /* If nonlinear, store the file page offset in the pte. */
+ pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
+ pgidx += vma->vm_pgoff;
+ pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
+ if (page->index != pgidx) {
+ set_pte(pte, pgoff_to_pte(page->index));
+ BUG_ON(!pte_file(*pte));
+ }
+
+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ page_remove_rmap(page);
+ page_cache_release(page);
+ mm->rss--;
+ (*mapcount)--;
+ }
+
+ pte_unmap(pte);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ return SWAP_AGAIN;
+}
+
static inline int try_to_unmap_anon(struct page *page, int *mapcount)
{
struct anonmm *anonmm = (struct anonmm *) page->mapping;
@@ -584,6 +691,9 @@ static inline int try_to_unmap_obj(struc
struct vm_area_struct *vma;
unsigned long address;
int ret = SWAP_AGAIN;
+ unsigned long cursor;
+ unsigned long max_nl_cursor = 0;
+ unsigned long max_nl_size = 0;

if (down_trylock(&mapping->i_shared_sem))
return ret;
@@ -592,26 +702,96 @@ static inline int try_to_unmap_obj(struc
if (!vma->vm_mm->rss)
continue;
address = vma_address(page, vma);
- if (address != NOADDR) {
- ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
- }
+ if (address == NOADDR)
+ continue;
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
}

list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+ /*
+ * Defer unmapping nonlinear to the next loop,
+ * but take notes while we're here e.g. don't
+ * want to loop again when no nonlinear vmas.
+ */
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+ continue;
+ cursor = (unsigned long) vma->vm_private_data;
+ if (cursor > max_nl_cursor)
+ max_nl_cursor = cursor;
+ cursor = vma->vm_end - vma->vm_start;
+ if (cursor > max_nl_size)
+ max_nl_size = cursor;
+ continue;
+ }
if (!vma->vm_mm->rss)
continue;
address = vma_address(page, vma);
- if (address != NOADDR) {
- ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
- }
+ if (address == NOADDR)
+ continue;
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
}

+ if (max_nl_size == 0) /* no nonlinear vmas of this file */
+ goto out;
+
+ /*
+ * We don't try to search for this page in the nonlinear vmas,
+ * and page_referenced wouldn't have found it anyway. Instead
+ * just walk the nonlinear vmas trying to age and unmap some.
+ * The mapcount of the page we came in with is irrelevant,
+ * but even so use it as a guide to how hard we should try?
+ */
+ rmap_unlock(page);
+
+ max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
+ if (max_nl_cursor == 0)
+ max_nl_cursor = CLUSTER_SIZE;
+
+ do {
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ if (VM_NONLINEAR != (vma->vm_flags &
+ (VM_NONLINEAR|VM_LOCKED|VM_RESERVED)))
+ continue;
+ cursor = (unsigned long) vma->vm_private_data;
+ while (vma->vm_mm->rss &&
+ cursor < max_nl_cursor &&
+ cursor < vma->vm_end - vma->vm_start) {
+ ret = try_to_unmap_cluster(vma->vm_mm,
+ cursor, mapcount, vma);
+ if (ret == SWAP_FAIL)
+ break;
+ cursor += CLUSTER_SIZE;
+ vma->vm_private_data = (void *) cursor;
+ if (*mapcount <= 0)
+ goto relock;
+ }
+ if (ret != SWAP_FAIL)
+ vma->vm_private_data =
+ (void *) max_nl_cursor;
+ ret = SWAP_AGAIN;
+ }
+ max_nl_cursor += CLUSTER_SIZE;
+ } while (max_nl_cursor <= max_nl_size);
+
+ /*
+ * Don't loop forever (perhaps all the remaining pages are
+ * in locked vmas). Reset cursor on all unreserved nonlinear
+ * vmas, now forgetting on which ones it had fallen behind.
+ */
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ if ((vma->vm_flags & (VM_NONLINEAR|VM_RESERVED)) ==
+ VM_NONLINEAR)
+ vma->vm_private_data = 0;
+ }
+relock:
+ rmap_lock(page);
out:
up(&mapping->i_shared_sem);
return ret;