First of a sequence of six patches, extending Dave McCracken's
objrmap to handle anonymous memory too, eliminating pte_chains.
Based upon 2.5.65-mm2, the aggregate has
81 files changed, 1140 insertions(+), 1634 deletions(-)
anobjrmap 1/6 create include/linux/rmap.h
anobjrmap 2/6 free page->mapping for use by anon
anobjrmap 3/6 remove pte-pointer-based rmap
anobjrmap 4/6 add anonmm to track anonymous pages
anobjrmap 5/6 virtual address chains for odd cases
anobjrmap 6/6 updates to arches other than i386
I've not done any timings; I hope others can do that better than
I would. My guess is that Dave has already covered the worst
cases, but this should cut the rmap overhead when forking, since
fork's copy_page_range now just bumps a mapcount instead of
allocating and threading pte_chains.
anobjrmap 1/6 create include/linux/rmap.h
Start small: linux/rmap-locking.h had already gathered some
declarations unrelated to locking, and the rest of the rmap
declarations were over in linux/swap.h: gather them all
together in linux/rmap.h.
Omit SWAP_ERROR (unused), page_over_rsslimit (non-existent).
Fix a couple of missed unlocks in rmap.c page_convert_anon,
before the whole function is removed in the next patch.
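To spell out those fixes (a condensed sketch of the corrected
flow as I read the -mm2 body, most of it elided; the real hunks
are in mm/rmap.c below):

	retry:
		down(&mapping->i_shared_sem);
		pte_chain_lock(page);
		/* ... count the ptes mapping this page into mapcount ... */
		if (mapcount < page->pte.mapcount) {
			pte_chain_unlock(page);
			up(&mapping->i_shared_sem);	/* was missed: retry re-takes it */
			goto retry;
		}
		/* ... */
		if (mapcount == 0)
			goto out_unlock;	/* was "goto out", skipping the up() */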
fs/exec.c | 2 -
include/linux/rmap-locking.h | 49 ----------------------------
include/linux/rmap.h | 73 +++++++++++++++++++++++++++++++++++++++++++
include/linux/swap.h | 19 -----------
mm/fremap.c | 2 -
mm/memory.c | 2 -
mm/mremap.c | 2 -
mm/rmap.c | 9 +----
mm/swapfile.c | 2 -
mm/vmscan.c | 3 -
10 files changed, 82 insertions(+), 81 deletions(-)
--- 2.5.65-mm2/fs/exec.c Wed Mar 19 11:05:11 2003
+++ anobjrmap1/fs/exec.c Thu Mar 20 17:09:50 2003
@@ -45,7 +45,7 @@
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/security.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
--- 2.5.65-mm2/include/linux/rmap-locking.h Wed Mar 19 11:05:16 2003
+++ anobjrmap1/include/linux/rmap-locking.h Thu Jan 1 01:00:00 1970
@@ -1,49 +0,0 @@
-/*
- * include/linux/rmap-locking.h
- *
- * Locking primitives for exclusive access to a page's reverse-mapping
- * pte chain.
- */
-
-#include <linux/slab.h>
-
-struct pte_chain;
-extern kmem_cache_t *pte_chain_cache;
-
-static inline void pte_chain_lock(struct page *page)
-{
- /*
- * Assuming the lock is uncontended, this never enters
- * the body of the outer loop. If it is contended, then
- * within the inner loop a non-atomic test is used to
- * busywait with less bus contention for a good time to
- * attempt to acquire the lock bit.
- */
- preempt_disable();
-#ifdef CONFIG_SMP
- while (test_and_set_bit(PG_chainlock, &page->flags)) {
- while (test_bit(PG_chainlock, &page->flags))
- cpu_relax();
- }
-#endif
-}
-
-static inline void pte_chain_unlock(struct page *page)
-{
-#ifdef CONFIG_SMP
- smp_mb__before_clear_bit();
- clear_bit(PG_chainlock, &page->flags);
-#endif
- preempt_enable();
-}
-
-struct pte_chain *pte_chain_alloc(int gfp_flags);
-void __pte_chain_free(struct pte_chain *pte_chain);
-
-static inline void pte_chain_free(struct pte_chain *pte_chain)
-{
- if (pte_chain)
- __pte_chain_free(pte_chain);
-}
-
-void page_convert_anon(struct page *page);
--- 2.5.65-mm2/include/linux/rmap.h Thu Jan 1 01:00:00 1970
+++ anobjrmap1/include/linux/rmap.h Thu Mar 20 17:09:50 2003
@@ -0,0 +1,73 @@
+#ifndef _LINUX_RMAP_H
+#define _LINUX_RMAP_H
+/*
+ * Declarations for Reverse Mapping functions in mm/rmap.c
+ * Its structures are declared within that file.
+ */
+#include <linux/config.h>
+#include <linux/linkage.h>
+
+#ifdef CONFIG_MMU
+
+struct pte_chain;
+struct pte_chain *pte_chain_alloc(int gfp_flags);
+void __pte_chain_free(struct pte_chain *pte_chain);
+
+static inline void pte_chain_free(struct pte_chain *pte_chain)
+{
+ if (pte_chain)
+ __pte_chain_free(pte_chain);
+}
+
+struct pte_chain *FASTCALL(
+ page_add_rmap(struct page *, pte_t *, struct pte_chain *));
+void FASTCALL(page_remove_rmap(struct page *, pte_t *));
+void page_convert_anon(struct page *page);
+
+/*
+ * Called from mm/vmscan.c to handle paging out
+ */
+int FASTCALL(page_referenced(struct page *));
+int FASTCALL(try_to_unmap(struct page *));
+
+/*
+ * Return values of try_to_unmap
+ */
+#define SWAP_SUCCESS 0
+#define SWAP_AGAIN 1
+#define SWAP_FAIL 2
+
+#else /* !CONFIG_MMU */
+
+#define page_referenced(page) TestClearPageReferenced(page)
+
+#endif /* CONFIG_MMU */
+
+static inline void pte_chain_lock(struct page *page)
+{
+ /*
+ * Assuming the lock is uncontended, this never enters
+ * the body of the outer loop. If it is contended, then
+ * within the inner loop a non-atomic test is used to
+ * busywait with less bus contention for a good time to
+ * attempt to acquire the lock bit.
+ */
+ preempt_disable();
+#ifdef CONFIG_SMP
+ while (test_and_set_bit(PG_chainlock, &page->flags)) {
+ while (test_bit(PG_chainlock, &page->flags))
+ cpu_relax();
+ }
+#endif
+}
+
+static inline void pte_chain_unlock(struct page *page)
+{
+#ifdef CONFIG_SMP
+ smp_mb__before_clear_bit();
+ clear_bit(PG_chainlock, &page->flags);
+#endif
+ preempt_enable();
+}
+
+#endif /* _LINUX_RMAP_H */
--- 2.5.65-mm2/include/linux/swap.h Wed Mar 5 07:26:34 2003
+++ anobjrmap1/include/linux/swap.h Thu Mar 20 17:09:50 2003
@@ -69,7 +69,6 @@
#ifdef __KERNEL__
struct address_space;
-struct pte_chain;
struct sysinfo;
struct writeback_control;
struct zone;
@@ -167,27 +166,9 @@
extern int shrink_all_memory(int);
extern int vm_swappiness;
-/* linux/mm/rmap.c */
#ifdef CONFIG_MMU
-int FASTCALL(page_referenced(struct page *));
-struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *,
- struct pte_chain *));
-void FASTCALL(page_remove_rmap(struct page *, pte_t *));
-int FASTCALL(try_to_unmap(struct page *));
-int FASTCALL(page_over_rsslimit(struct page *));
-
-/* return values of try_to_unmap */
-#define SWAP_SUCCESS 0
-#define SWAP_AGAIN 1
-#define SWAP_FAIL 2
-#define SWAP_ERROR 3
-
/* linux/mm/shmem.c */
extern int shmem_unuse(swp_entry_t entry, struct page *page);
-
-#else
-#define page_referenced(page) \
- TestClearPageReferenced(page)
#endif /* CONFIG_MMU */
#ifdef CONFIG_SWAP
--- 2.5.65-mm2/mm/fremap.c Wed Mar 19 11:05:16 2003
+++ anobjrmap1/mm/fremap.c Thu Mar 20 17:09:50 2003
@@ -11,7 +11,7 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
--- 2.5.65-mm2/mm/memory.c Wed Mar 19 11:05:16 2003
+++ anobjrmap1/mm/memory.c Thu Mar 20 17:09:50 2003
@@ -44,7 +44,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/vcache.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <asm/pgalloc.h>
#include <asm/rmap.h>
--- 2.5.65-mm2/mm/mremap.c Tue Mar 18 07:38:45 2003
+++ anobjrmap1/mm/mremap.c Thu Mar 20 17:09:50 2003
@@ -15,7 +15,7 @@
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/highmem.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
--- 2.5.65-mm2/mm/rmap.c Wed Mar 19 11:05:16 2003
+++ anobjrmap1/mm/rmap.c Thu Mar 20 17:09:50 2003
@@ -25,7 +25,7 @@
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <linux/cache.h>
#include <linux/percpu.h>
@@ -677,7 +677,6 @@
* SWAP_SUCCESS - we succeeded in removing all mappings
* SWAP_AGAIN - we missed a trylock, try again later
* SWAP_FAIL - the page is unswappable
- * SWAP_ERROR - an error occurred
*/
int try_to_unmap(struct page * page)
{
@@ -754,9 +753,6 @@
case SWAP_FAIL:
ret = SWAP_FAIL;
goto out;
- case SWAP_ERROR:
- ret = SWAP_ERROR;
- goto out;
}
}
}
@@ -812,6 +808,7 @@
*/
if (mapcount < page->pte.mapcount) {
pte_chain_unlock(page);
+ up(&mapping->i_shared_sem);
goto retry;
} else if ((mapcount > page->pte.mapcount) && (mapcount > 1)) {
mapcount = page->pte.mapcount;
@@ -827,7 +824,7 @@
SetPageAnon(page);
if (mapcount == 0)
- goto out;
+ goto out_unlock;
else if (mapcount == 1) {
SetPageDirect(page);
page->pte.direct = 0;
--- 2.5.65-mm2/mm/swapfile.c Wed Mar 19 11:05:16 2003
+++ anobjrmap1/mm/swapfile.c Thu Mar 20 17:09:50 2003
@@ -20,7 +20,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <asm/pgtable.h>
#include <linux/swapops.h>
--- 2.5.65-mm2/mm/vmscan.c Tue Feb 18 02:14:32 2003
+++ anobjrmap1/mm/vmscan.c Thu Mar 20 17:09:50 2003
@@ -26,7 +26,7 @@
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -284,7 +284,6 @@
*/
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page)) {
- case SWAP_ERROR:
case SWAP_FAIL:
pte_chain_unlock(page);
goto activate_locked;
On Thu, Mar 20, 2003 at 11:15:06PM +0000, Hugh Dickins wrote:
> Removed nr_reverse_maps, ReverseMaps: easily reverted if that
> poses a vmstat or meminfo compatibility problem, or someone is
> still interested in that number; but objrmap wasn't maintaining
> it, and if they don't occupy space, is it worth showing?
> Besides, look at page_dup_rmap for copy_page_range: I don't
> want to clutter that with inc_page_state(nr_reverse_maps).
It was mostly to determine space savings and internal fragmentation
on the pte_chain objects. It also helps get some notion of internal
fragmentation on pagetables. It's of low importance; delete at will.
-- wli
anobjrmap 6/6 updates to arches other than i386
Some arches refer to page->mapping for their cache flushing;
generally use page_mapping instead: it appears that they're
coping with shared pagecache issues, rather than anon swap.
Change put_dirty_page(current,,) to put_stack_page(mpnt,,).
No special page table initialization needed for rmap.
Delete pte_addr_t from asm/pgtable.h. Delete asm/rmap.h.
There's some peculiar __users code in asm-s390*/pgtable.h;
it looks bogus to me (those counts are included in page_count),
so I've just deleted it, but should confirm with Martin. And a
fix in asm-sh/pgalloc.h: use list_empty on i_mmap_shared.
No i386 files in this patch: but note that with pte_addr_t
gone, its PAE struct page is the same size as without PAE.
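These arch changes lean on the page_mapping helper added in 2/6
(not repeated here); a minimal sketch of its shape, assuming
PageAnon is what distinguishes the overloaded field:

	static inline struct address_space *page_mapping(struct page *page)
	{
		/* anon pages keep an anonmm in page->mapping: hide it */
		return PageAnon(page)? NULL: page->mapping;
	}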
arch/arm/mm/fault-armv.c | 4 ++--
arch/arm/mm/mm-armv.c | 3 +--
arch/ia64/ia32/binfmt_elf32.c | 3 +--
arch/ia64/mm/init.c | 2 +-
arch/parisc/kernel/cache.c | 4 ++--
arch/ppc/mm/init.c | 12 ------------
arch/s390x/kernel/exec32.c | 4 +---
arch/sparc64/kernel/smp.c | 8 ++++----
arch/sparc64/mm/init.c | 12 ++++++------
arch/sparc64/mm/ultra.S | 2 +-
arch/x86_64/ia32/ia32_binfmt.c | 5 +----
include/asm-alpha/pgtable.h | 2 --
include/asm-alpha/rmap.h | 7 -------
include/asm-arm/pgtable.h | 2 --
include/asm-arm/proc-armv/cache.h | 4 ++--
include/asm-arm/rmap.h | 6 ------
include/asm-cris/pgtable.h | 2 --
include/asm-cris/rmap.h | 7 -------
include/asm-ia64/pgtable.h | 2 --
include/asm-ia64/rmap.h | 7 -------
include/asm-m68k/pgtable.h | 2 --
include/asm-m68k/rmap.h | 7 -------
include/asm-m68knommu/pgtable.h | 2 --
include/asm-m68knommu/rmap.h | 2 --
include/asm-mips/pgtable.h | 2 --
include/asm-mips/rmap.h | 7 -------
include/asm-mips64/pgtable.h | 2 --
include/asm-mips64/rmap.h | 7 -------
include/asm-parisc/cacheflush.h | 2 +-
include/asm-parisc/pgtable.h | 2 --
include/asm-parisc/rmap.h | 7 -------
include/asm-ppc/pgtable.h | 2 --
include/asm-ppc/rmap.h | 9 ---------
include/asm-ppc64/pgtable.h | 2 --
include/asm-ppc64/rmap.h | 9 ---------
include/asm-s390/pgtable.h | 8 ++------
include/asm-s390/rmap.h | 7 -------
include/asm-s390x/pgtable.h | 8 ++------
include/asm-s390x/rmap.h | 7 -------
include/asm-sh/pgalloc.h | 2 +-
include/asm-sh/pgtable.h | 2 --
include/asm-sh/rmap.h | 7 -------
include/asm-sparc/pgtable.h | 2 --
include/asm-sparc/rmap.h | 7 -------
include/asm-sparc64/pgtable.h | 2 --
include/asm-sparc64/rmap.h | 7 -------
include/asm-um/pgtable.h | 12 ------------
include/asm-um/rmap.h | 6 ------
include/asm-v850/pgtable.h | 2 --
include/asm-v850/rmap.h | 1 -
include/asm-x86_64/pgtable.h | 2 --
include/asm-x86_64/rmap.h | 7 -------
52 files changed, 28 insertions(+), 223 deletions(-)
--- anobjrmap5/arch/arm/mm/fault-armv.c Mon Nov 18 06:02:39 2002
+++ anobjrmap6/arch/arm/mm/fault-armv.c Thu Mar 20 17:10:45 2003
@@ -188,7 +188,7 @@
cpu_cache_clean_invalidate_range(kaddr, kaddr + PAGE_SIZE, 0);
- if (!page->mapping)
+ if (!page_mapping(page))
return;
/*
@@ -289,7 +289,7 @@
if (!pfn_valid(pfn))
return;
page = pfn_to_page(pfn);
- if (page->mapping) {
+ if (page_mapping(page)) {
int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags);
unsigned long kaddr = (unsigned long)page_address(page);
--- anobjrmap5/arch/arm/mm/mm-armv.c Mon Nov 18 06:02:39 2002
+++ anobjrmap6/arch/arm/mm/mm-armv.c Thu Mar 20 17:10:45 2003
@@ -17,7 +17,6 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
-#include <asm/rmap.h>
#include <asm/io.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
@@ -151,7 +150,7 @@
pte = pmd_page(*pmd);
pmd_clear(pmd);
- pgtable_remove_rmap(pte);
+ dec_page_state(nr_page_table_pages);
pte_free(pte);
pmd_free(pmd);
free:
--- anobjrmap5/arch/ia64/ia32/binfmt_elf32.c Mon Feb 10 20:12:34 2003
+++ anobjrmap6/arch/ia64/ia32/binfmt_elf32.c Thu Mar 20 17:10:45 2003
@@ -40,7 +40,6 @@
#define CLOCKS_PER_SEC IA32_CLOCKS_PER_SEC
extern void ia64_elf32_init (struct pt_regs *regs);
-extern void put_dirty_page (struct task_struct * tsk, struct page *page, unsigned long address);
static void elf32_set_personality (void);
@@ -200,7 +199,7 @@
struct page *page = bprm->page[i];
if (page) {
bprm->page[i] = NULL;
- put_dirty_page(current, page, stack_base);
+ put_stack_page(mpnt, page, stack_base);
}
stack_base += PAGE_SIZE;
}
--- anobjrmap5/arch/ia64/mm/init.c Mon Feb 10 20:12:35 2003
+++ anobjrmap6/arch/ia64/mm/init.c Thu Mar 20 17:10:45 2003
@@ -223,7 +223,7 @@
}
/*
- * This is like put_dirty_page() but installs a clean page with PAGE_GATE protection
+ * This is like put_stack_page() but installs a clean page with PAGE_GATE protection
* (execute-only, typically).
*/
struct page *
--- anobjrmap5/arch/parisc/kernel/cache.c Wed Mar 19 11:05:09 2003
+++ anobjrmap6/arch/parisc/kernel/cache.c Thu Mar 20 17:10:45 2003
@@ -64,7 +64,7 @@
{
struct page *page = pte_page(pte);
- if (VALID_PAGE(page) && page->mapping &&
+ if (VALID_PAGE(page) && page_mapping(page) &&
test_bit(PG_dcache_dirty, &page->flags)) {
flush_kernel_dcache_page(page_address(page));
@@ -230,7 +230,7 @@
flush_kernel_dcache_page(page_address(page));
- if (!page->mapping)
+ if (!page_mapping(page))
return;
list_for_each(l, &page->mapping->i_mmap_shared) {
--- anobjrmap5/arch/ppc/mm/init.c Tue Mar 18 07:38:32 2003
+++ anobjrmap6/arch/ppc/mm/init.c Thu Mar 20 17:10:45 2003
@@ -472,18 +472,6 @@
printk(KERN_INFO "AGP special page: 0x%08lx\n", agp_special_page);
#endif /* defined(CONFIG_ALL_PPC) */
- /* Make sure all our pagetable pages have page->mapping
- and page->index set correctly. */
- for (addr = KERNELBASE; addr != 0; addr += PGDIR_SIZE) {
- struct page *pg;
- pmd_t *pmd = pmd_offset(pgd_offset_k(addr), addr);
- if (pmd_present(*pmd)) {
- pg = pmd_page(*pmd);
- pg->mapping = (void *) &init_mm;
- pg->index = addr;
- }
- }
-
mem_init_done = 1;
}
--- anobjrmap5/arch/s390x/kernel/exec32.c Wed Mar 5 07:26:17 2003
+++ anobjrmap6/arch/s390x/kernel/exec32.c Thu Mar 20 17:10:45 2003
@@ -33,8 +33,6 @@
#endif
-extern void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address);
-
#undef STACK_TOP
#define STACK_TOP TASK31_SIZE
@@ -82,7 +80,7 @@
struct page *page = bprm->page[i];
if (page) {
bprm->page[i] = NULL;
- put_dirty_page(current,page,stack_base);
+ put_stack_page(mpnt,page,stack_base);
}
stack_base += PAGE_SIZE;
}
--- anobjrmap5/arch/sparc64/kernel/smp.c Tue Mar 18 07:38:33 2003
+++ anobjrmap6/arch/sparc64/kernel/smp.c Thu Mar 20 17:10:45 2003
@@ -750,9 +750,9 @@
#if (L1DCACHE_SIZE > PAGE_SIZE)
__flush_dcache_page(page->virtual,
((tlb_type == spitfire) &&
- page->mapping != NULL));
+ page_mapping(page) != NULL));
#else
- if (page->mapping != NULL &&
+ if (page_mapping(page) != NULL &&
tlb_type == spitfire)
__flush_icache_page(__pa(page->virtual));
#endif
@@ -773,7 +773,7 @@
if (tlb_type == spitfire) {
data0 =
((u64)&xcall_flush_dcache_page_spitfire);
- if (page->mapping != NULL)
+ if (page_mapping(page) != NULL)
data0 |= ((u64)1 << 32);
spitfire_xcall_deliver(data0,
__pa(page->virtual),
@@ -804,7 +804,7 @@
goto flush_self;
if (tlb_type == spitfire) {
data0 = ((u64)&xcall_flush_dcache_page_spitfire);
- if (page->mapping != NULL)
+ if (page_mapping(page) != NULL)
data0 |= ((u64)1 << 32);
spitfire_xcall_deliver(data0,
__pa(page->virtual),
--- anobjrmap5/arch/sparc64/mm/init.c Mon Jan 13 19:31:43 2003
+++ anobjrmap6/arch/sparc64/mm/init.c Thu Mar 20 17:10:45 2003
@@ -129,9 +129,9 @@
#if (L1DCACHE_SIZE > PAGE_SIZE)
__flush_dcache_page(page->virtual,
((tlb_type == spitfire) &&
- page->mapping != NULL));
+ page_mapping(page) != NULL));
#else
- if (page->mapping != NULL &&
+ if (page_mapping(page) != NULL &&
tlb_type == spitfire)
__flush_icache_page(__pa(page->virtual));
#endif
@@ -193,7 +193,7 @@
pfn = pte_pfn(pte);
if (pfn_valid(pfn) &&
- (page = pfn_to_page(pfn), page->mapping) &&
+ (page = pfn_to_page(pfn), page_mapping(page)) &&
((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) {
int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL));
@@ -217,7 +217,7 @@
int dirty = test_bit(PG_dcache_dirty, &page->flags);
int dirty_cpu = dcache_dirty_cpu(page);
- if (page->mapping &&
+ if (page_mapping(page) &&
list_empty(&page->mapping->i_mmap) &&
list_empty(&page->mapping->i_mmap_shared)) {
if (dirty) {
@@ -227,7 +227,7 @@
}
set_dcache_dirty(page);
} else {
- /* We could delay the flush for the !page->mapping
+ /* We could delay the flush for the !page_mapping
* case too. But that case is for exec env/arg
* pages and those are %99 certainly going to get
* faulted into the tlb (and thus flushed) anyways.
@@ -269,7 +269,7 @@
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
- if (PageReserved(page) || !page->mapping)
+ if (PageReserved(page) || !page_mapping(page))
continue;
pgaddr = (unsigned long) page_address(page);
uaddr = address + offset;
--- anobjrmap5/arch/sparc64/mm/ultra.S Mon Feb 24 20:03:29 2003
+++ anobjrmap6/arch/sparc64/mm/ultra.S Thu Mar 20 17:10:45 2003
@@ -615,7 +615,7 @@
.globl xcall_flush_dcache_page_spitfire
xcall_flush_dcache_page_spitfire: /* %g1 == physical page address
%g7 == kernel page virtual address
- %g5 == (page->mapping != NULL) */
+ %g5 == (page_mapping != NULL) */
#if (L1DCACHE_SIZE > PAGE_SIZE)
srlx %g1, (13 - 2), %g1 ! Form tag comparitor
sethi %hi(L1DCACHE_SIZE), %g3 ! D$ size == 16K
--- anobjrmap5/arch/x86_64/ia32/ia32_binfmt.c Wed Mar 19 11:05:09 2003
+++ anobjrmap6/arch/x86_64/ia32/ia32_binfmt.c Thu Mar 20 17:10:45 2003
@@ -272,9 +272,6 @@
set_thread_flag(TIF_IA32);
}
-extern void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address);
-
-
int setup_arg_pages(struct linux_binprm *bprm)
{
unsigned long stack_base;
@@ -319,7 +316,7 @@
struct page *page = bprm->page[i];
if (page) {
bprm->page[i] = NULL;
- put_dirty_page(current,page,stack_base);
+ put_stack_page(mpnt,page,stack_base);
}
stack_base += PAGE_SIZE;
}
--- anobjrmap5/include/asm-alpha/pgtable.h Tue Mar 18 07:38:41 2003
+++ anobjrmap6/include/asm-alpha/pgtable.h Thu Mar 20 17:10:45 2003
@@ -343,6 +343,4 @@
/* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */
#define HAVE_ARCH_UNMAPPED_AREA
-typedef pte_t *pte_addr_t;
-
#endif /* _ALPHA_PGTABLE_H */
--- anobjrmap5/include/asm-alpha/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-alpha/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _ALPHA_RMAP_H
-#define _ALPHA_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-arm/pgtable.h Tue Mar 18 07:38:42 2003
+++ anobjrmap6/include/asm-arm/pgtable.h Thu Mar 20 17:10:45 2003
@@ -162,8 +162,6 @@
#define io_remap_page_range(vma,from,phys,size,prot) \
remap_page_range(vma,from,phys,size,prot)
-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */
#endif /* _ASMARM_PGTABLE_H */
--- anobjrmap5/include/asm-arm/proc-armv/cache.h Mon Nov 18 06:02:49 2002
+++ anobjrmap6/include/asm-arm/proc-armv/cache.h Thu Mar 20 17:10:45 2003
@@ -81,7 +81,7 @@
* flush_dcache_page is used when the kernel has written to the page
* cache page at virtual address page->virtual.
*
- * If this page isn't mapped (ie, page->mapping = NULL), or it has
+ * If this page isn't mapped (ie, page_mapping == NULL), or it has
* userspace mappings (page->mapping->i_mmap or page->mapping->i_mmap_shared)
* then we _must_ always clean + invalidate the dcache entries associated
* with the kernel mapping.
@@ -97,7 +97,7 @@
static inline void flush_dcache_page(struct page *page)
{
- if (page->mapping && !mapping_mapped(page->mapping))
+ if (page_mapping(page) && !mapping_mapped(page->mapping))
set_bit(PG_dcache_dirty, &page->flags);
else
__flush_dcache_page(page);
--- anobjrmap5/include/asm-arm/rmap.h Thu Aug 1 23:58:27 2002
+++ anobjrmap6/include/asm-arm/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,6 +0,0 @@
-#ifndef _ARM_RMAP_H
-#define _ARM_RMAP_H
-
-#include <asm-generic/rmap.h>
-
-#endif /* _ARM_RMAP_H */
--- anobjrmap5/include/asm-cris/pgtable.h Mon Sep 16 05:51:50 2002
+++ anobjrmap6/include/asm-cris/pgtable.h Thu Mar 20 17:10:45 2003
@@ -515,6 +515,4 @@
*/
#define pgtable_cache_init() do { } while (0)
-typedef pte_t *pte_addr_t;
-
#endif /* _CRIS_PGTABLE_H */
--- anobjrmap5/include/asm-cris/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-cris/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _CRIS_RMAP_H
-#define _CRIS_RMAP_H
-
-/* nothing to see, move along :) */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-ia64/pgtable.h Tue Mar 18 07:38:42 2003
+++ anobjrmap6/include/asm-ia64/pgtable.h Thu Mar 20 17:10:45 2003
@@ -420,8 +420,6 @@
/* We provide our own get_unmapped_area to cope with VA holes for userland */
#define HAVE_ARCH_UNMAPPED_AREA
-typedef pte_t *pte_addr_t;
-
# endif /* !__ASSEMBLY__ */
/*
--- anobjrmap5/include/asm-ia64/rmap.h Wed Aug 28 06:38:20 2002
+++ anobjrmap6/include/asm-ia64/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _ASM_IA64_RMAP_H
-#define _ASM_IA64_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif /* _ASM_IA64_RMAP_H */
--- anobjrmap5/include/asm-m68k/pgtable.h Mon Sep 16 05:51:50 2002
+++ anobjrmap6/include/asm-m68k/pgtable.h Thu Mar 20 17:10:45 2003
@@ -172,8 +172,6 @@
#ifndef __ASSEMBLY__
#include <asm-generic/pgtable.h>
-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */
/*
--- anobjrmap5/include/asm-m68k/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-m68k/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _M68K_RMAP_H
-#define _M68K_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-m68knommu/pgtable.h Tue Nov 5 00:03:09 2002
+++ anobjrmap6/include/asm-m68knommu/pgtable.h Thu Mar 20 17:10:45 2003
@@ -11,8 +11,6 @@
#include <asm/page.h>
#include <asm/io.h>
-typedef pte_t *pte_addr_t;
-
/*
* Trivial page table functions.
*/
--- anobjrmap5/include/asm-m68knommu/rmap.h Tue Nov 5 00:03:09 2002
+++ anobjrmap6/include/asm-m68knommu/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,2 +0,0 @@
-/* Do not need anything here */
-
--- anobjrmap5/include/asm-mips/pgtable.h Wed Mar 5 07:26:32 2003
+++ anobjrmap6/include/asm-mips/pgtable.h Thu Mar 20 17:10:45 2003
@@ -771,8 +771,6 @@
#include <asm-generic/pgtable.h>
-typedef pte_t *pte_addr_t;
-
#endif /* !defined (_LANGUAGE_ASSEMBLY) */
#define io_remap_page_range remap_page_range
--- anobjrmap5/include/asm-mips/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-mips/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _MIPS_RMAP_H
-#define _MIPS_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-mips64/pgtable.h Wed Mar 5 07:26:32 2003
+++ anobjrmap6/include/asm-mips64/pgtable.h Thu Mar 20 17:10:45 2003
@@ -811,8 +811,6 @@
#include <asm-generic/pgtable.h>
-typedef pte_t *pte_addr_t;
-
#endif /* !defined (_LANGUAGE_ASSEMBLY) */
/*
--- anobjrmap5/include/asm-mips64/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-mips64/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _MIPS64_RMAP_H
-#define _MIPS64_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-parisc/cacheflush.h Wed Mar 19 11:05:12 2003
+++ anobjrmap6/include/asm-parisc/cacheflush.h Thu Mar 20 17:10:45 2003
@@ -71,7 +71,7 @@
static inline void flush_dcache_page(struct page *page)
{
- if (page->mapping && list_empty(&page->mapping->i_mmap) &&
+ if (page_mapping(page) && list_empty(&page->mapping->i_mmap) &&
list_empty(&page->mapping->i_mmap_shared)) {
set_bit(PG_dcache_dirty, &page->flags);
} else {
--- anobjrmap5/include/asm-parisc/pgtable.h Tue Mar 18 07:38:43 2003
+++ anobjrmap6/include/asm-parisc/pgtable.h Thu Mar 20 17:10:45 2003
@@ -434,8 +434,6 @@
#define pte_same(A,B) (pte_val(A) == pte_val(B))
-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */
#define io_remap_page_range remap_page_range
--- anobjrmap5/include/asm-parisc/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-parisc/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _PARISC_RMAP_H
-#define _PARISC_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-ppc/pgtable.h Tue Mar 18 07:38:43 2003
+++ anobjrmap6/include/asm-ppc/pgtable.h Thu Mar 20 17:10:45 2003
@@ -570,8 +570,6 @@
*/
#define pgtable_cache_init() do { } while (0)
-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */
#endif /* _PPC_PGTABLE_H */
#endif /* __KERNEL__ */
--- anobjrmap5/include/asm-ppc/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-ppc/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,9 +0,0 @@
-#ifndef _PPC_RMAP_H
-#define _PPC_RMAP_H
-
-/* PPC calls pte_alloc() before mem_map[] is setup ... */
-#define BROKEN_PPC_PTE_ALLOC_ONE
-
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-ppc64/pgtable.h Wed Mar 19 11:05:12 2003
+++ anobjrmap6/include/asm-ppc64/pgtable.h Thu Mar 20 17:10:45 2003
@@ -375,7 +375,5 @@
extern void hpte_init_pSeries(void);
extern void hpte_init_iSeries(void);
-typedef pte_t *pte_addr_t;
-
#endif /* __ASSEMBLY__ */
#endif /* _PPC64_PGTABLE_H */
--- anobjrmap5/include/asm-ppc64/rmap.h Thu Jul 25 13:04:58 2002
+++ anobjrmap6/include/asm-ppc64/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,9 +0,0 @@
-#ifndef _PPC64_RMAP_H
-#define _PPC64_RMAP_H
-
-/* PPC64 calls pte_alloc() before mem_map[] is setup ... */
-#define BROKEN_PPC_PTE_ALLOC_ONE
-
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-s390/pgtable.h Tue Mar 18 07:38:43 2003
+++ anobjrmap6/include/asm-s390/pgtable.h Thu Mar 20 17:10:45 2003
@@ -422,8 +422,7 @@
pte_t __pte = mk_pte_phys(__physpage, __pgprot); \
\
if (!(pgprot_val(__pgprot) & _PAGE_ISCLEAN)) { \
- int __users = !!PagePrivate(__page) + !!__page->mapping; \
- if (__users + page_count(__page) == 1) \
+ if (page_count(__page) == 1) \
pte_val(__pte) |= _PAGE_MKCLEAN; \
} \
__pte; \
@@ -437,8 +436,7 @@
pte_t __pte = mk_pte_phys(__physpage, __pgprot); \
\
if (!(pgprot_val(__pgprot) & _PAGE_ISCLEAN)) { \
- int __users = !!PagePrivate(__page) + !!__page->mapping; \
- if (__users + page_count(__page) == 1) \
+ if (page_count(__page) == 1) \
pte_val(__pte) |= _PAGE_MKCLEAN; \
} \
__pte; \
@@ -507,8 +505,6 @@
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */
#define kern_addr_valid(addr) (1)
--- anobjrmap5/include/asm-s390/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-s390/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _S390_RMAP_H
-#define _S390_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-s390x/pgtable.h Tue Mar 18 07:38:43 2003
+++ anobjrmap6/include/asm-s390x/pgtable.h Thu Mar 20 17:10:45 2003
@@ -441,8 +441,7 @@
pte_t __pte = mk_pte_phys(__physpage, __pgprot); \
\
if (!(pgprot_val(__pgprot) & _PAGE_ISCLEAN)) { \
- int __users = !!PagePrivate(__page) + !!__page->mapping; \
- if (__users + page_count(__page) == 1) \
+ if (page_count(__page) == 1) \
pte_val(__pte) |= _PAGE_MKCLEAN; \
} \
__pte; \
@@ -456,8 +455,7 @@
pte_t __pte = mk_pte_phys(__physpage, __pgprot); \
\
if (!(pgprot_val(__pgprot) & _PAGE_ISCLEAN)) { \
- int __users = !!PagePrivate(__page) + !!__page->mapping; \
- if (__users + page_count(__page) == 1) \
+ if (page_count(__page) == 1) \
pte_val(__pte) |= _PAGE_MKCLEAN; \
} \
__pte; \
@@ -533,8 +531,6 @@
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */
#define kern_addr_valid(addr) (1)
--- anobjrmap5/include/asm-s390x/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-s390x/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _S390X_RMAP_H
-#define _S390X_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-sh/pgalloc.h Wed May 8 20:42:40 2002
+++ anobjrmap6/include/asm-sh/pgalloc.h Thu Mar 20 17:10:45 2003
@@ -109,7 +109,7 @@
unsigned long pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(page);
- if (!page->mapping || !page->mapping->i_mmap_shared)
+ if (!page_mapping(page) || list_empty(&page->mapping->i_mmap_shared))
__clear_bit(PG_mapped, &page->flags);
}
}
--- anobjrmap5/include/asm-sh/pgtable.h Tue Mar 18 07:38:43 2003
+++ anobjrmap6/include/asm-sh/pgtable.h Thu Mar 20 17:10:45 2003
@@ -307,8 +307,6 @@
#define pte_same(A,B) (pte_val(A) == pte_val(B))
-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */
#define kern_addr_valid(addr) (1)
--- anobjrmap5/include/asm-sh/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-sh/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _SH_RMAP_H
-#define _SH_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-sparc/pgtable.h Mon Sep 16 05:51:51 2002
+++ anobjrmap6/include/asm-sparc/pgtable.h Thu Mar 20 17:10:46 2003
@@ -442,8 +442,6 @@
#include <asm-generic/pgtable.h>
-typedef pte_t *pte_addr_t;
-
#endif /* !(__ASSEMBLY__) */
/* We provide our own get_unmapped_area to cope with VA holes for userland */
--- anobjrmap5/include/asm-sparc/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-sparc/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _SPARC_RMAP_H
-#define _SPARC_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-sparc64/pgtable.h Tue Mar 18 07:38:44 2003
+++ anobjrmap6/include/asm-sparc64/pgtable.h Thu Mar 20 17:10:46 2003
@@ -369,8 +369,6 @@
extern void check_pgt_cache(void);
-typedef pte_t *pte_addr_t;
-
#endif /* !(__ASSEMBLY__) */
#endif /* !(_SPARC64_PGTABLE_H) */
--- anobjrmap5/include/asm-sparc64/rmap.h Sat Jul 20 20:56:06 2002
+++ anobjrmap6/include/asm-sparc64/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _SPARC64_RMAP_H
-#define _SPARC64_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
--- anobjrmap5/include/asm-um/pgtable.h Tue Mar 18 07:38:44 2003
+++ anobjrmap6/include/asm-um/pgtable.h Thu Mar 20 17:10:46 2003
@@ -385,18 +385,6 @@
#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
-#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G)
-typedef u32 pte_addr_t;
-#endif
-
-#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G)
-typedef u64 pte_addr_t;
-#endif
-
-#if !defined(CONFIG_HIGHPTE)
-typedef pte_t *pte_addr_t;
-#endif
-
#define update_mmu_cache(vma,address,pte) do ; while (0)
/* Encode and de-code a swap entry */
--- anobjrmap5/include/asm-um/rmap.h Mon Sep 16 05:51:51 2002
+++ anobjrmap6/include/asm-um/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,6 +0,0 @@
-#ifndef __UM_RMAP_H
-#define __UM_RMAP_H
-
-#include "asm/arch/rmap.h"
-
-#endif
--- anobjrmap5/include/asm-v850/pgtable.h Tue Nov 5 00:03:09 2002
+++ anobjrmap6/include/asm-v850/pgtable.h Thu Mar 20 17:10:46 2003
@@ -5,8 +5,6 @@
#include <asm/page.h>
-typedef pte_t *pte_addr_t;
-
#define pgd_present(pgd) (1) /* pages are always present on NO_MM */
#define pgd_none(pgd) (0)
#define pgd_bad(pgd) (0)
--- anobjrmap5/include/asm-v850/rmap.h Tue Nov 5 00:03:09 2002
+++ anobjrmap6/include/asm-v850/rmap.h Thu Jan 1 01:00:00 1970
@@ -1 +0,0 @@
-/* Do not need anything here */
--- anobjrmap5/include/asm-x86_64/pgtable.h Wed Mar 19 11:05:15 2003
+++ anobjrmap6/include/asm-x86_64/pgtable.h Thu Mar 20 17:10:46 2003
@@ -380,8 +380,6 @@
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-typedef pte_t *pte_addr_t;
-
#endif /* !__ASSEMBLY__ */
#ifndef CONFIG_DISCONTIGMEM
--- anobjrmap5/include/asm-x86_64/rmap.h Wed Oct 16 06:31:03 2002
+++ anobjrmap6/include/asm-x86_64/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,7 +0,0 @@
-#ifndef _X8664_RMAP_H
-#define _X8664_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#endif
anobjrmap 4/6 add anonmm to track anonymous pages
Introduce a struct anonmm per mm to track anonymous pages;
all forks from one exec share the same bundle of linked anonmms:
anonymous pages may start out in one mm but get forked into
another later. Callouts from fork.c to rmap.c allocate, dup
and exit the anonmms.
page_add_rmap now takes page*, vma*, uvaddr, anon args.
In the file-backed case, vma is used to check whether uvaddr
matches vma_address from page->index and vm_pgoff: nonlinear
is handled in the next patch. In the anonymous case, vma is
used to find vma->vm_mm->anonmm to save in page->mapping,
with uvaddr saved in index.
page_referenced and try_to_unmap call _anon or _obj variants
to process lists, which call _one for each vma or anonmm.
put_dirty_page (put_dirty_page? odd name, let's call it put_stack_page)
in exec.c now takes vma* instead of tsk*; it's always on current anyway.
Make a habit of raising rss before page_add_rmap: the loops skip
mms with rss 0, partly to save time, but also to avoid catching a
child mm while it's inconsistent between dup_rmap and dup_mmap.
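To make the vma_address check concrete, a userspace replica of
its arithmetic (illustrative values only, and assuming
PAGE_CACHE_SIZE == PAGE_SIZE):

	#include <stdio.h>

	#define PAGE_SHIFT		12
	#define PAGE_CACHE_SHIFT	12
	#define NOADDR			(~0UL)

	/* replica of this patch's vma_address(): index is the page's
	 * file offset in PAGE_CACHE_SIZE units, pgoff the vma's start
	 * offset in the file, start/end its user address range */
	static unsigned long vma_address(unsigned long index,
		unsigned long start, unsigned long end, unsigned long pgoff)
	{
		unsigned long off = index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		unsigned long address = start + ((off - pgoff) << PAGE_SHIFT);
		return (address >= start && address < end)? address: NOADDR;
	}

	int main(void)
	{
		/* vma at 0x40000000-0x40010000 maps the file from page 0x10:
		 * file page 0x12 lands at 0x40002000; page 0x30 is outside
		 * the vma, so NOADDR */
		printf("%#lx %#lx\n",
			vma_address(0x12, 0x40000000UL, 0x40010000UL, 0x10),
			vma_address(0x30, 0x40000000UL, 0x40010000UL, 0x10));
		return 0;
	}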
fs/exec.c | 27 +--
include/linux/mm.h | 7
include/linux/rmap.h | 16 +-
include/linux/sched.h | 1
kernel/fork.c | 21 ++
mm/fremap.c | 2
mm/memory.c | 9 -
mm/mremap.c | 7
mm/rmap.c | 380 +++++++++++++++++++++++++++++++++++++++++---------
mm/swapfile.c | 7
10 files changed, 381 insertions(+), 96 deletions(-)
--- anobjrmap3/fs/exec.c Thu Mar 20 17:10:12 2003
+++ anobjrmap4/fs/exec.c Thu Mar 20 17:10:23 2003
@@ -286,10 +286,12 @@
* This routine is used to map in a page into an address space: needed by
* execve() for the initial stack and environment pages.
*
- * tsk->mmap_sem is held for writing.
+ * mm->mmap_sem is held for writing.
*/
-void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
+void put_stack_page(struct vm_area_struct *mpnt,
+ struct page *page, unsigned long address)
{
+ struct mm_struct *mm = mpnt->vm_mm;
pgd_t * pgd;
pmd_t * pmd;
pte_t * pte;
@@ -297,33 +299,33 @@
if (page_count(page) != 1)
printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address);
- pgd = pgd_offset(tsk->mm, address);
- spin_lock(&tsk->mm->page_table_lock);
- pmd = pmd_alloc(tsk->mm, pgd, address);
+ pgd = pgd_offset(mm, address);
+ spin_lock(&mm->page_table_lock);
+ pmd = pmd_alloc(mm, pgd, address);
if (!pmd)
goto out;
- pte = pte_alloc_map(tsk->mm, pmd, address);
+ pte = pte_alloc_map(mm, pmd, address);
if (!pte)
goto out;
if (!pte_none(*pte)) {
pte_unmap(pte);
goto out;
}
+ mm->rss++;
lru_cache_add_active(page);
flush_dcache_page(page);
flush_page_to_ram(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
- page_add_rmap(page, 1);
+ page_add_rmap(page, mpnt, address, 1);
pte_unmap(pte);
- tsk->mm->rss++;
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);
/* no need for flush_tlb */
return;
out:
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);
__free_page(page);
- force_sig(SIGKILL, tsk);
+ force_sig(SIGKILL, current);
return;
}
@@ -416,7 +418,7 @@
struct page *page = bprm->page[i];
if (page) {
bprm->page[i] = NULL;
- put_dirty_page(current,page,stack_base);
+ put_stack_page(mpnt,page,stack_base);
}
stack_base += PAGE_SIZE;
}
@@ -429,7 +431,6 @@
#else
-#define put_dirty_page(tsk, page, address)
#define setup_arg_pages(bprm) (0)
static inline void free_arg_pages(struct linux_binprm *bprm)
{
--- anobjrmap3/include/linux/mm.h Thu Mar 20 17:10:12 2003
+++ anobjrmap4/include/linux/mm.h Thu Mar 20 17:10:23 2003
@@ -577,12 +577,19 @@
extern unsigned int nr_used_zone_pages(void);
#ifdef CONFIG_MMU
+extern void put_stack_page(struct vm_area_struct *,
+ struct page *, unsigned long);
extern struct page * vmalloc_to_page(void *addr);
extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
int write);
extern int remap_page_range(struct vm_area_struct *vma, unsigned long from,
unsigned long to, unsigned long size, pgprot_t prot);
#else
+static inline void put_stack_page(struct vm_area_struct *,
+ struct page *, unsigned long)
+{
+ return;
+}
static inline struct page * vmalloc_to_page(void *addr)
{
return NULL;
--- anobjrmap3/include/linux/rmap.h Thu Mar 20 17:10:12 2003
+++ anobjrmap4/include/linux/rmap.h Thu Mar 20 17:10:23 2003
@@ -8,7 +8,9 @@
#include <linux/linkage.h>
#ifdef CONFIG_MMU
-void FASTCALL(page_add_rmap(struct page *, int anon));
+void page_add_rmap(struct page *, struct vm_area_struct *,
+ unsigned long addr, int anon);
+void page_turn_rmap(struct page *, struct vm_area_struct *);
void FASTCALL(page_dup_rmap(struct page *));
void FASTCALL(page_remove_rmap(struct page *));
@@ -18,10 +20,22 @@
int FASTCALL(page_referenced(struct page *));
int FASTCALL(try_to_unmap(struct page *));
+/*
+ * Called from kernel/fork.c to manage anonymous memory
+ */
+void init_rmap(void);
+int exec_rmap(struct mm_struct *);
+int dup_rmap(struct mm_struct *, struct mm_struct *oldmm);
+void exit_rmap(struct mm_struct *);
+
#else /* !CONFIG_MMU */
#define page_referenced(page) TestClearPageReferenced(page)
#define try_to_unmap(page) SWAP_FAIL
+#define init_rmap() do {} while (0)
+#define exec_rmap(mm) (0)
+#define dup_rmap(mm, oldmm) (0)
+#define exit_rmap(mm) do {} while (0)
#endif /* CONFIG_MMU */
--- anobjrmap3/include/linux/sched.h Wed Mar 19 11:05:16 2003
+++ anobjrmap4/include/linux/sched.h Thu Mar 20 17:10:23 2003
@@ -198,6 +198,7 @@
* together off init_mm.mmlist, and are protected
* by mmlist_lock
*/
+ struct anonmm *anonmm; /* For rmap to track anon mem */
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
--- anobjrmap3/kernel/fork.c Wed Mar 19 11:05:16 2003
+++ anobjrmap4/kernel/fork.c Thu Mar 20 17:10:23 2003
@@ -30,6 +30,7 @@
#include <linux/futex.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
+#include <linux/rmap.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -333,6 +334,7 @@
vm_unacct_memory(charge);
goto out;
}
+
static inline int mm_alloc_pgd(struct mm_struct * mm)
{
mm->pgd = pgd_alloc(mm);
@@ -377,7 +379,6 @@
free_mm(mm);
return NULL;
}
-
/*
* Allocate and initialize an mm_struct.
@@ -389,9 +390,14 @@
mm = allocate_mm();
if (mm) {
memset(mm, 0, sizeof(*mm));
- return mm_init(mm);
+ mm = mm_init(mm);
+ if (mm && exec_rmap(mm)) {
+ mm_free_pgd(mm);
+ free_mm(mm);
+ mm = NULL;
+ }
}
- return NULL;
+ return mm;
}
/*
@@ -418,6 +424,7 @@
spin_unlock(&mmlist_lock);
exit_aio(mm);
exit_mmap(mm);
+ exit_rmap(mm);
mmdrop(mm);
}
}
@@ -504,6 +511,12 @@
if (!mm_init(mm))
goto fail_nomem;
+ if (dup_rmap(mm, oldmm)) {
+ mm_free_pgd(mm);
+ free_mm(mm);
+ goto fail_nomem;
+ }
+
if (init_new_context(tsk,mm))
goto free_pt;
@@ -1177,4 +1190,6 @@
SLAB_HWCACHE_ALIGN, NULL, NULL);
if(!mm_cachep)
panic("vma_init: Cannot alloc mm_struct SLAB cache");
+
+ init_rmap();
}
--- anobjrmap3/mm/fremap.c Thu Mar 20 17:10:12 2003
+++ anobjrmap4/mm/fremap.c Thu Mar 20 17:10:23 2003
@@ -75,7 +75,7 @@
flush_icache_page(vma, page);
entry = mk_pte(page, prot);
set_pte(pte, entry);
- page_add_rmap(page, 0);
+ page_add_rmap(page, vma, addr, 0);
pte_unmap(pte);
if (flush)
flush_tlb_page(vma, addr);
--- anobjrmap3/mm/memory.c Thu Mar 20 17:10:12 2003
+++ anobjrmap4/mm/memory.c Thu Mar 20 17:10:23 2003
@@ -940,6 +940,7 @@
flush_cache_page(vma, address);
establish_pte(vma, address, page_table,
pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
+ page_turn_rmap(old_page, vma);
pte_unmap(page_table);
ret = VM_FAULT_MINOR;
goto out;
@@ -969,7 +970,7 @@
else
page_remove_rmap(old_page);
break_cow(vma, new_page, address, page_table);
- page_add_rmap(new_page, 1);
+ page_add_rmap(new_page, vma, address, 1);
lru_cache_add_active(new_page);
/* Free the old page.. */
@@ -1170,7 +1171,7 @@
flush_page_to_ram(page);
flush_icache_page(vma, page);
set_pte(page_table, pte);
- page_add_rmap(page, 1);
+ page_add_rmap(page, vma, address, 1);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
@@ -1227,7 +1228,7 @@
set_pte(page_table, entry);
/* ignores ZERO_PAGE */
- page_add_rmap(page, 1);
+ page_add_rmap(page, vma, addr, 1);
pte_unmap(page_table);
/* No need to invalidate - it was non-present before */
@@ -1312,7 +1313,7 @@
if (write_access)
entry = pte_mkwrite(pte_mkdirty(entry));
set_pte(page_table, entry);
- page_add_rmap(new_page, anon);
+ page_add_rmap(new_page, vma, address, anon);
pte_unmap(page_table);
} else {
/* One of our sibling threads was faster, back out. */
--- anobjrmap3/mm/mremap.c Thu Mar 20 17:10:12 2003
+++ anobjrmap4/mm/mremap.c Thu Mar 20 17:10:23 2003
@@ -83,7 +83,8 @@
}
static int
-copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst)
+copy_one_pte(struct vm_area_struct *vma, pte_t *src, pte_t *dst,
+ unsigned long old_addr, unsigned long new_addr)
{
pte_t pte;
struct page *page = NULL;
@@ -98,7 +99,7 @@
if (page) {
int anon = PageAnon(page);
page_remove_rmap(page);
- page_add_rmap(page, anon);
+ page_add_rmap(page, vma, new_addr, anon);
}
}
return 0;
@@ -127,7 +128,7 @@
dst = alloc_one_pte_map(mm, new_addr);
if (src == NULL)
src = get_one_pte_map_nested(mm, old_addr);
- error = copy_one_pte(mm, src, dst);
+ error = copy_one_pte(vma, src, dst, old_addr, new_addr);
pte_unmap_nested(src);
pte_unmap(dst);
}
--- anobjrmap3/mm/rmap.c Thu Mar 20 17:10:12 2003
+++ anobjrmap4/mm/rmap.c Thu Mar 20 17:10:23 2003
@@ -29,45 +29,165 @@
#define page_mapcount(page) ((page)->rmap_count)
+#define NOADDR (~0UL) /* impossible user virtual address */
+
+/*
+ * struct anonmm: to track a bundle of anonymous memory mappings.
+ *
+ * Could be embedded in mm_struct, but mm_struct is rather heavyweight,
+ * and we may need the anonmm to stay around long after the mm_struct
+ * and its pgd have been freed: because pages originally faulted into
+ * that mm have been duped into forked mms, and still need tracking.
+ */
+struct anonmm {
+ atomic_t count; /* ref count, incl. 1 per page */
+ spinlock_t lock; /* head's locks list; others unused */
+ struct mm_struct *mm; /* assoc mm_struct, NULL when gone */
+ struct anonmm *head; /* exec starts new chain from head */
+ struct list_head list; /* chain of associated anonmms */
+};
+static kmem_cache_t *anonmm_cachep;
+
/*
- * Something oopsable to put for now in the page->mapping
- * of an anonymous page, to test that it is ignored.
+ * At what user virtual address is page expected in file-backed vma?
*/
-#define ANON_MAPPING_DEBUG ((struct address_space *) 1)
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long pgoff;
+ unsigned long address;
+
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ return (address >= vma->vm_start && address < vma->vm_end)?
+ address: NOADDR;
+}
+
+/**
+ ** Functions for creating and destroying struct anonmm.
+ **/
+
+void __init
+init_rmap(void)
+{
+ anonmm_cachep = kmem_cache_create("anonmm",
+ sizeof(struct anonmm), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!anonmm_cachep)
+ panic("init_rmap: Cannot alloc anonmm SLAB cache");
+}
+
+int
+exec_rmap(struct mm_struct *mm)
+{
+ struct anonmm *anonmm;
+
+ anonmm = kmem_cache_alloc(anonmm_cachep, SLAB_KERNEL);
+ if (unlikely(!anonmm))
+ return -ENOMEM;
+
+ atomic_set(&anonmm->count, 2); /* ref by mm and head */
+ anonmm->lock = SPIN_LOCK_UNLOCKED; /* this lock is used */
+ anonmm->mm = mm;
+ anonmm->head = anonmm;
+ INIT_LIST_HEAD(&anonmm->list);
+ mm->anonmm = anonmm;
+ return 0;
+}
+
+int
+dup_rmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct anonmm *anonmm;
+ struct anonmm *anonhd = oldmm->anonmm->head;
+
+ anonmm = kmem_cache_alloc(anonmm_cachep, SLAB_KERNEL);
+ if (unlikely(!anonmm))
+ return -ENOMEM;
+
+ /*
+ * copy_mm calls us before dup_mmap has reset the mm fields,
+ * so reset rss ourselves before adding to anonhd's list,
+ * to keep away from this mm until it's worth examining.
+ */
+ mm->rss = 0;
+
+ atomic_set(&anonmm->count, 1); /* ref by mm */
+ anonmm->lock = SPIN_LOCK_UNLOCKED; /* this lock is not used */
+ anonmm->mm = mm;
+ anonmm->head = anonhd;
+ spin_lock(&anonhd->lock);
+ atomic_inc(&anonhd->count); /* ref by anonmm's head */
+ list_add_tail(&anonmm->list, &anonhd->list);
+ spin_unlock(&anonhd->lock);
+ mm->anonmm = anonmm;
+ return 0;
+}
+
+void
+exit_rmap(struct mm_struct *mm)
+{
+ struct anonmm *anonmm = mm->anonmm;
+ struct anonmm *anonhd = anonmm->head;
+
+ mm->anonmm = NULL;
+ spin_lock(&anonhd->lock);
+ anonmm->mm = NULL;
+ if (atomic_dec_and_test(&anonmm->count)) {
+ BUG_ON(anonmm == anonhd);
+ list_del(&anonmm->list);
+ kmem_cache_free(anonmm_cachep, anonmm);
+ if (atomic_dec_and_test(&anonhd->count))
+ BUG();
+ }
+ spin_unlock(&anonhd->lock);
+ if (atomic_read(&anonhd->count) == 1) {
+ BUG_ON(anonhd->mm);
+ BUG_ON(!list_empty(&anonhd->list));
+ kmem_cache_free(anonmm_cachep, anonhd);
+ }
+}
+
+static void
+free_anonmm(struct anonmm *anonmm)
+{
+ struct anonmm *anonhd = anonmm->head;
+
+ BUG_ON(anonmm->mm);
+ BUG_ON(anonmm == anonhd);
+ spin_lock(&anonhd->lock);
+ list_del(&anonmm->list);
+ if (atomic_dec_and_test(&anonhd->count))
+ BUG();
+ spin_unlock(&anonhd->lock);
+ kmem_cache_free(anonmm_cachep, anonmm);
+}
static inline void
clear_page_anon(struct page *page)
{
- BUG_ON(page->mapping != ANON_MAPPING_DEBUG);
+ struct anonmm *anonmm = (struct anonmm *) page->mapping;
+
page->mapping = NULL;
ClearPageAnon(page);
+ if (atomic_dec_and_test(&anonmm->count))
+ free_anonmm(anonmm);
}
/**
** Subfunctions of page_referenced: page_referenced_one called
- ** repeatedly from page_referenced_obj.
+ ** repeatedly from either page_referenced_anon or page_referenced_obj.
**/
static int
-page_referenced_one(struct page *page, struct vm_area_struct *vma)
+page_referenced_one(struct page *page, struct mm_struct *mm,
+ unsigned long address, unsigned long *mapcount)
{
- struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
- unsigned long loffset;
- unsigned long address;
int referenced = 0;
- loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
- if (loffset < vma->vm_pgoff)
- goto out;
-
- address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
-
- if (address >= vma->vm_end)
- goto out;
-
if (!spin_trylock(&mm->page_table_lock)) {
referenced = 1;
goto out;
@@ -91,6 +211,8 @@
if (ptep_test_and_clear_young(pte))
referenced++;
+ (*mapcount)--;
+
out_unmap:
pte_unmap(pte);
@@ -102,21 +224,69 @@
}
static inline int
-page_referenced_obj(struct page *page)
+page_referenced_anon(struct page *page, unsigned long *mapcount)
+{
+ struct anonmm *anonmm = (struct anonmm *) page->mapping;
+ struct anonmm *anonhd = anonmm->head;
+ struct list_head *seek_head;
+ int referenced = 0;
+
+ spin_lock(&anonhd->lock);
+ if (anonmm->mm && anonmm->mm->rss) {
+ referenced += page_referenced_one(
+ page, anonmm->mm, page->index, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ seek_head = &anonmm->list;
+ list_for_each_entry(anonmm, seek_head, list) {
+ if (!anonmm->mm || !anonmm->mm->rss)
+ continue;
+ referenced += page_referenced_one(
+ page, anonmm->mm, page->index, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+out:
+ spin_unlock(&anonhd->lock);
+ return referenced;
+}
+
+static inline int
+page_referenced_obj(struct page *page, unsigned long *mapcount)
{
struct address_space *mapping = page->mapping;
struct vm_area_struct *vma;
+ unsigned long address;
int referenced = 0;
if (down_trylock(&mapping->i_shared_sem))
return 1;
- list_for_each_entry(vma, &mapping->i_mmap, shared)
- referenced += page_referenced_one(page, vma);
-
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared)
- referenced += page_referenced_one(page, vma);
+ list_for_each_entry(vma, &mapping->i_mmap, shared) {
+ if (!vma->vm_mm->rss)
+ continue;
+ address = vma_address(page, vma);
+ if (address != NOADDR) {
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ }
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ if (!vma->vm_mm->rss)
+ continue;
+ address = vma_address(page, vma);
+ if (address != NOADDR) {
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ }
+out:
up(&mapping->i_shared_sem);
return referenced;
}
@@ -132,28 +302,38 @@
int
page_referenced(struct page *page)
{
+ unsigned long mapcount;
int referenced;
referenced = !!TestClearPageReferenced(page);
- if (page_mapcount(page) && page->mapping && !PageAnon(page))
- referenced += page_referenced_obj(page);
+ mapcount = page_mapcount(page);
+ if (mapcount && page->mapping) {
+ referenced += PageAnon(page)?
+ page_referenced_anon(page, &mapcount):
+ page_referenced_obj(page, &mapcount);
+ }
return referenced;
}
/**
* page_add_rmap - add reverse mapping entry to a page
* @page: the page to add the mapping to
+ * @vma: the vma into which this page is being mapped
+ * @address: the virtual address at which page is mapped
* @anon: is this an anonymous (not file-backed) page?
*
* For general use: Add a new reverse mapping to a page.
* The caller needs to hold the mm->page_table_lock.
*/
void
-page_add_rmap(struct page *page, int anon)
+page_add_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, int anon)
{
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return;
+ address &= PAGE_MASK;
+
rmap_lock(page);
if (!page_mapped(page))
@@ -168,8 +348,11 @@
}
} else {
if (anon) {
+ struct anonmm *anonmm = vma->vm_mm->anonmm;
SetPageAnon(page);
- page->mapping = ANON_MAPPING_DEBUG;
+ page->index = address;
+ page->mapping = (struct address_space *) anonmm;
+ atomic_inc(&anonmm->count);
} else {
/*
* Driver did not assign page->mapping,
@@ -198,6 +381,35 @@
}
/**
+ * page_turn_rmap - turn reverse mapping from one mm to another.
+ * @page: the anonymous page originally mapped into some vma
+ * @vma: the new vma into which this page is now being mapped
+ *
+ * For do_wp_page only: update exclusive page with new mm,
+ * so that it can be located more quickly later on.
+ */
+void
+page_turn_rmap(struct page *page, struct vm_area_struct *vma)
+{
+ struct anonmm *old_anonmm = (struct anonmm *) page->mapping;
+ struct anonmm *new_anonmm = vma->vm_mm->anonmm;
+
+ BUG_ON(!PageAnon(page));
+ BUG_ON(page_mapcount(page) != 1);
+ if (new_anonmm == old_anonmm)
+ return;
+ /*
+ * Take rmap_lock since we don't hold old mm's page_table_lock.
+ */
+ rmap_lock(page);
+ clear_page_anon(page);
+ SetPageAnon(page);
+ page->mapping = (struct address_space *) new_anonmm;
+ atomic_inc(&new_anonmm->count);
+ rmap_unlock(page);
+}
+
+/**
* page_remove_rmap - take down reverse mapping to a page
* @page: page to remove mapping from
*
@@ -227,30 +439,20 @@
/**
** Subfunctions of try_to_unmap: try_to_unmap_one called
- ** repeatedly from try_to_unmap_obj.
+ ** repeatedly from either try_to_unmap_anon or try_to_unmap_obj.
**/
static int
-try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
+try_to_unmap_one(struct page *page, struct mm_struct *mm,
+ unsigned long address, unsigned long *mapcount,
+ struct vm_area_struct *vma)
{
- struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
pte_t pteval;
- unsigned long loffset;
- unsigned long address;
int ret = SWAP_AGAIN;
- loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
- if (loffset < vma->vm_pgoff)
- goto out;
-
- address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
-
- if (address >= vma->vm_end)
- goto out;
-
/*
* We need the page_table_lock to protect us from page faults,
* munmap, fork, etc...
@@ -273,8 +475,15 @@
if (page_to_pfn(page) != pte_pfn(*pte))
goto out_unmap;
- /* If the page is mlock()d, we cannot swap it out. */
- if (vma->vm_flags & VM_LOCKED) {
+ (*mapcount)--;
+
+ /*
+ * If the page is mlock()d, we cannot swap it out.
+ * During mremap, it's possible pages are not in a VMA.
+ */
+ if (!vma)
+ vma = find_vma(mm, address);
+ if (!vma || (vma->vm_flags & VM_LOCKED)) {
ret = SWAP_FAIL;
goto out_unmap;
}
@@ -284,11 +493,6 @@
pteval = ptep_get_and_clear(pte);
flush_tlb_page(vma, address);
- /*
- * This block makes no sense in this subpatch: neither anon
- * pages nor nonlinear pages get here. But we want to hold on
- * to this code, to use in later patches which correct that.
- */
if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private };
/*
@@ -300,15 +504,12 @@
set_pte(pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
} else {
- unsigned long pgidx;
/*
- * If a nonlinear mapping from sys_remap_file_pages,
- * then store the file page offset in the pte.
+ * This only comes into play with the next patch...
+ * If a nonlinear mapping then store
+ * the file page offset in the pte.
*/
- pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
- pgidx += vma->vm_pgoff;
- pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
- if (page->index != pgidx) {
+ if (address != vma_address(page, vma)) {
set_pte(pte, pgoff_to_pte(page->index));
BUG_ON(!pte_file(*pte));
}
@@ -318,10 +519,10 @@
if (pte_dirty(pteval))
set_page_dirty(page);
- mm->rss--;
BUG_ON(!page_mapcount(page));
page_mapcount(page)--;
page_cache_release(page);
+ mm->rss--;
out_unmap:
pte_unmap(pte);
@@ -334,25 +535,67 @@
}
static inline int
-try_to_unmap_obj(struct page *page)
+try_to_unmap_anon(struct page *page, unsigned long *mapcount)
+{
+ struct anonmm *anonmm = (struct anonmm *) page->mapping;
+ struct anonmm *anonhd = anonmm->head;
+ struct list_head *seek_head;
+ int ret = SWAP_AGAIN;
+
+ spin_lock(&anonhd->lock);
+ if (anonmm->mm && anonmm->mm->rss) {
+ ret = try_to_unmap_one(
+ page, anonmm->mm, page->index, mapcount, NULL);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
+ seek_head = &anonmm->list;
+ list_for_each_entry(anonmm, seek_head, list) {
+ if (!anonmm->mm || !anonmm->mm->rss)
+ continue;
+ ret = try_to_unmap_one(
+ page, anonmm->mm, page->index, mapcount, NULL);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
+out:
+ spin_unlock(&anonhd->lock);
+ return ret;
+}
+
+static inline int
+try_to_unmap_obj(struct page *page, unsigned long *mapcount)
{
struct address_space *mapping = page->mapping;
struct vm_area_struct *vma;
+ unsigned long address;
int ret = SWAP_AGAIN;
if (down_trylock(&mapping->i_shared_sem))
return ret;
list_for_each_entry(vma, &mapping->i_mmap, shared) {
- ret = try_to_unmap_one(page, vma);
- if (ret == SWAP_FAIL)
- goto out;
+ if (!vma->vm_mm->rss)
+ continue;
+ address = vma_address(page, vma);
+ if (address != NOADDR) {
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
}
list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- ret = try_to_unmap_one(page, vma);
- if (ret == SWAP_FAIL)
- goto out;
+ if (!vma->vm_mm->rss)
+ continue;
+ address = vma_address(page, vma);
+ if (address != NOADDR) {
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
}
out:
@@ -375,14 +618,17 @@
int
try_to_unmap(struct page *page)
{
- int ret = SWAP_FAIL;
+ unsigned long mapcount;
+ int ret;
BUG_ON(PageReserved(page));
BUG_ON(!PageLocked(page));
BUG_ON(!page_mapped(page));
- if (!PageAnon(page))
- ret = try_to_unmap_obj(page);
+ mapcount = page_mapcount(page);
+ ret = PageAnon(page)?
+ try_to_unmap_anon(page, &mapcount):
+ try_to_unmap_obj(page, &mapcount);
if (!page_mapped(page)) {
dec_page_state(nr_mapped);
--- anobjrmap3/mm/swapfile.c Thu Mar 20 17:10:12 2003
+++ anobjrmap4/mm/swapfile.c Thu Mar 20 17:10:23 2003
@@ -396,11 +396,11 @@
return;
if (unlikely(pte_none(pte) || pte_present(pte)))
return;
+ vma->vm_mm->rss++;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
- page_add_rmap(page, 1);
+ page_add_rmap(page, vma, address, 1);
swap_free(entry);
- ++vma->vm_mm->rss;
}
/* mmlist_lock and vma->vm_mm->page_table_lock are held */
@@ -425,8 +425,7 @@
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- unuse_pte(vma, offset+address-vma->vm_start,
- pte, entry, page);
+ unuse_pte(vma, offset + address, pte, entry, page);
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
anobjrmap 3/6 remove pte-pointer-based rmap
Lots of deletions: the next patch will put in the new anon rmap;
I expect it to look clearer if we first remove all of the old
pte-pointer-based anon rmap in this patch - which therefore
leaves anonymous rmap totally disabled, anon pages locked in
memory until the process frees them.
A few additions: the previous patch brought ClearPageAnon into
rmap.c instead of leaving it to final page free; but I think
there'd be a race with swapin or swapoff doing SetPageAnon:
so now SetPageAnon is done under lock within page_add_rmap,
that lock now being called rmap_lock instead of pte_chain_lock.
Removed nr_reverse_maps, ReverseMaps: easily reverted if that
poses a vmstat or meminfo compatibility problem, or if someone
is still interested in that number; but objrmap wasn't
maintaining it, and now that reverse maps occupy no space,
is the number worth showing?
Besides, look at page_dup_rmap for copy_page_range: I don't
want to clutter that with inc_page_state(nr_reverse_maps).
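To make the new shape concrete, here is a condensed sketch (not
the patch itself - the real code is in the mm/rmap.c diff below)
of page_add_rmap reduced to a locked count, with SetPageAnon
brought inside the rmap_lock so swapin or swapoff cannot race it:

	void page_add_rmap(struct page *page, int anon)
	{
		rmap_lock(page);	/* PG_rmaplock bit-spinlock */
		if (!page_mapped(page))
			inc_page_state(nr_mapped);
		page_mapcount(page)++;	/* plain count, no pte chains */
		if (anon && !page->mapping) {
			SetPageAnon(page);	/* serialized by rmap_lock */
			page->mapping = ANON_MAPPING_DEBUG;
		}
		rmap_unlock(page);
	}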
fs/exec.c | 10
fs/proc/proc_misc.c | 6
include/asm-generic/rmap.h | 90 ----
include/asm-i386/pgtable.h | 12
include/asm-i386/rmap.h | 21 -
include/linux/mm.h | 20 -
include/linux/page-flags.h | 10
include/linux/rmap.h | 27 -
init/main.c | 2
mm/fremap.c | 16
mm/memory.c | 130 +------
mm/mremap.c | 37 --
mm/nommu.c | 4
mm/page_alloc.c | 10
mm/rmap.c | 818 +++++++++------------------------------------
mm/swapfile.c | 14
mm/vmscan.c | 22 -
17 files changed, 250 insertions(+), 999 deletions(-)
--- anobjrmap2/fs/exec.c Thu Mar 20 17:09:50 2003
+++ anobjrmap3/fs/exec.c Thu Mar 20 17:10:12 2003
@@ -293,15 +293,11 @@
pgd_t * pgd;
pmd_t * pmd;
pte_t * pte;
- struct pte_chain *pte_chain;
if (page_count(page) != 1)
printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address);
pgd = pgd_offset(tsk->mm, address);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto out_sig;
spin_lock(&tsk->mm->page_table_lock);
pmd = pmd_alloc(tsk->mm, pgd, address);
if (!pmd)
@@ -316,22 +312,18 @@
lru_cache_add_active(page);
flush_dcache_page(page);
flush_page_to_ram(page);
- SetPageAnon(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
- pte_chain = page_add_rmap(page, pte, pte_chain);
+ page_add_rmap(page, 1);
pte_unmap(pte);
tsk->mm->rss++;
spin_unlock(&tsk->mm->page_table_lock);
/* no need for flush_tlb */
- pte_chain_free(pte_chain);
return;
out:
spin_unlock(&tsk->mm->page_table_lock);
-out_sig:
__free_page(page);
force_sig(SIGKILL, tsk);
- pte_chain_free(pte_chain);
return;
}
--- anobjrmap2/fs/proc/proc_misc.c Thu Mar 20 17:10:01 2003
+++ anobjrmap3/fs/proc/proc_misc.c Thu Mar 20 17:10:12 2003
@@ -177,8 +177,7 @@
"Mapped: %8lu kB\n"
"Slab: %8lu kB\n"
"Committed_AS: %8u kB\n"
- "PageTables: %8lu kB\n"
- "ReverseMaps: %8lu\n",
+ "PageTables: %8lu kB\n",
K(i.totalram),
K(i.freeram),
K(i.bufferram),
@@ -197,8 +196,7 @@
K(ps.nr_mapped),
K(ps.nr_slab),
K(committed),
- K(ps.nr_page_table_pages),
- ps.nr_reverse_maps
+ K(ps.nr_page_table_pages)
);
len += hugetlb_report_meminfo(page + len);
--- anobjrmap2/include/asm-generic/rmap.h Mon Feb 10 20:12:52 2003
+++ anobjrmap3/include/asm-generic/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,90 +0,0 @@
-#ifndef _GENERIC_RMAP_H
-#define _GENERIC_RMAP_H
-/*
- * linux/include/asm-generic/rmap.h
- *
- * Architecture dependent parts of the reverse mapping code,
- * this version should work for most architectures with a
- * 'normal' page table layout.
- *
- * We use the struct page of the page table page to find out
- * the process and full address of a page table entry:
- * - page->mapping points to the process' mm_struct
- * - page->index has the high bits of the address
- * - the lower bits of the address are calculated from the
- * offset of the page table entry within the page table page
- *
- * For CONFIG_HIGHPTE, we need to represent the address of a pte in a
- * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE
- * bits and is then ORed with the byte offset of the pte within its page.
- *
- * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for
- * the offset.
- *
- * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for
- * the offset.
- */
-#include <linux/mm.h>
-
-static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address)
-{
-#ifdef BROKEN_PPC_PTE_ALLOC_ONE
- /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
- extern int mem_init_done;
-
- if (!mem_init_done)
- return;
-#endif
- page->mapping = (void *)mm;
- page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
- inc_page_state(nr_page_table_pages);
-}
-
-static inline void pgtable_remove_rmap(struct page * page)
-{
- page->mapping = NULL;
- page->index = 0;
- dec_page_state(nr_page_table_pages);
-}
-
-static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
-{
- struct page * page = kmap_atomic_to_page(ptep);
- return (struct mm_struct *) page->mapping;
-}
-
-static inline unsigned long ptep_to_address(pte_t * ptep)
-{
- struct page * page = kmap_atomic_to_page(ptep);
- unsigned long low_bits;
- low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
- return page->index + low_bits;
-}
-
-#if CONFIG_HIGHPTE
-static inline pte_addr_t ptep_to_paddr(pte_t *ptep)
-{
- pte_addr_t paddr;
- paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT;
- return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK);
-}
-#else
-static inline pte_addr_t ptep_to_paddr(pte_t *ptep)
-{
- return (pte_addr_t)ptep;
-}
-#endif
-
-#ifndef CONFIG_HIGHPTE
-static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr)
-{
- return (pte_t *)pte_paddr;
-}
-
-static inline void rmap_ptep_unmap(pte_t *pte)
-{
- return;
-}
-#endif
-
-#endif /* _GENERIC_RMAP_H */
--- anobjrmap2/include/asm-i386/pgtable.h Wed Mar 19 11:05:12 2003
+++ anobjrmap3/include/asm-i386/pgtable.h Thu Mar 20 17:10:12 2003
@@ -288,18 +288,6 @@
#define pte_unmap_nested(pte) do { } while (0)
#endif
-#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G)
-typedef u32 pte_addr_t;
-#endif
-
-#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G)
-typedef u64 pte_addr_t;
-#endif
-
-#if !defined(CONFIG_HIGHPTE)
-typedef pte_t *pte_addr_t;
-#endif
-
/*
* The i386 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
--- anobjrmap2/include/asm-i386/rmap.h Mon Sep 16 05:51:50 2002
+++ anobjrmap3/include/asm-i386/rmap.h Thu Jan 1 01:00:00 1970
@@ -1,21 +0,0 @@
-#ifndef _I386_RMAP_H
-#define _I386_RMAP_H
-
-/* nothing to see, move along */
-#include <asm-generic/rmap.h>
-
-#ifdef CONFIG_HIGHPTE
-static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr)
-{
- unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT);
- unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK;
- return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off);
-}
-
-static inline void rmap_ptep_unmap(pte_t *pte)
-{
- kunmap_atomic(pte, KM_PTE2);
-}
-#endif
-
-#endif
--- anobjrmap2/include/linux/mm.h Thu Mar 20 17:10:01 2003
+++ anobjrmap3/include/linux/mm.h Thu Mar 20 17:10:12 2003
@@ -139,8 +139,6 @@
int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
};
-/* forward declaration; pte_chain is meant to be internal to rmap.c */
-struct pte_chain;
struct mmu_gather;
/*
@@ -167,12 +165,7 @@
unsigned long index; /* Our offset within mapping. */
struct list_head lru; /* Pageout list, eg. active_list;
protected by zone->lru_lock !! */
- union {
- struct pte_chain *chain;/* Reverse pte mapping pointer.
- * protected by PG_chainlock */
- pte_addr_t direct;
- int mapcount;
- } pte;
+ unsigned long rmap_count; /* Count mappings in mms */
unsigned long private; /* mapping-private opaque data */
/*
@@ -371,16 +364,7 @@
* refers to user virtual address space into which the page is mapped.
*/
#define page_mapping(page) (PageAnon(page)? NULL: (page)->mapping)
-
-/*
- * Return true if this page is mapped into pagetables. Subtle: test pte.direct
- * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain
- * is only 32-bit.
- */
-static inline int page_mapped(struct page *page)
-{
- return page->pte.direct != 0;
-}
+#define page_mapped(page) ((page)->rmap_count != 0)
/*
* Error return values for the *_nopage functions
--- anobjrmap2/include/linux/page-flags.h Thu Mar 20 17:10:01 2003
+++ anobjrmap3/include/linux/page-flags.h Thu Mar 20 17:10:12 2003
@@ -68,9 +68,8 @@
#define PG_private 12 /* Has something at ->private */
#define PG_writeback 13 /* Page is under writeback */
#define PG_nosave 14 /* Used for system suspend/resume */
-#define PG_chainlock 15 /* lock bit for ->pte_chain */
+#define PG_rmaplock 15 /* Lock bit for reversing to ptes */
-#define PG_direct 16 /* ->pte_chain points directly at pte */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
@@ -88,7 +87,6 @@
unsigned long nr_pagecache; /* Pages in pagecache */
unsigned long nr_swapcache; /* Pages in swapcache */
unsigned long nr_page_table_pages;/* Pages used for pagetables */
- unsigned long nr_reverse_maps; /* includes PageDirect */
unsigned long nr_mapped; /* mapped into pagetables */
unsigned long nr_slab; /* In slab */
#define GET_PAGE_STATE_LAST nr_slab
@@ -241,12 +239,6 @@
#define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags)
#define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags)
-#define PageDirect(page) test_bit(PG_direct, &(page)->flags)
-#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags)
-#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags)
-#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags)
-#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags)
-
#define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags)
#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
--- anobjrmap2/include/linux/rmap.h Thu Mar 20 17:10:01 2003
+++ anobjrmap3/include/linux/rmap.h Thu Mar 20 17:10:12 2003
@@ -8,20 +8,9 @@
#include <linux/linkage.h>
#ifdef CONFIG_MMU
-
-struct pte_chain;
-struct pte_chain *pte_chain_alloc(int gfp_flags);
-void __pte_chain_free(struct pte_chain *pte_chain);
-
-static inline void pte_chain_free(struct pte_chain *pte_chain)
-{
- if (pte_chain)
- __pte_chain_free(pte_chain);
-}
-
-struct pte_chain *FASTCALL(
- page_add_rmap(struct page *, pte_t *, struct pte_chain *));
-void FASTCALL(page_remove_rmap(struct page *, pte_t *));
+void FASTCALL(page_add_rmap(struct page *, int anon));
+void FASTCALL(page_dup_rmap(struct page *));
+void FASTCALL(page_remove_rmap(struct page *));
/*
* Called from mm/vmscan.c to handle paging out
@@ -43,7 +32,7 @@
#define SWAP_AGAIN 1
#define SWAP_FAIL 2
-static inline void pte_chain_lock(struct page *page)
+static inline void rmap_lock(struct page *page)
{
/*
* Assuming the lock is uncontended, this never enters
@@ -54,18 +43,18 @@
*/
preempt_disable();
#ifdef CONFIG_SMP
- while (test_and_set_bit(PG_chainlock, &page->flags)) {
- while (test_bit(PG_chainlock, &page->flags))
+ while (test_and_set_bit(PG_rmaplock, &page->flags)) {
+ while (test_bit(PG_rmaplock, &page->flags))
cpu_relax();
}
#endif
}
-static inline void pte_chain_unlock(struct page *page)
+static inline void rmap_unlock(struct page *page)
{
#ifdef CONFIG_SMP
smp_mb__before_clear_bit();
- clear_bit(PG_chainlock, &page->flags);
+ clear_bit(PG_rmaplock, &page->flags);
#endif
preempt_enable();
}
--- anobjrmap2/init/main.c Wed Mar 19 11:05:16 2003
+++ anobjrmap3/init/main.c Thu Mar 20 17:10:12 2003
@@ -82,7 +82,6 @@
extern void buffer_init(void);
extern void pidhash_init(void);
extern void pidmap_init(void);
-extern void pte_chain_init(void);
extern void radix_tree_init(void);
extern void free_initmem(void);
extern void populate_rootfs(void);
@@ -436,7 +435,6 @@
kmem_cache_sizes_init();
pidmap_init();
pgtable_cache_init();
- pte_chain_init();
fork_init(num_physpages);
proc_caches_init();
security_scaffolding_startup();
--- anobjrmap2/mm/fremap.c Thu Mar 20 17:10:01 2003
+++ anobjrmap3/mm/fremap.c Thu Mar 20 17:10:12 2003
@@ -31,7 +31,7 @@
if (!PageReserved(page)) {
if (pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page, ptep);
+ page_remove_rmap(page);
page_cache_release(page);
mm->rss--;
}
@@ -56,11 +56,6 @@
pte_t *pte, entry;
pgd_t *pgd;
pmd_t *pmd;
- struct pte_chain *pte_chain;
-
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto err;
pgd = pgd_offset(mm, addr);
spin_lock(&mm->page_table_lock);
@@ -80,19 +75,14 @@
flush_icache_page(vma, page);
entry = mk_pte(page, prot);
set_pte(pte, entry);
- pte_chain = page_add_rmap(page, pte, pte_chain);
+ page_add_rmap(page, 0);
pte_unmap(pte);
if (flush)
flush_tlb_page(vma, addr);
- spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
- return 0;
-
+ err = 0;
err_unlock:
spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
-err:
return err;
}
--- anobjrmap2/mm/memory.c Thu Mar 20 17:10:01 2003
+++ anobjrmap3/mm/memory.c Thu Mar 20 17:10:12 2003
@@ -47,7 +47,6 @@
#include <linux/rmap.h>
#include <asm/pgalloc.h>
-#include <asm/rmap.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
@@ -95,7 +94,7 @@
}
page = pmd_page(*dir);
pmd_clear(dir);
- pgtable_remove_rmap(page);
+ dec_page_state(nr_page_table_pages);
pte_free_tlb(tlb, page);
}
@@ -166,7 +165,7 @@
pte_free(new);
goto out;
}
- pgtable_add_rmap(new, mm, address);
+ inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
}
out:
@@ -192,7 +191,7 @@
pte_free_kernel(new);
goto out;
}
- pgtable_add_rmap(virt_to_page(new), mm, address);
+ inc_page_state(nr_page_table_pages);
pmd_populate_kernel(mm, pmd, new);
}
out:
@@ -219,20 +218,10 @@
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long cow;
- struct pte_chain *pte_chain = NULL;
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst, src, vma);
- pte_chain = pte_chain_alloc(GFP_ATOMIC);
- if (!pte_chain) {
- spin_unlock(&dst->page_table_lock);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- spin_lock(&dst->page_table_lock);
- if (!pte_chain)
- goto nomem;
- }
-
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -295,8 +284,7 @@
if (!pte_present(pte)) {
if (!pte_file(pte))
swap_duplicate(pte_to_swp_entry(pte));
- set_pte(dst_pte, pte);
- goto cont_copy_pte_range_noset;
+ goto cont_copy_pte_range;
}
pfn = pte_pfn(pte);
/* the pte points outside of valid memory, the
@@ -304,10 +292,8 @@
* and not mapped via rmap - duplicate the
* mapping as is.
*/
- if (!pfn_valid(pfn)) {
- set_pte(dst_pte, pte);
- goto cont_copy_pte_range_noset;
- }
+ if (!pfn_valid(pfn))
+ goto cont_copy_pte_range;
page = pfn_to_page(pfn);
if (PageReserved(page))
goto cont_copy_pte_range;
@@ -330,33 +316,9 @@
pte = pte_mkold(pte);
get_page(page);
dst->rss++;
-
+ page_dup_rmap(page);
cont_copy_pte_range:
set_pte(dst_pte, pte);
- pte_chain = page_add_rmap(page, dst_pte,
- pte_chain);
- if (pte_chain)
- goto cont_copy_pte_range_noset;
- pte_chain = pte_chain_alloc(GFP_ATOMIC);
- if (pte_chain)
- goto cont_copy_pte_range_noset;
-
- /*
- * pte_chain allocation failed, and we need to
- * run page reclaim.
- */
- pte_unmap_nested(src_pte);
- pte_unmap(dst_pte);
- spin_unlock(&src->page_table_lock);
- spin_unlock(&dst->page_table_lock);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- spin_lock(&dst->page_table_lock);
- if (!pte_chain)
- goto nomem;
- spin_lock(&src->page_table_lock);
- dst_pte = pte_offset_map(dst_pmd, address);
- src_pte = pte_offset_map_nested(src_pmd,
- address);
cont_copy_pte_range_noset:
address += PAGE_SIZE;
if (address >= end) {
@@ -379,10 +341,8 @@
out_unlock:
spin_unlock(&src->page_table_lock);
out:
- pte_chain_free(pte_chain);
return 0;
nomem:
- pte_chain_free(pte_chain);
return -ENOMEM;
}
@@ -423,7 +383,7 @@
page_mapping(page))
mark_page_accessed(page);
tlb->freed++;
- page_remove_rmap(page, ptep);
+ page_remove_rmap(page);
tlb_remove_page(tlb, page);
}
}
@@ -958,7 +918,6 @@
{
struct page *old_page, *new_page;
unsigned long pfn = pte_pfn(pte);
- struct pte_chain *pte_chain = NULL;
int ret;
if (unlikely(!pfn_valid(pfn))) {
@@ -994,9 +953,6 @@
page_cache_get(old_page);
spin_unlock(&mm->page_table_lock);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto no_mem;
new_page = alloc_page(GFP_HIGHUSER);
if (!new_page)
goto no_mem;
@@ -1010,10 +966,10 @@
if (pte_same(*page_table, pte)) {
if (PageReserved(old_page))
++mm->rss;
- page_remove_rmap(old_page, page_table);
+ else
+ page_remove_rmap(old_page);
break_cow(vma, new_page, address, page_table);
- SetPageAnon(new_page);
- pte_chain = page_add_rmap(new_page, page_table, pte_chain);
+ page_add_rmap(new_page, 1);
lru_cache_add_active(new_page);
/* Free the old page.. */
@@ -1023,16 +979,14 @@
page_cache_release(new_page);
page_cache_release(old_page);
ret = VM_FAULT_MINOR;
- goto out;
-
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
no_mem:
page_cache_release(old_page);
oom:
ret = VM_FAULT_OOM;
-out:
- spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
- return ret;
+ goto out;
}
static void vmtruncate_list(struct list_head *head, unsigned long pgoff)
@@ -1155,7 +1109,6 @@
swp_entry_t entry = pte_to_swp_entry(orig_pte);
pte_t pte;
int ret = VM_FAULT_MINOR;
- struct pte_chain *pte_chain = NULL;
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
@@ -1185,11 +1138,6 @@
}
mark_page_accessed(page);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain) {
- ret = -ENOMEM;
- goto out;
- }
lock_page(page);
/*
@@ -1222,15 +1170,13 @@
flush_page_to_ram(page);
flush_icache_page(vma, page);
set_pte(page_table, pte);
- SetPageAnon(page);
- pte_chain = page_add_rmap(page, page_table, pte_chain);
+ page_add_rmap(page, 1);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
out:
- pte_chain_free(pte_chain);
return ret;
}
@@ -1246,20 +1192,8 @@
{
pte_t entry;
struct page * page = ZERO_PAGE(addr);
- struct pte_chain *pte_chain;
int ret;
- pte_chain = pte_chain_alloc(GFP_ATOMIC);
- if (!pte_chain) {
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto no_mem;
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, addr);
- }
-
/* Read-only mapping of ZERO_PAGE. */
entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
@@ -1289,25 +1223,22 @@
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
lru_cache_add_active(page);
mark_page_accessed(page);
- SetPageAnon(page);
}
set_pte(page_table, entry);
/* ignores ZERO_PAGE */
- pte_chain = page_add_rmap(page, page_table, pte_chain);
+ page_add_rmap(page, 1);
pte_unmap(page_table);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
spin_unlock(&mm->page_table_lock);
ret = VM_FAULT_MINOR;
- goto out;
-
-no_mem:
- ret = VM_FAULT_OOM;
out:
- pte_chain_free(pte_chain);
return ret;
+no_mem:
+ ret = VM_FAULT_OOM;
+ goto out;
}
/*
@@ -1328,7 +1259,6 @@
{
struct page * new_page;
pte_t entry;
- struct pte_chain *pte_chain;
int anon = 0;
int ret;
@@ -1346,19 +1276,13 @@
if (new_page == NOPAGE_OOM)
return VM_FAULT_OOM;
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain)
- goto oom;
-
/*
* Should we do an early C-O-W break?
*/
if (write_access && !(vma->vm_flags & VM_SHARED)) {
struct page * page = alloc_page(GFP_HIGHUSER);
- if (!page) {
- page_cache_release(new_page);
+ if (!page)
goto oom;
- }
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
lru_cache_add_active(page);
@@ -1388,9 +1312,7 @@
if (write_access)
entry = pte_mkwrite(pte_mkdirty(entry));
set_pte(page_table, entry);
- if (anon)
- SetPageAnon(new_page);
- pte_chain = page_add_rmap(new_page, page_table, pte_chain);
+ page_add_rmap(new_page, anon);
pte_unmap(page_table);
} else {
/* One of our sibling threads was faster, back out. */
@@ -1405,12 +1327,12 @@
update_mmu_cache(vma, address, entry);
spin_unlock(&mm->page_table_lock);
ret = VM_FAULT_MAJOR;
- goto out;
-oom:
- ret = VM_FAULT_OOM;
out:
- pte_chain_free(pte_chain);
return ret;
+oom:
+ page_cache_release(new_page);
+ ret = VM_FAULT_OOM;
+ goto out;
}
/*
--- anobjrmap2/mm/mremap.c Thu Mar 20 17:09:50 2003
+++ anobjrmap3/mm/mremap.c Thu Mar 20 17:10:12 2003
@@ -83,30 +83,25 @@
}
static int
-copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst,
- struct pte_chain **pte_chainp)
+copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst)
{
- int error = 0;
pte_t pte;
struct page *page = NULL;
- if (pte_present(*src))
- page = pte_page(*src);
-
if (!pte_none(*src)) {
- if (page)
- page_remove_rmap(page, src);
+ if (!dst)
+ return -1;
+ if (pte_present(*src))
+ page = pte_page(*src);
pte = ptep_get_and_clear(src);
- if (!dst) {
- /* No dest? We must put it back. */
- dst = src;
- error++;
- }
set_pte(dst, pte);
- if (page)
- *pte_chainp = page_add_rmap(page, dst, *pte_chainp);
+ if (page) {
+ int anon = PageAnon(page);
+ page_remove_rmap(page);
+ page_add_rmap(page, anon);
+ }
}
- return error;
+ return 0;
}
static int
@@ -116,13 +111,7 @@
struct mm_struct *mm = vma->vm_mm;
int error = 0;
pte_t *src, *dst;
- struct pte_chain *pte_chain;
- pte_chain = pte_chain_alloc(GFP_KERNEL);
- if (!pte_chain) {
- error = -ENOMEM;
- goto out;
- }
spin_lock(&mm->page_table_lock);
src = get_one_pte_map_nested(mm, old_addr);
if (src) {
@@ -138,14 +127,12 @@
dst = alloc_one_pte_map(mm, new_addr);
if (src == NULL)
src = get_one_pte_map_nested(mm, old_addr);
- error = copy_one_pte(mm, src, dst, &pte_chain);
+ error = copy_one_pte(mm, src, dst);
pte_unmap_nested(src);
pte_unmap(dst);
}
flush_tlb_page(vma, old_addr);
spin_unlock(&mm->page_table_lock);
- pte_chain_free(pte_chain);
-out:
return error;
}
--- anobjrmap2/mm/nommu.c Wed Mar 19 11:05:16 2003
+++ anobjrmap3/mm/nommu.c Thu Mar 20 17:10:12 2003
@@ -567,7 +567,3 @@
{
return -ENOMEM;
}
-
-void pte_chain_init(void)
-{
-}
--- anobjrmap2/mm/page_alloc.c Thu Mar 20 17:10:01 2003
+++ anobjrmap3/mm/page_alloc.c Thu Mar 20 17:10:12 2003
@@ -80,8 +80,7 @@
1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
- 1 << PG_chainlock |
- 1 << PG_direct |
+ 1 << PG_rmaplock |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback);
@@ -220,8 +219,7 @@
1 << PG_locked |
1 << PG_active |
1 << PG_reclaim |
- 1 << PG_chainlock |
- 1 << PG_direct |
+ 1 << PG_rmaplock |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback )))
@@ -328,8 +326,7 @@
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
- 1 << PG_chainlock |
- 1 << PG_direct |
+ 1 << PG_rmaplock |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback )))
@@ -1454,7 +1451,6 @@
"nr_pagecache",
"nr_swapcache",
"nr_page_table_pages",
- "nr_reverse_maps",
"nr_mapped",
"nr_slab",
--- anobjrmap2/mm/rmap.c Thu Mar 20 17:10:01 2003
+++ anobjrmap3/mm/rmap.c Thu Mar 20 17:10:12 2003
@@ -4,37 +4,30 @@
* Copyright 2001, Rik van Riel <[email protected]>
* Released under the General Public License (GPL).
*
- *
- * Simple, low overhead pte-based reverse mapping scheme.
- * This is kept modular because we may want to experiment
- * with object-based reverse mapping schemes. Please try
- * to keep this thing as modular as possible.
+ * Simple, low overhead reverse mapping scheme.
+ * Please try to keep this thing as modular as possible.
*/
/*
* Locking:
- * - the page->pte.chain is protected by the PG_chainlock bit,
- * which nests within the zone->lru_lock, then the
- * mm->page_table_lock, and then the page lock.
+ * - the page->rmap field is protected by the PG_rmaplock bit,
+ * which nests within the mm->page_table_lock,
+ * which nests within the page lock.
* - because swapout locking is opposite to the locking order
* in the page fault path, the swapout path uses trylocks
* on the mm->page_table_lock
*/
+
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
-#include <linux/cache.h>
#include <linux/percpu.h>
-
-#include <asm/pgalloc.h>
-#include <asm/rmap.h>
-#include <asm/tlb.h>
#include <asm/tlbflush.h>
-/* #define DEBUG_RMAP */
+#define page_mapcount(page) ((page)->rmap_count)
/*
* Something oopsable to put for now in the page->mapping
@@ -50,61 +43,13 @@
ClearPageAnon(page);
}
-/*
- * Shared pages have a chain of pte_chain structures, used to locate
- * all the mappings to this page. We only need a pointer to the pte
- * here, the page struct for the page table page contains the process
- * it belongs to and the offset within that process.
- *
- * We use an array of pte pointers in this structure to minimise cache misses
- * while traversing reverse maps.
- */
-#define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t))
-
-struct pte_chain {
- struct pte_chain *next;
- pte_addr_t ptes[NRPTE];
-} ____cacheline_aligned;
-
-kmem_cache_t *pte_chain_cache;
-
-/*
- * pte_chain list management policy:
- *
- * - If a page has a pte_chain list then it is shared by at least two processes,
- * because a single sharing uses PageDirect. (Well, this isn't true yet,
- * coz this code doesn't collapse singletons back to PageDirect on the remove
- * path).
- * - A pte_chain list has free space only in the head member - all succeeding
- * members are 100% full.
- * - If the head element has free space, it occurs in its leading slots.
- * - All free space in the pte_chain is at the start of the head member.
- * - Insertion into the pte_chain puts a pte pointer in the last free slot of
- * the head member.
- * - Removal from a pte chain moves the head pte of the head member onto the
- * victim pte and frees the head member if it became empty.
- */
-
/**
- ** VM stuff below this comment
+ ** Subfunctions of page_referenced: page_referenced_one called
+ ** repeatedly from page_referenced_obj.
**/
-/**
- * find_pte - Find a pte pointer given a vma and a struct page.
- * @vma: the vma to search
- * @page: the page to find
- *
- * Determine if this page is mapped in this vma. If it is, map and rethrn
- * the pte pointer associated with it. Return null if the page is not
- * mapped in this vma for any reason.
- *
- * This is strictly an internal helper function for the object-based rmap
- * functions.
- *
- * It is the caller's responsibility to unmap the pte if it is returned.
- */
-static inline pte_t *
-find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr)
+static int
+page_referenced_one(struct page *page, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -112,6 +57,7 @@
pte_t *pte;
unsigned long loffset;
unsigned long address;
+ int referenced = 0;
loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
if (loffset < vma->vm_pgoff)
@@ -122,13 +68,18 @@
if (address >= vma->vm_end)
goto out;
+ if (!spin_trylock(&mm->page_table_lock)) {
+ referenced = 1;
+ goto out;
+ }
+
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
- goto out;
+ goto out_unlock;
pmd = pmd_offset(pgd, address);
if (!pmd_present(*pmd))
- goto out;
+ goto out_unlock;
pte = pte_offset_map(pmd, address);
if (!pte_present(*pte))
@@ -137,84 +88,36 @@
if (page_to_pfn(page) != pte_pfn(*pte))
goto out_unmap;
- if (addr)
- *addr = address;
-
- return pte;
+ if (ptep_test_and_clear_young(pte))
+ referenced++;
out_unmap:
pte_unmap(pte);
-out:
- return NULL;
-}
-
-/**
- * page_referenced_obj_one - referenced check for object-based rmap
- * @vma: the vma to look in.
- * @page: the page we're working on.
- *
- * Find a pte entry for a page/vma pair, then check and clear the referenced
- * bit.
- *
- * This is strictly a helper function for page_referenced_obj.
- */
-static int
-page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
-{
- pte_t *pte;
- int referenced = 0;
- pte = find_pte(vma, page, NULL);
- if (pte) {
- if (ptep_test_and_clear_young(pte))
- referenced++;
- pte_unmap(pte);
- }
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+out:
return referenced;
}
-/**
- * page_referenced_obj_one - referenced check for object-based rmap
- * @page: the page we're checking references on.
- *
- * For an object-based mapped page, find all the places it is mapped and
- * check/clear the referenced flag. This is done by following the page->mapping
- * pointer, then walking the chain of vmas it holds. It returns the number
- * of references it found.
- *
- * This function is only called from page_referenced for object-based pages.
- *
- * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * assume a reference count of 1.
- */
-static int
+static inline int
page_referenced_obj(struct page *page)
{
struct address_space *mapping = page->mapping;
struct vm_area_struct *vma;
int referenced = 0;
- if (!page->pte.mapcount)
- return 0;
-
- if (!mapping)
- return 0;
-
- if (PageSwapCache(page))
- BUG();
-
if (down_trylock(&mapping->i_shared_sem))
return 1;
-
+
list_for_each_entry(vma, &mapping->i_mmap, shared)
- referenced += page_referenced_obj_one(vma, page);
+ referenced += page_referenced_one(page, vma);
list_for_each_entry(vma, &mapping->i_mmap_shared, shared)
- referenced += page_referenced_obj_one(vma, page);
+ referenced += page_referenced_one(page, vma);
up(&mapping->i_shared_sem);
-
return referenced;
}
@@ -223,423 +126,169 @@
* @page: the page to test
*
* Quick test_and_clear_referenced for all mappings to a page,
- * returns the number of processes which referenced the page.
- * Caller needs to hold the pte_chain_lock.
- *
- * If the page has a single-entry pte_chain, collapse that back to a PageDirect
- * representation. This way, it's only done under memory pressure.
+ * returns the number of ptes which referenced the page.
+ * Caller needs to hold the rmap_lock.
*/
-int page_referenced(struct page * page)
+int
+page_referenced(struct page *page)
{
- struct pte_chain * pc;
- int referenced = 0;
+ int referenced;
- if (TestClearPageReferenced(page))
- referenced++;
-
- if (!PageAnon(page)) {
+ referenced = !!TestClearPageReferenced(page);
+ if (page_mapcount(page) && page->mapping && !PageAnon(page))
referenced += page_referenced_obj(page);
- goto out;
- }
- if (PageDirect(page)) {
- pte_t *pte = rmap_ptep_map(page->pte.direct);
- if (ptep_test_and_clear_young(pte))
- referenced++;
- rmap_ptep_unmap(pte);
- } else {
- int nr_chains = 0;
-
- /* Check all the page tables mapping this page. */
- for (pc = page->pte.chain; pc; pc = pc->next) {
- int i;
-
- for (i = NRPTE-1; i >= 0; i--) {
- pte_addr_t pte_paddr = pc->ptes[i];
- pte_t *p;
-
- if (!pte_paddr)
- break;
- p = rmap_ptep_map(pte_paddr);
- if (ptep_test_and_clear_young(p))
- referenced++;
- rmap_ptep_unmap(p);
- nr_chains++;
- }
- }
- if (nr_chains == 1) {
- pc = page->pte.chain;
- page->pte.direct = pc->ptes[NRPTE-1];
- SetPageDirect(page);
- pc->ptes[NRPTE-1] = 0;
- __pte_chain_free(pc);
- }
- }
-out:
return referenced;
}
/**
* page_add_rmap - add reverse mapping entry to a page
- * @page: the page to add the mapping to
- * @ptep: the page table entry mapping this page
+ * @page: the page to add the mapping to
+ * @anon: is this an anonymous (not file-backed) page?
*
- * Add a new pte reverse mapping to a page.
+ * For general use: Add a new reverse mapping to a page.
* The caller needs to hold the mm->page_table_lock.
*/
-struct pte_chain *
-page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
+void
+page_add_rmap(struct page *page, int anon)
{
- pte_addr_t pte_paddr = ptep_to_paddr(ptep);
- struct pte_chain *cur_pte_chain;
- int i;
-
-#ifdef DEBUG_RMAP
- if (!page || !ptep)
- BUG();
- if (!pte_present(*ptep))
- BUG();
- if (!ptep_to_mm(ptep))
- BUG();
-#endif
-
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
- return pte_chain;
+ return;
- pte_chain_lock(page);
+ rmap_lock(page);
- /*
- * If this is an object-based page, just count it. We can
- * find the mappings by walking the object vma chain for that object.
- */
- if (!PageAnon(page)) {
- if (PageSwapCache(page))
- BUG();
- if (!page->pte.mapcount)
- inc_page_state(nr_mapped);
- page->pte.mapcount++;
- pte_chain_unlock(page);
- return pte_chain;
- }
+ if (!page_mapped(page))
+ inc_page_state(nr_mapped);
+ page_mapcount(page)++;
-#ifdef DEBUG_RMAP
- /*
- * This stuff needs help to get up to highmem speed.
- */
- {
- struct pte_chain * pc;
- if (PageDirect(page)) {
- if (page->pte.direct == pte_paddr)
- BUG();
+ if (page->mapping) {
+ if (anon) {
+ BUG_ON(!PageAnon(page));
} else {
- for (pc = page->pte.chain; pc; pc = pc->next) {
- for (i = 0; i < NRPTE; i++) {
- pte_addr_t p = pc->ptes[i];
-
- if (p && p == pte_paddr)
- BUG();
- }
- }
+ BUG_ON(PageAnon(page));
+ }
+ } else {
+ if (anon) {
+ SetPageAnon(page);
+ page->mapping = ANON_MAPPING_DEBUG;
+ } else {
+ /*
+ * Driver did not assign page->mapping,
+ * nor did it set PageReserved. That's
+ * okay, it's as if vma were VM_LOCKED.
+ */
}
- }
-#endif
-
- page->mapping = ANON_MAPPING_DEBUG;
-
- if (page->pte.direct == 0) {
- page->pte.direct = pte_paddr;
- SetPageDirect(page);
- inc_page_state(nr_mapped);
- goto out;
- }
-
- if (PageDirect(page)) {
- /* Convert a direct pointer into a pte_chain */
- ClearPageDirect(page);
- pte_chain->ptes[NRPTE-1] = page->pte.direct;
- pte_chain->ptes[NRPTE-2] = pte_paddr;
- page->pte.direct = 0;
- page->pte.chain = pte_chain;
- pte_chain = NULL; /* We consumed it */
- goto out;
- }
-
- cur_pte_chain = page->pte.chain;
- if (cur_pte_chain->ptes[0]) { /* It's full */
- pte_chain->next = cur_pte_chain;
- page->pte.chain = pte_chain;
- pte_chain->ptes[NRPTE-1] = pte_paddr;
- pte_chain = NULL; /* We consumed it */
- goto out;
}
- BUG_ON(!cur_pte_chain->ptes[NRPTE-1]);
+ rmap_unlock(page);
+}
- for (i = NRPTE-2; i >= 0; i--) {
- if (!cur_pte_chain->ptes[i]) {
- cur_pte_chain->ptes[i] = pte_paddr;
- goto out;
- }
- }
- BUG();
-out:
- pte_chain_unlock(page);
- inc_page_state(nr_reverse_maps);
- return pte_chain;
+/**
+ * page_dup_rmap - duplicate reverse mapping entry to a page
+ * @page: the page to add the mapping to
+ *
+ * For copy_page_range only: minimal extract from page_add_rmap,
+ * avoiding unnecessary tests (already checked) so it's quicker.
+ */
+void
+page_dup_rmap(struct page *page)
+{
+ rmap_lock(page);
+ page_mapcount(page)++;
+ rmap_unlock(page);
}
/**
* page_remove_rmap - take down reverse mapping to a page
* @page: page to remove mapping from
- * @ptep: page table entry to remove
*
- * Removes the reverse mapping from the pte_chain of the page,
+ * For general use: Remove the reverse mapping from the page,
* after that the caller can clear the page table entry and free
- * the page.
- * Caller needs to hold the mm->page_table_lock.
+ * the page. Caller needs to hold the mm->page_table_lock.
*/
-void page_remove_rmap(struct page * page, pte_t * ptep)
+void
+page_remove_rmap(struct page *page)
{
- pte_addr_t pte_paddr = ptep_to_paddr(ptep);
- struct pte_chain *pc;
-
- if (!page || !ptep)
- BUG();
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return;
- pte_chain_lock(page);
-
- /*
- * If this is an object-based page, just uncount it. We can
- * find the mappings by walking the object vma chain for that object.
- */
- if (!PageAnon(page)) {
- if (PageSwapCache(page))
- BUG();
- if (!page->pte.mapcount)
- BUG();
- page->pte.mapcount--;
- if (!page->pte.mapcount)
- dec_page_state(nr_mapped);
- pte_chain_unlock(page);
- return;
- }
-
- if (PageDirect(page)) {
- if (page->pte.direct == pte_paddr) {
- page->pte.direct = 0;
- dec_page_state(nr_reverse_maps);
- ClearPageDirect(page);
- goto out;
- }
- } else {
- struct pte_chain *start = page->pte.chain;
- int victim_i = -1;
-
- for (pc = start; pc; pc = pc->next) {
- int i;
+ rmap_lock(page);
- if (pc->next)
- prefetch(pc->next);
- for (i = 0; i < NRPTE; i++) {
- pte_addr_t pa = pc->ptes[i];
-
- if (!pa)
- continue;
- if (victim_i == -1)
- victim_i = i;
- if (pa != pte_paddr)
- continue;
- pc->ptes[i] = start->ptes[victim_i];
- dec_page_state(nr_reverse_maps);
- start->ptes[victim_i] = 0;
- if (victim_i == NRPTE-1) {
- /* Emptied a pte_chain */
- page->pte.chain = start->next;
- __pte_chain_free(start);
- } else {
- /* Do singleton->PageDirect here */
- }
- goto out;
- }
- }
- }
-#ifdef DEBUG_RMAP
- /* Not found. This should NEVER happen! */
- printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep);
- printk(KERN_ERR "page_remove_rmap: only found: ");
- if (PageDirect(page)) {
- printk("%llx", (u64)page->pte.direct);
- } else {
- for (pc = page->pte.chain; pc; pc = pc->next) {
- int i;
- for (i = 0; i < NRPTE; i++)
- printk(" %d:%llx", i, (u64)pc->ptes[i]);
- }
- }
- printk("\n");
- printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n");
-#endif
+ BUG_ON(!page_mapcount(page));
+ page_mapcount(page)--;
-out:
if (!page_mapped(page)) {
dec_page_state(nr_mapped);
- clear_page_anon(page);
+ if (PageAnon(page))
+ clear_page_anon(page);
}
- pte_chain_unlock(page);
+
+ rmap_unlock(page);
}
/**
- * try_to_unmap_obj - unmap a page using the object-based rmap method
- * @page: the page to unmap
- *
- * Determine whether a page is mapped in a given vma and unmap it if it's found.
- *
- * This function is strictly a helper function for try_to_unmap_obj.
- */
-static inline int
-try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page)
+ ** Subfunctions of try_to_unmap: try_to_unmap_one called
+ ** repeatedly from try_to_unmap_obj.
+ **/
+
+static int
+try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long address;
+ pgd_t *pgd;
+ pmd_t *pmd;
pte_t *pte;
pte_t pteval;
- int ret = SWAP_SUCCESS;
+ unsigned long loffset;
+ unsigned long address;
+ int ret = SWAP_AGAIN;
- pte = find_pte(vma, page, &address);
- if (!pte)
+ loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+ if (loffset < vma->vm_pgoff)
goto out;
- if (vma->vm_flags & VM_LOCKED) {
- ret = SWAP_FAIL;
- goto out_unmap;
- }
-
- flush_cache_page(vma, address);
- pteval = ptep_get_and_clear(pte);
- flush_tlb_page(vma, address);
-
- if (pte_dirty(pteval))
- set_page_dirty(page);
-
- if (!page->pte.mapcount)
- BUG();
-
- mm->rss--;
- page->pte.mapcount--;
- page_cache_release(page);
-
-out_unmap:
- pte_unmap(pte);
-
-out:
- return ret;
-}
-
-/**
- * try_to_unmap_obj - unmap a page using the object-based rmap method
- * @page: the page to unmap
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
- *
- * This function is only called from try_to_unmap for object-based pages.
- *
- * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * return a temporary error.
- */
-static int
-try_to_unmap_obj(struct page *page)
-{
- struct address_space *mapping = page->mapping;
- struct vm_area_struct *vma;
- int ret = SWAP_SUCCESS;
-
- if (!mapping)
- BUG();
-
- if (PageSwapCache(page))
- BUG();
-
- if (down_trylock(&mapping->i_shared_sem))
- return SWAP_AGAIN;
-
- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- ret = try_to_unmap_obj_one(vma, page);
- if (ret != SWAP_SUCCESS)
- goto out;
- }
-
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- ret = try_to_unmap_obj_one(vma, page);
- if (ret != SWAP_SUCCESS)
- goto out;
- }
-
- /* We lose track of sys_remap_file pages (for now) */
- if (page->pte.mapcount)
- ret = SWAP_FAIL;
-
-out:
- up(&mapping->i_shared_sem);
- return ret;
-}
-
-/**
- * try_to_unmap_one - worker function for try_to_unmap
- * @page: page to unmap
- * @ptep: page table entry to unmap from page
- *
- * Internal helper function for try_to_unmap, called for each page
- * table entry mapping a page. Because locking order here is opposite
- * to the locking order used by the page fault path, we use trylocks.
- * Locking:
- * zone->lru_lock page_launder()
- * page lock page_launder(), trylock
- * pte_chain_lock page_launder()
- * mm->page_table_lock try_to_unmap_one(), trylock
- */
-static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
-static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
-{
- pte_t *ptep = rmap_ptep_map(paddr);
- unsigned long address = ptep_to_address(ptep);
- struct mm_struct * mm = ptep_to_mm(ptep);
- struct vm_area_struct * vma;
- pte_t pte;
- int ret;
+ address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
- if (!mm)
- BUG();
+ if (address >= vma->vm_end)
+ goto out;
/*
* We need the page_table_lock to protect us from page faults,
* munmap, fork, etc...
*/
- if (!spin_trylock(&mm->page_table_lock)) {
- rmap_ptep_unmap(ptep);
- return SWAP_AGAIN;
- }
+ if (!spin_trylock(&mm->page_table_lock))
+ goto out;
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out_unlock;
- /* During mremap, it's possible pages are not in a VMA. */
- vma = find_vma(mm, address);
- if (!vma) {
- ret = SWAP_FAIL;
+ pmd = pmd_offset(pgd, address);
+ if (!pmd_present(*pmd))
goto out_unlock;
- }
- /* The page is mlock()d, we cannot swap it out. */
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte))
+ goto out_unmap;
+
+ if (page_to_pfn(page) != pte_pfn(*pte))
+ goto out_unmap;
+
+ /* If the page is mlock()d, we cannot swap it out. */
if (vma->vm_flags & VM_LOCKED) {
- ret = SWAP_FAIL;
- goto out_unlock;
+ ret = SWAP_FAIL;
+ goto out_unmap;
}
/* Nuke the page table entry. */
flush_cache_page(vma, address);
- pte = ptep_get_and_clear(ptep);
+ pteval = ptep_get_and_clear(pte);
flush_tlb_page(vma, address);
+ /*
+ * This block makes no sense in this subpatch: neither anon
+ * pages nor nonlinear pages get here. But we want to hold on
+ * to this code, to use in later patches which correct that.
+ */
if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private };
/*
@@ -648,34 +297,66 @@
*/
BUG_ON(!PageSwapCache(page));
swap_duplicate(entry);
- set_pte(ptep, swp_entry_to_pte(entry));
- BUG_ON(pte_file(*ptep));
+ set_pte(pte, swp_entry_to_pte(entry));
+ BUG_ON(pte_file(*pte));
} else {
unsigned long pgidx;
/*
- * If a nonlinear mapping then store the file page offset
- * in the pte.
+ * If a nonlinear mapping from sys_remap_file_pages,
+ * then store the file page offset in the pte.
*/
pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
pgidx += vma->vm_pgoff;
pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
- if (1 || page->index != pgidx) {
- set_pte(ptep, pgoff_to_pte(page->index));
- BUG_ON(!pte_file(*ptep));
+ if (page->index != pgidx) {
+ set_pte(pte, pgoff_to_pte(page->index));
+ BUG_ON(!pte_file(*pte));
}
}
/* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pte))
+ if (pte_dirty(pteval))
set_page_dirty(page);
mm->rss--;
+ BUG_ON(!page_mapcount(page));
+ page_mapcount(page)--;
page_cache_release(page);
- ret = SWAP_SUCCESS;
+
+out_unmap:
+ pte_unmap(pte);
out_unlock:
- rmap_ptep_unmap(ptep);
spin_unlock(&mm->page_table_lock);
+
+out:
+ return ret;
+}
+
+static inline int
+try_to_unmap_obj(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct vm_area_struct *vma;
+ int ret = SWAP_AGAIN;
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return ret;
+
+ list_for_each_entry(vma, &mapping->i_mmap, shared) {
+ ret = try_to_unmap_one(page, vma);
+ if (ret == SWAP_FAIL)
+ goto out;
+ }
+
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ ret = try_to_unmap_one(page, vma);
+ if (ret == SWAP_FAIL)
+ goto out;
+ }
+
+out:
+ up(&mapping->i_shared_sem);
return ret;
}
@@ -684,173 +365,30 @@
* @page: the page to get unmapped
*
* Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path. Caller must hold zone->lru_lock
- * and the page lock. Return values are:
+ * page, used in the pageout path. Caller must hold the page lock
+ * and its rmap_lock. Return values are:
*
* SWAP_SUCCESS - we succeeded in removing all mappings
* SWAP_AGAIN - we missed a trylock, try again later
* SWAP_FAIL - the page is unswappable
*/
-int try_to_unmap(struct page * page)
+int
+try_to_unmap(struct page *page)
{
- struct pte_chain *pc, *next_pc, *start;
- int ret = SWAP_SUCCESS;
- int victim_i = -1;
-
- /* This page should not be on the pageout lists. */
- if (PageReserved(page))
- BUG();
- if (!PageLocked(page))
- BUG();
- /* We need backing store to swap out a page. */
- if (!page_mapping(page) && !PageSwapCache(page))
- BUG();
+ int ret = SWAP_FAIL;
- /*
- * If it's an object-based page, use the object vma chain to find all
- * the mappings.
- */
- if (!PageAnon(page)) {
- ret = try_to_unmap_obj(page);
- goto out;
- }
+ BUG_ON(PageReserved(page));
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!page_mapped(page));
- if (PageDirect(page)) {
- ret = try_to_unmap_one(page, page->pte.direct);
- if (ret == SWAP_SUCCESS) {
- page->pte.direct = 0;
- dec_page_state(nr_reverse_maps);
- ClearPageDirect(page);
- }
- goto out;
- }
+ if (!PageAnon(page))
+ ret = try_to_unmap_obj(page);
- start = page->pte.chain;
- for (pc = start; pc; pc = next_pc) {
- int i;
-
- next_pc = pc->next;
- if (next_pc)
- prefetch(next_pc);
- for (i = 0; i < NRPTE; i++) {
- pte_addr_t pte_paddr = pc->ptes[i];
-
- if (!pte_paddr)
- continue;
- if (victim_i == -1)
- victim_i = i;
-
- switch (try_to_unmap_one(page, pte_paddr)) {
- case SWAP_SUCCESS:
- /*
- * Release a slot. If we're releasing the
- * first pte in the first pte_chain then
- * pc->ptes[i] and start->ptes[victim_i] both
- * refer to the same thing. It works out.
- */
- pc->ptes[i] = start->ptes[victim_i];
- start->ptes[victim_i] = 0;
- dec_page_state(nr_reverse_maps);
- victim_i++;
- if (victim_i == NRPTE) {
- page->pte.chain = start->next;
- __pte_chain_free(start);
- start = page->pte.chain;
- victim_i = 0;
- }
- break;
- case SWAP_AGAIN:
- /* Skip this pte, remembering status. */
- ret = SWAP_AGAIN;
- continue;
- case SWAP_FAIL:
- ret = SWAP_FAIL;
- goto out;
- }
- }
- }
-out:
if (!page_mapped(page)) {
dec_page_state(nr_mapped);
if (PageAnon(page))
clear_page_anon(page);
+ ret = SWAP_SUCCESS;
}
return ret;
}
-
-/**
- ** No more VM stuff below this comment, only pte_chain helper
- ** functions.
- **/
-
-static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
-{
- struct pte_chain *pc = p;
-
- memset(pc, 0, sizeof(*pc));
-}
-
-DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0;
-
-/**
- * __pte_chain_free - free pte_chain structure
- * @pte_chain: pte_chain struct to free
- */
-void __pte_chain_free(struct pte_chain *pte_chain)
-{
- int cpu = get_cpu();
- struct pte_chain **pte_chainp;
-
- if (pte_chain->next)
- pte_chain->next = NULL;
- pte_chainp = &per_cpu(local_pte_chain, cpu);
- if (*pte_chainp)
- kmem_cache_free(pte_chain_cache, *pte_chainp);
- *pte_chainp = pte_chain;
- put_cpu();
-}
-
-/*
- * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
- *
- * The caller of page_add_rmap() must perform the allocation because
- * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap()
- * will not actually use the pte_chain, because there is space available in one
- * of the existing pte_chains which are attached to the page. So the case of
- * allocating and then freeing a single pte_chain is specially optimised here,
- * with a one-deep per-cpu cache.
- */
-struct pte_chain *pte_chain_alloc(int gfp_flags)
-{
- int cpu;
- struct pte_chain *ret;
- struct pte_chain **pte_chainp;
-
- if (gfp_flags & __GFP_WAIT)
- might_sleep();
-
- cpu = get_cpu();
- pte_chainp = &per_cpu(local_pte_chain, cpu);
- if (*pte_chainp) {
- ret = *pte_chainp;
- *pte_chainp = NULL;
- put_cpu();
- } else {
- put_cpu();
- ret = kmem_cache_alloc(pte_chain_cache, gfp_flags);
- }
- return ret;
-}
-
-void __init pte_chain_init(void)
-{
- pte_chain_cache = kmem_cache_create( "pte_chain",
- sizeof(struct pte_chain),
- 0,
- 0,
- pte_chain_ctor,
- NULL);
-
- if (!pte_chain_cache)
- panic("failed to create pte_chain cache!\n");
-}
--- anobjrmap2/mm/swapfile.c Thu Mar 20 17:10:02 2003
+++ anobjrmap3/mm/swapfile.c Thu Mar 20 17:10:12 2003
@@ -386,7 +386,7 @@
/* mmlist_lock and vma->vm_mm->page_table_lock are held */
static void
unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
- swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+ swp_entry_t entry, struct page *page)
{
pte_t pte = *dir;
@@ -398,8 +398,7 @@
return;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
- SetPageAnon(page);
- *pte_chainp = page_add_rmap(page, dir, *pte_chainp);
+ page_add_rmap(page, 1);
swap_free(entry);
++vma->vm_mm->rss;
}
@@ -411,7 +410,6 @@
{
pte_t * pte;
unsigned long end;
- struct pte_chain *pte_chain = NULL;
if (pmd_none(*dir))
return;
@@ -427,18 +425,12 @@
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- /*
- * FIXME: handle pte_chain_alloc() failures
- */
- if (pte_chain == NULL)
- pte_chain = pte_chain_alloc(GFP_ATOMIC);
unuse_pte(vma, offset+address-vma->vm_start,
- pte, entry, page, &pte_chain);
+ pte, entry, page);
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
pte_unmap(pte - 1);
- pte_chain_free(pte_chain);
}
/* mmlist_lock and vma->vm_mm->page_table_lock are held */
--- anobjrmap2/mm/vmscan.c Thu Mar 20 17:10:02 2003
+++ anobjrmap3/mm/vmscan.c Thu Mar 20 17:10:12 2003
@@ -173,7 +173,7 @@
return 0;
}
-/* Must be called with page's pte_chain_lock held. */
+/* Must be called with page's rmap_lock held. */
static inline int page_mapping_inuse(struct page *page)
{
struct address_space *mapping;
@@ -254,10 +254,10 @@
if (PageWriteback(page))
goto keep_locked;
- pte_chain_lock(page);
+ rmap_lock(page);
if (page_referenced(page) && page_mapping_inuse(page)) {
/* In active use or really unfreeable. Activate it. */
- pte_chain_unlock(page);
+ rmap_unlock(page);
goto activate_locked;
}
@@ -273,10 +273,10 @@
if (PageSwapCache(page))
mapping = &swapper_space;
else if (PageAnon(page)) {
- pte_chain_unlock(page);
+ rmap_unlock(page);
if (!add_to_swap(page))
goto activate_locked;
- pte_chain_lock(page);
+ rmap_lock(page);
mapping = &swapper_space;
}
#endif /* CONFIG_SWAP */
@@ -288,16 +288,16 @@
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page)) {
case SWAP_FAIL:
- pte_chain_unlock(page);
+ rmap_unlock(page);
goto activate_locked;
case SWAP_AGAIN:
- pte_chain_unlock(page);
+ rmap_unlock(page);
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
- pte_chain_unlock(page);
+ rmap_unlock(page);
/*
* If the page is dirty, only perform writeback if that write
@@ -629,13 +629,13 @@
page = list_entry(l_hold.prev, struct page, lru);
list_del(&page->lru);
if (page_mapped(page)) {
- pte_chain_lock(page);
+ rmap_lock(page);
if (page_mapped(page) && page_referenced(page)) {
- pte_chain_unlock(page);
+ rmap_unlock(page);
list_add(&page->lru, &l_active);
continue;
}
- pte_chain_unlock(page);
+ rmap_unlock(page);
if (!reclaim_mapped) {
list_add(&page->lru, &l_active);
continue;
anobjrmap 5/6 virtual address chains for odd cases
Two exceptions remain to be handled: the nonlinear file-backed
pages (mapped into a vma at a different location than implied
by page->index and vm_pgoff) from Ingo's sys_remap_file_pages,
and anonymous pages sys_mremap'ed to a different location while
shared (copy-on-write) with another mm.
Bring back chains to handle these, but listing user virtual
addresses, not pte addresses: user virtual addresses are invariant
across fork, so fork just bumps the count, with no need to
allocate any memory - see the sketch just below.
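A sketch of that fork path (illustrative only, assuming
page_dup_rmap keeps the shape it had in the previous patch): the
chain lists user virtual addresses, and parent and child map the
page at the same user virtual address, so the existing slots
already describe the child's mappings too, and copy_page_range
need only bump the count:

	void page_dup_rmap(struct page *page)
	{
		rmap_lock(page);
		page_mapcount(page)++;	/* rmap.chain->slot[0] if
					   PageChained, else rmap.count */
		rmap_unlock(page);
	}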
And since copy_page_range won't need to allocate such buffers,
we can do a much simpler implementation than before: "rmap_get_cpu"
called just before taking page_table_lock allocates one rmap_chain
for the cpu (if not already there) in case subsequent page_add_rmap
might need it. These chains are too rare to justify a kmem_cache:
just kmalloc them.
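The calling pattern, condensed from the mm/fremap.c, mm/memory.c
and mm/mremap.c changes below (illustrative, error paths trimmed):

	if (!rmap_get_cpu())		/* may kmalloc, may sleep */
		goto oom;
	spin_lock(&mm->page_table_lock);
	put_cpu();			/* preemption now held off by
					   page_table_lock, so the cached
					   rmap_chain stays with this cpu */
	...
	page_add_rmap(page, vma, addr, anon);	/* may consume the
						   cpu's cached chain */
	...
	spin_unlock(&mm->page_table_lock);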
The other awkward case before was swapoff's unuse_pte. But an
anonymous page cannot appear in two places in the same mm (until
Ingo adds sys_remap_anon_pages_in_several_places_at_once), so
only one rmap_chain is needed, as elsewhere - make that explicit
by returning as soon as it is found. And try_to_unuse at last
desists from holding the mmlist_lock across the whole search of
mms: we can't call rmap_get_cpu (which may kmalloc non-atomically)
while holding that lock.
There may well be some better data structure to deal with these
cases, but both are very rare - though the nonlinear presumably
will become more common. Perhaps when we see how it gets used,
someone can propose a better structure; this will do for now.
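For concreteness, a hypothetical sketch of how an addresser might
dole out those user virtual addresses one by one (this is not the
patch's actual next_rmap_address; the assumptions that only the
head chain's slot[0] holds the count and that empty slots are 0
are mine):

	static unsigned long next_address_sketch(struct addresser *a)
	{
		/* caller set a->chain = page->rmap.chain, a->index = 1 */
		unsigned long addr;

		while (a->chain) {
			if (a->index == NRSLOT) {
				a->chain = a->chain->next;
				a->index = 0;
				continue;
			}
			addr = a->chain->slot[a->index++];
			if (addr)
				return addr;
		}
		return NOADDR;
	}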
include/linux/mm.h | 7
include/linux/page-flags.h | 5
include/linux/rmap.h | 4
mm/fremap.c | 4
mm/memory.c | 21 ++
mm/mremap.c | 13 +
mm/page_alloc.c | 3
mm/rmap.c | 351 +++++++++++++++++++++++++++++++++++++--------
mm/swapfile.c | 83 ++++++----
9 files changed, 393 insertions(+), 98 deletions(-)
--- anobjrmap4/include/linux/mm.h Thu Mar 20 17:10:23 2003
+++ anobjrmap5/include/linux/mm.h Thu Mar 20 17:10:34 2003
@@ -165,7 +165,10 @@
unsigned long index; /* Our offset within mapping. */
struct list_head lru; /* Pageout list, eg. active_list;
protected by zone->lru_lock !! */
- unsigned long rmap_count; /* Count mappings in mms */
+ union { /* Depending on PG_chained */
+ unsigned long count; /* Count mappings in mms, or */
+ struct rmap_chain *chain;/* Scattered mappings pointer */
+ } rmap; /* Protected by PG_rmaplock */
unsigned long private; /* mapping-private opaque data */
/*
@@ -364,7 +367,7 @@
* refers to user virtual address space into which the page is mapped.
*/
#define page_mapping(page) (PageAnon(page)? NULL: (page)->mapping)
-#define page_mapped(page) ((page)->rmap_count != 0)
+#define page_mapped(page) ((page)->rmap.count != 0)
/*
* Error return values for the *_nopage functions
--- anobjrmap4/include/linux/page-flags.h Thu Mar 20 17:10:12 2003
+++ anobjrmap5/include/linux/page-flags.h Thu Mar 20 17:10:34 2003
@@ -70,6 +70,7 @@
#define PG_nosave 14 /* Used for system suspend/resume */
#define PG_rmaplock 15 /* Lock bit for reversing to ptes */
+#define PG_chained 16 /* Has rmap chain of scattered maps */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
@@ -239,6 +240,10 @@
#define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags)
#define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags)
+#define PageChained(page) test_bit(PG_chained, &(page)->flags)
+#define SetPageChained(page) set_bit(PG_chained, &(page)->flags)
+#define ClearPageChained(page) clear_bit(PG_chained, &(page)->flags)
+
#define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags)
#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
--- anobjrmap4/include/linux/rmap.h Thu Mar 20 17:10:23 2003
+++ anobjrmap5/include/linux/rmap.h Thu Mar 20 17:10:34 2003
@@ -8,9 +8,13 @@
#include <linux/linkage.h>
#ifdef CONFIG_MMU
+int rmap_get_cpu(void);
+
void page_add_rmap(struct page *, struct vm_area_struct *,
unsigned long addr, int anon);
void page_turn_rmap(struct page *, struct vm_area_struct *);
+void page_move_rmap(struct page *, struct vm_area_struct *,
+ unsigned long oaddr, unsigned long naddr);
void FASTCALL(page_dup_rmap(struct page *));
void FASTCALL(page_remove_rmap(struct page *));
--- anobjrmap4/mm/fremap.c Thu Mar 20 17:10:23 2003
+++ anobjrmap5/mm/fremap.c Thu Mar 20 17:10:34 2003
@@ -58,7 +58,10 @@
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
+ if (!rmap_get_cpu())
+ goto err;
spin_lock(&mm->page_table_lock);
+ put_cpu();
pmd = pmd_alloc(mm, pgd, addr);
if (!pmd)
@@ -83,6 +86,7 @@
err = 0;
err_unlock:
spin_unlock(&mm->page_table_lock);
+err:
return err;
}
--- anobjrmap4/mm/memory.c Thu Mar 20 17:10:23 2003
+++ anobjrmap5/mm/memory.c Thu Mar 20 17:10:34 2003
@@ -1141,19 +1141,23 @@
mark_page_accessed(page);
lock_page(page);
+ if (!rmap_get_cpu()) {
+ ret = VM_FAULT_OOM;
+ goto outrel;
+ }
+ spin_lock(&mm->page_table_lock);
+ put_cpu();
+ page_table = pte_offset_map(pmd, address);
+
/*
* Back out if somebody else faulted in this pte while we
* released the page table lock.
*/
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
if (!pte_same(*page_table, orig_pte)) {
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
- unlock_page(page);
- page_cache_release(page);
ret = VM_FAULT_MINOR;
- goto out;
+ goto outrel;
}
/* The page isn't present yet, go ahead with the fault. */
@@ -1179,6 +1183,10 @@
spin_unlock(&mm->page_table_lock);
out:
return ret;
+outrel:
+ unlock_page(page);
+ page_cache_release(page);
+ goto out;
}
/*
@@ -1291,7 +1299,10 @@
anon = 1;
}
+ if (!rmap_get_cpu())
+ goto oom;
spin_lock(&mm->page_table_lock);
+ put_cpu();
page_table = pte_offset_map(pmd, address);
/*
--- anobjrmap4/mm/mremap.c Thu Mar 20 17:10:23 2003
+++ anobjrmap5/mm/mremap.c Thu Mar 20 17:10:34 2003
@@ -96,11 +96,8 @@
page = pte_page(*src);
pte = ptep_get_and_clear(src);
set_pte(dst, pte);
- if (page) {
- int anon = PageAnon(page);
- page_remove_rmap(page);
- page_add_rmap(page, vma, new_addr, anon);
- }
+ if (page)
+ page_move_rmap(page, vma, old_addr, new_addr);
}
return 0;
}
@@ -113,7 +110,12 @@
int error = 0;
pte_t *src, *dst;
+ if (!rmap_get_cpu()) {
+ error = -ENOMEM;
+ goto out;
+ }
spin_lock(&mm->page_table_lock);
+ put_cpu();
src = get_one_pte_map_nested(mm, old_addr);
if (src) {
/*
@@ -134,6 +136,7 @@
}
flush_tlb_page(vma, old_addr);
spin_unlock(&mm->page_table_lock);
+out:
return error;
}
--- anobjrmap4/mm/page_alloc.c Thu Mar 20 17:10:12 2003
+++ anobjrmap5/mm/page_alloc.c Thu Mar 20 17:10:34 2003
@@ -81,6 +81,7 @@
1 << PG_active |
1 << PG_dirty |
1 << PG_rmaplock |
+ 1 << PG_chained |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback);
@@ -220,6 +221,7 @@
1 << PG_active |
1 << PG_reclaim |
1 << PG_rmaplock |
+ 1 << PG_chained |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback )))
@@ -327,6 +329,7 @@
1 << PG_dirty |
1 << PG_reclaim |
1 << PG_rmaplock |
+ 1 << PG_chained |
1 << PG_anon |
1 << PG_swapcache |
1 << PG_writeback )))
--- anobjrmap4/mm/rmap.c Thu Mar 20 17:10:23 2003
+++ anobjrmap5/mm/rmap.c Thu Mar 20 17:10:34 2003
@@ -27,11 +27,35 @@
#include <linux/percpu.h>
#include <asm/tlbflush.h>
-#define page_mapcount(page) ((page)->rmap_count)
+/*
+ * struct rmap_chain: extension of struct page, to track scattered
+ * mappings originating from sys_mremap of anonymous cow pages, or
+ * sys_remap_file_pages. Each cpu caches one to grab while locked.
+ */
+struct rmap_chain {
+#define NRSLOT 7 /* first contains count, then */
+ unsigned long slot[NRSLOT]; /* user virtual addresses */
+ struct rmap_chain *next;
+};
+static DEFINE_PER_CPU(struct rmap_chain *, rmap_chain) = 0;
+
+#define page_mapcount(page) (unlikely(PageChained(page))? \
+ (page)->rmap.chain->slot[0]: (page)->rmap.count)
#define NOADDR (~0UL) /* impossible user virtual address */
/*
+ * struct addresser: for next_rmap_address to dole out user
+ * addresses one by one to page_referenced or to try_to_unmap.
+ */
+struct addresser {
+ unsigned long address;
+ unsigned long count;
+ struct rmap_chain *chain;
+ int index;
+};
+
+/*
* struct anonmm: to track a bundle of anonymous memory mappings.
*
* Could be embedded in mm_struct, but mm_struct is rather heavyweight,
@@ -64,6 +88,147 @@
}
/**
+ ** Functions for manipulating struct rmap_chain.
+ **/
+
+/*
+ * Boolean rmap_get_cpu ensures that the cpu has an rmap_chain
+ * cached in case it is needed later while lock is held; it is never
+ * needed when page_add_rmap is adding a freshly allocated anon page.
+ * Caller does put_cpu() once page_table_lock prevents preemption.
+ */
+int
+rmap_get_cpu(void)
+{
+ struct rmap_chain **cache;
+
+ might_sleep();
+ cache = &per_cpu(rmap_chain, get_cpu());
+ if (unlikely(!*cache)) {
+ struct rmap_chain *chain;
+
+ put_cpu();
+ chain = kmalloc(sizeof(*chain), GFP_KERNEL);
+ cache = &per_cpu(rmap_chain, get_cpu());
+ if (*cache)
+ kfree(chain);
+ else if (chain)
+ *cache = chain;
+ else {
+ put_cpu();
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static struct rmap_chain *
+get_rmap_chain(void)
+{
+ struct rmap_chain **cache;
+ struct rmap_chain *chain;
+ int i;
+
+ /*
+ * page_table_lock and rmap_lock are held, no need to get_cpu
+ */
+ cache = &per_cpu(rmap_chain, smp_processor_id());
+ chain = *cache;
+ *cache = NULL;
+ BUG_ON(!chain);
+ for (i = 0; i < NRSLOT; i++)
+ chain->slot[i] = NOADDR;
+ chain->next = NULL;
+ return chain;
+}
+
+static void
+add_rmap_address(struct page *page, unsigned long address)
+{
+ struct rmap_chain *chain;
+ int i = 1;
+
+ if (PageChained(page)) {
+ /*
+ * Check for duplicate, and find free slot at end
+ */
+ for (chain = page->rmap.chain; ; chain = chain->next, i = 0) {
+ for (; i < NRSLOT; i++) {
+ if (chain->slot[i] == NOADDR)
+ goto set;
+ if (chain->slot[i] == address)
+ return;
+ }
+ if (!chain->next)
+ chain->next = get_rmap_chain();
+ }
+ } else {
+ SetPageChained(page);
+ chain = get_rmap_chain();
+ chain->slot[0] = page->rmap.count;
+ page->rmap.chain = chain;
+ }
+set:
+ chain->slot[i] = address;
+}
+
+static int
+next_rmap_address(struct page *page,
+ struct vm_area_struct *vma, struct addresser *addresser)
+{
+ if (addresser->index == 0) {
+ /* set chain and index for next call */
+ addresser->chain =
+ PageChained(page)? page->rmap.chain: NULL;
+ addresser->index = 1;
+ if (vma) {
+ addresser->address = vma_address(page, vma);
+ if (addresser->address != NOADDR)
+ return 1;
+ } else {
+ addresser->address = page->index;
+ return 1;
+ }
+ }
+ while (addresser->chain) {
+ if (addresser->index >= NRSLOT)
+ addresser->index = 0;
+ addresser->address =
+ addresser->chain->slot[addresser->index];
+ if (addresser->address == NOADDR)
+ break;
+ addresser->index++;
+ if (addresser->index >= NRSLOT)
+ addresser->chain = addresser->chain->next;
+ if (!vma || addresser->address != vma_address(page, vma))
+ return 1;
+ }
+ return 0;
+}
+
+static void
+clear_page_chained(struct page *page)
+{
+ struct rmap_chain *chain = page->rmap.chain;
+
+ /*
+ * At present this is only called when mapcount goes to 0, which
+ * leaves open the possibility that a page might accumulate a
+ * large chain of stale addresses, slowing page_referenced and
+ * wasting memory on the chain; but normally try_to_unmap_one
+ * will bring the count down to 0 and free them all here.
+ */
+
+ page->rmap.count = chain->slot[0];
+ ClearPageChained(page);
+ do {
+ struct rmap_chain *next = chain->next;
+ kfree(chain);
+ chain = next;
+ } while (chain);
+}
+
+/**
** Functions for creating and destroying struct anonmm.
**/
@@ -181,8 +346,9 @@
static int
page_referenced_one(struct page *page, struct mm_struct *mm,
- unsigned long address, unsigned long *mapcount)
+ struct addresser *addresser)
{
+ unsigned long address = addresser->address;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
@@ -211,7 +377,7 @@
if (ptep_test_and_clear_young(pte))
referenced++;
- (*mapcount)--;
+ addresser->count--;
out_unmap:
pte_unmap(pte);
@@ -224,7 +390,7 @@
}
static inline int
-page_referenced_anon(struct page *page, unsigned long *mapcount)
+page_referenced_anon(struct page *page, struct addresser *addresser)
{
struct anonmm *anonmm = (struct anonmm *) page->mapping;
struct anonmm *anonhd = anonmm->head;
@@ -233,19 +399,25 @@
spin_lock(&anonhd->lock);
if (anonmm->mm && anonmm->mm->rss) {
- referenced += page_referenced_one(
- page, anonmm->mm, page->index, mapcount);
- if (!*mapcount)
- goto out;
+ addresser->index = 0;
+ while (next_rmap_address(page, NULL, addresser)) {
+ referenced += page_referenced_one(
+ page, anonmm->mm, addresser);
+ if (!addresser->count)
+ goto out;
+ }
}
seek_head = &anonmm->list;
list_for_each_entry(anonmm, seek_head, list) {
if (!anonmm->mm || !anonmm->mm->rss)
continue;
- referenced += page_referenced_one(
- page, anonmm->mm, page->index, mapcount);
- if (!*mapcount)
- goto out;
+ addresser->index = 0;
+ while (next_rmap_address(page, NULL, addresser)) {
+ referenced += page_referenced_one(
+ page, anonmm->mm, addresser);
+ if (!addresser->count)
+ goto out;
+ }
}
out:
spin_unlock(&anonhd->lock);
@@ -253,11 +425,10 @@
}
static inline int
-page_referenced_obj(struct page *page, unsigned long *mapcount)
+page_referenced_obj(struct page *page, struct addresser *addresser)
{
struct address_space *mapping = page->mapping;
struct vm_area_struct *vma;
- unsigned long address;
int referenced = 0;
if (down_trylock(&mapping->i_shared_sem))
@@ -266,11 +437,11 @@
list_for_each_entry(vma, &mapping->i_mmap, shared) {
if (!vma->vm_mm->rss)
continue;
- address = vma_address(page, vma);
- if (address != NOADDR) {
+ addresser->index = 0;
+ while (next_rmap_address(page, vma, addresser)) {
referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
+ page, vma->vm_mm, addresser);
+ if (!addresser->count)
goto out;
}
}
@@ -278,11 +449,11 @@
list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
if (!vma->vm_mm->rss)
continue;
- address = vma_address(page, vma);
- if (address != NOADDR) {
+ addresser->index = 0;
+ while (next_rmap_address(page, vma, addresser)) {
referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
+ page, vma->vm_mm, addresser);
+ if (!addresser->count)
goto out;
}
}
@@ -302,15 +473,15 @@
int
page_referenced(struct page *page)
{
- unsigned long mapcount;
+ struct addresser addresser;
int referenced;
referenced = !!TestClearPageReferenced(page);
- mapcount = page_mapcount(page);
- if (mapcount && page->mapping) {
+ addresser.count = page_mapcount(page);
+ if (addresser.count && page->mapping) {
referenced += PageAnon(page)?
- page_referenced_anon(page, &mapcount):
- page_referenced_obj(page, &mapcount);
+ page_referenced_anon(page, &addresser):
+ page_referenced_obj(page, &addresser);
}
return referenced;
}
@@ -343,8 +514,12 @@
if (page->mapping) {
if (anon) {
BUG_ON(!PageAnon(page));
+ if (unlikely(address != page->index))
+ add_rmap_address(page, address);
} else {
BUG_ON(PageAnon(page));
+ if (unlikely(address != vma_address(page, vma)))
+ add_rmap_address(page, address);
}
} else {
if (anon) {
@@ -410,6 +585,50 @@
}
/**
+ * page_move_rmap - move address in reverse mapping entry.
+ * @page: the page originally mapped into some vma
+ * @vma: that old vma into which this page is mapped
+ * @old_address: old virtual address at which page was mapped
+ * @new_address: new virtual address at which page will be mapped
+ *
+ * For sys_mremap's copy_one_pte: move address in reverse mapping.
+ * Cannot use page_remove_rmap followed by page_add_rmap since
+ * the new vma into which to add has not yet been set up.
+ */
+void
+page_move_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long old_address, unsigned long new_address)
+{
+ if (!page_mapped(page) || !page->mapping)
+ return;
+
+ rmap_lock(page);
+
+ if (PageAnon(page)) {
+ /*
+ * We don't check page_mapcount(page) == 1 here
+ * because the mapcount could be 1 yet the page
+ * still have a chain, and our new_address be in
+ * that chain: if the same address goes in twice,
+ * try_to_unmap would give up too early.
+ */
+ if (page->rmap.count == 1)
+ page->index = new_address;
+ else if (new_address != page->index)
+ add_rmap_address(page, new_address);
+ } else {
+ /*
+ * We must chain the new address if the old
+ * address was nonlinear in its original vma.
+ */
+ if (old_address != vma_address(page, vma))
+ add_rmap_address(page, new_address);
+ }
+
+ rmap_unlock(page);
+}
+
+/**
* page_remove_rmap - take down reverse mapping to a page
* @page: page to remove mapping from
*
@@ -420,13 +639,22 @@
void
page_remove_rmap(struct page *page)
{
+#if 0 /* All its callers have already checked these conditions */
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return;
+#endif
rmap_lock(page);
- BUG_ON(!page_mapcount(page));
- page_mapcount(page)--;
+ if (unlikely(PageChained(page))) {
+ BUG_ON(!page->rmap.chain->slot[0]);
+ page->rmap.chain->slot[0]--;
+ if (!page->rmap.chain->slot[0])
+ clear_page_chained(page);
+ } else {
+ BUG_ON(!page->rmap.count);
+ page->rmap.count--;
+ }
if (!page_mapped(page)) {
dec_page_state(nr_mapped);
@@ -444,9 +672,9 @@
static int
try_to_unmap_one(struct page *page, struct mm_struct *mm,
- unsigned long address, unsigned long *mapcount,
- struct vm_area_struct *vma)
+ struct addresser *addresser, struct vm_area_struct *vma)
{
+ unsigned long address = addresser->address;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
@@ -475,7 +703,7 @@
if (page_to_pfn(page) != pte_pfn(*pte))
goto out_unmap;
- (*mapcount)--;
+ addresser->count--;
/*
* If the page is mlock()d, we cannot swap it out.
@@ -505,7 +733,6 @@
BUG_ON(pte_file(*pte));
} else {
/*
- * This only comes into play with the next patch...
* If a nonlinear mapping then store
* the file page offset in the pte.
*/
@@ -520,7 +747,12 @@
set_page_dirty(page);
BUG_ON(!page_mapcount(page));
- page_mapcount(page)--;
+ if (unlikely(PageChained(page))) {
+ page->rmap.chain->slot[0]--;
+ if (!page->rmap.chain->slot[0])
+ clear_page_chained(page);
+ } else
+ page->rmap.count--;
page_cache_release(page);
mm->rss--;
@@ -535,7 +767,7 @@
}
static inline int
-try_to_unmap_anon(struct page *page, unsigned long *mapcount)
+try_to_unmap_anon(struct page *page, struct addresser *addresser)
{
struct anonmm *anonmm = (struct anonmm *) page->mapping;
struct anonmm *anonhd = anonmm->head;
@@ -544,19 +776,25 @@
spin_lock(&anonhd->lock);
if (anonmm->mm && anonmm->mm->rss) {
- ret = try_to_unmap_one(
- page, anonmm->mm, page->index, mapcount, NULL);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ addresser->index = 0;
+ while (next_rmap_address(page, NULL, addresser)) {
+ ret = try_to_unmap_one(
+ page, anonmm->mm, addresser, NULL);
+ if (ret == SWAP_FAIL || !addresser->count)
+ goto out;
+ }
}
seek_head = &anonmm->list;
list_for_each_entry(anonmm, seek_head, list) {
if (!anonmm->mm || !anonmm->mm->rss)
continue;
- ret = try_to_unmap_one(
- page, anonmm->mm, page->index, mapcount, NULL);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ addresser->index = 0;
+ while (next_rmap_address(page, NULL, addresser)) {
+ ret = try_to_unmap_one(
+ page, anonmm->mm, addresser, NULL);
+ if (ret == SWAP_FAIL || !addresser->count)
+ goto out;
+ }
}
out:
spin_unlock(&anonhd->lock);
@@ -564,11 +802,10 @@
}
static inline int
-try_to_unmap_obj(struct page *page, unsigned long *mapcount)
+try_to_unmap_obj(struct page *page, struct addresser *addresser)
{
struct address_space *mapping = page->mapping;
struct vm_area_struct *vma;
- unsigned long address;
int ret = SWAP_AGAIN;
if (down_trylock(&mapping->i_shared_sem))
@@ -577,11 +814,11 @@
list_for_each_entry(vma, &mapping->i_mmap, shared) {
if (!vma->vm_mm->rss)
continue;
- address = vma_address(page, vma);
- if (address != NOADDR) {
+ addresser->index = 0;
+ while (next_rmap_address(page, vma, addresser)) {
ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
+ page, vma->vm_mm, addresser, vma);
+ if (ret == SWAP_FAIL || !addresser->count)
goto out;
}
}
@@ -589,11 +826,11 @@
list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
if (!vma->vm_mm->rss)
continue;
- address = vma_address(page, vma);
- if (address != NOADDR) {
+ addresser->index = 0;
+ while (next_rmap_address(page, vma, addresser)) {
ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
+ page, vma->vm_mm, addresser, vma);
+ if (ret == SWAP_FAIL || !addresser->count)
goto out;
}
}
@@ -618,17 +855,17 @@
int
try_to_unmap(struct page *page)
{
- unsigned long mapcount;
+ struct addresser addresser;
int ret;
BUG_ON(PageReserved(page));
BUG_ON(!PageLocked(page));
BUG_ON(!page_mapped(page));
- mapcount = page_mapcount(page);
+ addresser.count = page_mapcount(page);
ret = PageAnon(page)?
- try_to_unmap_anon(page, &mapcount):
- try_to_unmap_obj(page, &mapcount);
+ try_to_unmap_anon(page, &addresser):
+ try_to_unmap_obj(page, &addresser);
if (!page_mapped(page)) {
dec_page_state(nr_mapped);
--- anobjrmap4/mm/swapfile.c Thu Mar 20 17:10:23 2003
+++ anobjrmap5/mm/swapfile.c Thu Mar 20 17:10:34 2003
@@ -383,28 +383,29 @@
* share this swap entry, so be cautious and let do_wp_page work out
* what to do if a write is requested later.
*/
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void
+/* vma->vm_mm->page_table_lock is held */
+static int
unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
swp_entry_t entry, struct page *page)
{
pte_t pte = *dir;
if (pte_file(pte))
- return;
+ return 0;
if (likely(pte_to_swp_entry(pte).val != entry.val))
- return;
+ return 0;
if (unlikely(pte_none(pte) || pte_present(pte)))
- return;
+ return 0;
vma->vm_mm->rss++;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
page_add_rmap(page, vma, address, 1);
swap_free(entry);
+ return 1;
}
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
unsigned long address, unsigned long size, unsigned long offset,
swp_entry_t entry, struct page* page)
{
@@ -412,11 +413,11 @@
unsigned long end;
if (pmd_none(*dir))
- return;
+ return 0;
if (pmd_bad(*dir)) {
pmd_ERROR(*dir);
pmd_clear(dir);
- return;
+ return 0;
}
pte = pte_offset_map(dir, address);
offset += address & PMD_MASK;
@@ -425,15 +426,19 @@
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- unuse_pte(vma, offset + address, pte, entry, page);
+ if (unuse_pte(vma, offset + address, pte, entry, page)) {
+ pte_unmap(pte);
+ return 1;
+ }
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
pte_unmap(pte - 1);
+ return 0;
}
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
unsigned long address, unsigned long size,
swp_entry_t entry, struct page* page)
{
@@ -441,11 +446,11 @@
unsigned long offset, end;
if (pgd_none(*dir))
- return;
+ return 0;
if (pgd_bad(*dir)) {
pgd_ERROR(*dir);
pgd_clear(dir);
- return;
+ return 0;
}
pmd = pmd_offset(dir, address);
offset = address & PGDIR_MASK;
@@ -456,15 +461,17 @@
if (address >= end)
BUG();
do {
- unuse_pmd(vma, pmd, address, end - address, offset, entry,
- page);
+ if (unuse_pmd(vma, pmd, address, end - address,
+ offset, entry, page))
+ return 1;
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
+ return 0;
}
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
swp_entry_t entry, struct page* page)
{
unsigned long start = vma->vm_start, end = vma->vm_end;
@@ -472,13 +479,15 @@
if (start >= end)
BUG();
do {
- unuse_pgd(vma, pgdir, start, end - start, entry, page);
+ if (unuse_pgd(vma, pgdir, start, end - start, entry, page))
+ return 1;
start = (start + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
} while (start && (start < end));
+ return 0;
}
-static void unuse_process(struct mm_struct * mm,
+static int unuse_process(struct mm_struct * mm,
swp_entry_t entry, struct page* page)
{
struct vm_area_struct* vma;
@@ -486,13 +495,17 @@
/*
* Go through process' page directory.
*/
+ if (!rmap_get_cpu())
+ return -ENOMEM;
spin_lock(&mm->page_table_lock);
+ put_cpu();
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
- unuse_vma(vma, pgd, entry, page);
+ if (unuse_vma(vma, pgd, entry, page))
+ break;
}
spin_unlock(&mm->page_table_lock);
- return;
+ return 0;
}
/*
@@ -635,34 +648,46 @@
flush_page_to_ram(page);
if (start_mm == &init_mm)
shmem = shmem_unuse(entry, page);
- else
- unuse_process(start_mm, entry, page);
+ else {
+ retval = unuse_process(start_mm, entry, page);
+ if (retval)
+ break;
+ }
}
if (*swap_map > 1) {
int set_start_mm = (*swap_map >= swcount);
struct list_head *p = &start_mm->mmlist;
struct mm_struct *new_start_mm = start_mm;
+ struct mm_struct *prev_mm = start_mm;
struct mm_struct *mm;
+ atomic_inc(&new_start_mm->mm_users);
+ atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (*swap_map > 1 &&
+ while (*swap_map > 1 && !retval &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
+ atomic_inc(&mm->mm_users);
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+
swcount = *swap_map;
if (mm == &init_mm) {
set_start_mm = 1;
- spin_unlock(&mmlist_lock);
shmem = shmem_unuse(entry, page);
- spin_lock(&mmlist_lock);
} else
- unuse_process(mm, entry, page);
+ retval = unuse_process(mm, entry, page);
if (set_start_mm && *swap_map < swcount) {
+ mmput(new_start_mm);
+ atomic_inc(&mm->mm_users);
new_start_mm = mm;
set_start_mm = 0;
}
+ spin_lock(&mmlist_lock);
}
- atomic_inc(&new_start_mm->mm_users);
spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
mmput(start_mm);
start_mm = new_start_mm;
}
anobjrmap 2/6 free page->mapping for use by anon
Tracking anonymous pages by mm,address needs a pointer,offset
pair in struct page: mapping,index are the natural choice. However,
swapcache already uses them for &swapper_space,swp_entry_t.
But it's trivial to separate swapcache from pagecache with the radix
tree; most of swapper_space is actually unused, just a fiction to
pretend swap is file-like; and page->private is a good place to
keep the swp_entry_t now that swap never uses bufferheads.
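To illustrate the split, a toy sketch with a flat table standing in
for the kernel's radix tree (names and sizes here are mine, chosen
for illustration only):

#include <assert.h>
#include <stdio.h>

typedef struct { unsigned long val; } swp_entry_t;

struct page {
	unsigned long private;	/* remembers its key while in the cache */
};

#define NSLOTS 64		/* toy capacity: the kernel uses a radix tree */
static struct page *swap_tree[NSLOTS];

static int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	assert(entry.val < NSLOTS);
	if (swap_tree[entry.val])
		return -1;	/* the -EEXIST race in the kernel */
	swap_tree[entry.val] = page;
	page->private = entry.val;
	return 0;
}

static struct page *lookup_swap_cache(swp_entry_t entry)
{
	assert(entry.val < NSLOTS);
	return swap_tree[entry.val];
}

static void delete_from_swap_cache(struct page *page)
{
	swap_tree[page->private] = NULL;
}

int main(void)
{
	struct page page;
	swp_entry_t entry = { .val = 7 };

	assert(add_to_swap_cache(&page, entry) == 0);
	assert(lookup_swap_cache(entry) == &page);
	delete_from_swap_cache(&page);	/* key comes from page->private */
	assert(lookup_swap_cache(entry) == NULL);
	printf("swapcache keyed by swp_entry_t.val\n");
	return 0;
}

The key travels both ways: adding stores entry.val in page->private,
and deletion recovers it from there, which is why the real
__delete_from_swap_cache below can index the tree by page->private.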
Define page_mapping(page) macro to give NULL when PageAnon,
whatever that may put in page->mapping; define PG_swapcache bit,
deduce swapper_space from that. This does mean more conditionals
(many hidden in page_mapping), but I believe they'll be worth it.
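A minimal sketch of those conditionals, using simplified flags and
a cut-down struct page of my own (not the kernel's layout):

#include <stdio.h>

#define PG_anon		0x1
#define PG_swapcache	0x2

struct address_space;			/* opaque for this sketch */

struct page {
	unsigned long flags;
	void *mapping;			/* address_space, or anon's tracker */
	unsigned long private;		/* swp_entry_t.val when swapcache */
};

#define PageAnon(page)		((page)->flags & PG_anon)
#define PageSwapCache(page)	((page)->flags & PG_swapcache)

/* give NULL when PageAnon, whatever anon may put in page->mapping */
#define page_mapping(page) \
	(PageAnon(page) ? NULL : (struct address_space *)(page)->mapping)

int main(void)
{
	struct page page = { .flags = PG_anon | PG_swapcache,
			     .mapping = (void *)0x1,	/* stand-in anonmm */
			     .private = 42 };

	printf("page_mapping gives %p\n", (void *)page_mapping(&page));
	if (PageSwapCache(&page))
		printf("swap entry kept in private: %lu\n", page.private);
	return 0;
}

Callers wanting the inode mapping go through page_mapping() and see
NULL for anon pages; swapcache is recognised by its flag bit alone,
with swapper_space deduced from that.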
Sorry to lose another PG_ bit? Don't worry, I'm sure we can
deduce PageSwapCache from PageAnon && private when tight; but
that will demand a little care with the Anon/Swap transitions;
at present they're pleasantly independent. Who owns page->list,
Anon or Swap? Dunno; at present neither, which is useful for testing.
Separating the caches slightly simplifies the tmpfs swizzling:
we can use functions with fewer underscores, since a page can
briefly be in both caches. I've taken the liberty of moving a
page_cache_release into remove_from_page_cache,
and getting rid of ___add_to_page_cache.
Seemed natural to add nr_swapcache separate from nr_pagecache;
but if that poses a vmstat compatibility problem, easily reverted.
Similarly, natural to remove the total_swapcache_pages double
counting bogosity from vm_enough_memory: it dates from a time
when we didn't free the swap when a swapcache page was freed
(though bogus in 2.4 also, I'd hesitate to change it there).
It is likely that I've screwed up on the "Morton pages", those
ext3 journal pages locked at truncate time which then turn into
fish with wings: please check them out; I never manage to wrap
my head around them. Certainly don't want a page using private
for both bufferheads and swp_entry_t.
Although these patches are for applying to 2.5.65-mm2, they're
really based on the version of rmap.c before Dave added the lovely
but flawed find_pte (sorry, you _need_ page_table_lock), and
the very unlovely page_convert_anon. My idea for anon pages
doesn't let them be in the pagecache too, so I've had to erase
page_convert_anon in this patch (sys_remap_file pages will get
lost until freed, as if locked): 5/6 then puts that right.
Similarly, I'm not calling those !page->mapping driver pages
anon: count them in and out, but don't attempt to unmap them
(unless I'm mistaken, they're usually pages a driver has
allocated, has a reference to, can't be freed anyway).
In passing, noticed shrink_list's CONFIG_SWAP was excluding
try_to_unmap of file-backed pages; surely that's wrong?
Moved its #endif up to allow them; but suspect !CONFIG_MMU
was relying on !CONFIG_SWAP there? So added a try_to_unmap
stub in linux/rmap.h; but I've not tested these configs.
fs/buffer.c | 14 +---
fs/proc/proc_misc.c | 4 -
include/linux/mm.h | 28 +++-----
include/linux/page-flags.h | 16 ++--
include/linux/pagemap.h | 11 ---
include/linux/rmap.h | 14 ++--
include/linux/swap.h | 3
mm/filemap.c | 29 ++++----
mm/fremap.c | 10 --
mm/memory.c | 13 +--
mm/mmap.c | 8 --
mm/page-writeback.c | 32 +++++----
mm/page_alloc.c | 17 ++++
mm/page_io.c | 19 +----
mm/rmap.c | 155 +++++++++------------------------------------
mm/swap_state.c | 152 ++++++++++++++++++++------------------------
mm/swapfile.c | 21 +++---
mm/truncate.c | 1
mm/vmscan.c | 32 +++++----
19 files changed, 224 insertions(+), 355 deletions(-)
--- anobjrmap1/fs/buffer.c Wed Mar 19 11:05:11 2003
+++ anobjrmap2/fs/buffer.c Thu Mar 20 17:10:01 2003
@@ -1457,7 +1457,7 @@
*/
int try_to_release_page(struct page *page, int gfp_mask)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = page_mapping(page);
if (!PageLocked(page))
BUG();
@@ -2717,22 +2717,18 @@
int try_to_free_buffers(struct page *page)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = page_mapping(page);
struct buffer_head *buffers_to_free = NULL;
int ret = 0;
+ BUG_ON(!mapping);
BUG_ON(!PageLocked(page));
if (PageWriteback(page))
return 0;
- if (mapping == NULL) { /* swapped-in anon page */
- ret = drop_buffers(page, &buffers_to_free);
- goto out;
- }
-
spin_lock(&mapping->private_lock);
ret = drop_buffers(page, &buffers_to_free);
- if (ret && !PageSwapCache(page)) {
+ if (ret) {
/*
* If the filesystem writes its buffers by hand (eg ext3)
* then we can have clean buffers against a dirty page. We
@@ -2744,7 +2740,7 @@
clear_page_dirty(page);
}
spin_unlock(&mapping->private_lock);
-out:
+
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
--- anobjrmap1/fs/proc/proc_misc.c Wed Mar 19 11:05:11 2003
+++ anobjrmap2/fs/proc/proc_misc.c Thu Mar 20 17:10:01 2003
@@ -182,8 +182,8 @@
K(i.totalram),
K(i.freeram),
K(i.bufferram),
- K(ps.nr_pagecache-total_swapcache_pages-i.bufferram),
- K(total_swapcache_pages),
+ K(ps.nr_pagecache-i.bufferram),
+ K(ps.nr_swapcache),
K(active),
K(inactive),
K(i.totalhigh),
--- anobjrmap1/include/linux/mm.h Wed Mar 19 11:05:15 2003
+++ anobjrmap2/include/linux/mm.h Thu Mar 20 17:10:01 2003
@@ -363,6 +363,16 @@
#endif
/*
+ * On an anonymous page mapped into a user virtual memory area,
+ * page->mapping points to its anonmm, not to a struct address_space.
+ *
+ * Please note that, confusingly, "page_mapping" refers to the inode
+ * address_space which maps the page from disk; whereas "page_mapped"
+ * refers to user virtual address space into which the page is mapped.
+ */
+#define page_mapping(page) (PageAnon(page)? NULL: (page)->mapping)
+
+/*
* Return true if this page is mapped into pagetables. Subtle: test pte.direct
* rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain
* is only 32-bit.
@@ -430,6 +440,7 @@
int __set_page_dirty_buffers(struct page *page);
int __set_page_dirty_nobuffers(struct page *page);
+int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
/*
@@ -456,23 +467,6 @@
extern void remove_shrinker(struct shrinker *shrinker);
/*
- * If the mapping doesn't provide a set_page_dirty a_op, then
- * just fall through and assume that it wants buffer_heads.
- * FIXME: make the method unconditional.
- */
-static inline int set_page_dirty(struct page *page)
-{
- if (page->mapping) {
- int (*spd)(struct page *);
-
- spd = page->mapping->a_ops->set_page_dirty;
- if (spd)
- return (*spd)(page);
- }
- return __set_page_dirty_buffers(page);
-}
-
-/*
* On a two-level page table, this ends up being trivial. Thus the
* inlining and the symmetry break with pte_alloc_map() that does all
* of this out-of-line.
--- anobjrmap1/include/linux/page-flags.h Wed Mar 19 11:05:15 2003
+++ anobjrmap2/include/linux/page-flags.h Thu Mar 20 17:10:01 2003
@@ -74,7 +74,9 @@
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
-#define PG_anon 20 /* Anonymous page */
+
+#define PG_anon 20 /* Anonymous page: anonmm in mapping */
+#define PG_swapcache 21 /* Swap page: swp_entry_t in private */
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -84,6 +86,7 @@
unsigned long nr_dirty; /* Dirty writeable pages */
unsigned long nr_writeback; /* Pages under writeback */
unsigned long nr_pagecache; /* Pages in pagecache */
+ unsigned long nr_swapcache; /* Pages in swapcache */
unsigned long nr_page_table_pages;/* Pages used for pagetables */
unsigned long nr_reverse_maps; /* includes PageDirect */
unsigned long nr_mapped; /* mapped into pagetables */
@@ -261,15 +264,12 @@
#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags)
#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags)
-/*
- * The PageSwapCache predicate doesn't use a PG_flag at this time,
- * but it may again do so one day.
- */
#ifdef CONFIG_SWAP
-extern struct address_space swapper_space;
-#define PageSwapCache(page) ((page)->mapping == &swapper_space)
+#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags)
+#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags)
+#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags)
#else
-#define PageSwapCache(page) 0
+#define PageSwapCache(page) 0
#endif
struct page; /* forward declaration */
--- anobjrmap1/include/linux/pagemap.h Tue Feb 18 02:14:25 2003
+++ anobjrmap2/include/linux/pagemap.h Thu Mar 20 17:10:01 2003
@@ -74,17 +74,6 @@
extern void remove_from_page_cache(struct page *page);
extern void __remove_from_page_cache(struct page *page);
-static inline void ___add_to_page_cache(struct page *page,
- struct address_space *mapping, unsigned long index)
-{
- list_add(&page->list, &mapping->clean_pages);
- page->mapping = mapping;
- page->index = index;
-
- mapping->nrpages++;
- inc_page_state(nr_pagecache);
-}
-
extern void FASTCALL(__lock_page(struct page *page));
extern void FASTCALL(unlock_page(struct page *page));
--- anobjrmap1/include/linux/rmap.h Thu Mar 20 17:09:50 2003
+++ anobjrmap2/include/linux/rmap.h Thu Mar 20 17:10:01 2003
@@ -22,7 +22,6 @@
struct pte_chain *FASTCALL(
page_add_rmap(struct page *, pte_t *, struct pte_chain *));
void FASTCALL(page_remove_rmap(struct page *, pte_t *));
-void page_convert_anon(struct page *page);
/*
* Called from mm/vmscan.c to handle paging out
@@ -30,6 +29,13 @@
int FASTCALL(page_referenced(struct page *));
int FASTCALL(try_to_unmap(struct page *));
+#else /* !CONFIG_MMU */
+
+#define page_referenced(page) TestClearPageReferenced(page)
+#define try_to_unmap(page) SWAP_FAIL
+
+#endif /* CONFIG_MMU */
+
/*
* Return values of try_to_unmap
*/
@@ -37,12 +43,6 @@
#define SWAP_AGAIN 1
#define SWAP_FAIL 2
-#else /* !CONFIG_MMU */
-
-#define page_referenced(page) TestClearPageReferenced(page)
-
-#endif /* CONFIG_MMU */
-
static inline void pte_chain_lock(struct page *page)
{
/*
--- anobjrmap1/include/linux/swap.h Thu Mar 20 17:09:50 2003
+++ anobjrmap2/include/linux/swap.h Thu Mar 20 17:10:01 2003
@@ -179,9 +179,7 @@
/* linux/mm/swap_state.c */
extern struct address_space swapper_space;
-#define total_swapcache_pages swapper_space.nrpages
extern void show_swap_cache_info(void);
-extern int add_to_swap_cache(struct page *, swp_entry_t);
extern int add_to_swap(struct page *);
extern void __delete_from_swap_cache(struct page *);
extern void delete_from_swap_cache(struct page *);
@@ -219,7 +217,6 @@
#else /* CONFIG_SWAP */
#define total_swap_pages 0
-#define total_swapcache_pages 0UL
#define si_swapinfo(val) \
do { (val)->freeswap = (val)->totalswap = 0; } while (0)
--- anobjrmap1/mm/filemap.c Wed Mar 19 11:05:16 2003
+++ anobjrmap2/mm/filemap.c Thu Mar 20 17:10:01 2003
@@ -81,7 +81,7 @@
{
struct address_space *mapping = page->mapping;
- BUG_ON(PageDirty(page) && !PageSwapCache(page));
+ BUG_ON(PageDirty(page));
radix_tree_delete(&mapping->page_tree, page->index);
list_del(&page->list);
@@ -95,20 +95,22 @@
{
struct address_space *mapping = page->mapping;
- if (unlikely(!PageLocked(page)))
- PAGE_BUG(page);
+ BUG_ON(!PageLocked(page));
write_lock(&mapping->page_lock);
__remove_from_page_cache(page);
write_unlock(&mapping->page_lock);
+ page_cache_release(page);
}
static inline int sync_page(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
return mapping->a_ops->sync_page(page);
+ if (PageSwapCache(page))
+ blk_run_queues();
return 0;
}
@@ -192,16 +194,9 @@
* This adds a page to the page cache, starting out as locked, unreferenced,
* not uptodate and with no errors.
*
- * This function is used for two things: adding newly allocated pagecache
- * pages and for moving existing anon pages into swapcache.
- *
- * In the case of pagecache pages, the page is new, so we can just run
- * SetPageLocked() against it. The other page state flags were set by
- * rmqueue()
- *
- * In the case of swapcache, try_to_swap_out() has already locked the page, so
- * SetPageLocked() is ugly-but-OK there too. The required page state has been
- * set up by swap_out_add_to_swap_cache().
+ * This function is used to add newly allocated pagecache pages:
+ * the page is new, so we can just run SetPageLocked() against it.
+ * The other page state flags were set by rmqueue().
*
* This function does not add the page to the LRU. The caller must do that.
*/
@@ -216,7 +211,11 @@
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
SetPageLocked(page);
- ___add_to_page_cache(page, mapping, offset);
+ list_add(&page->list, &mapping->clean_pages);
+ page->mapping = mapping;
+ page->index = offset;
+ mapping->nrpages++;
+ inc_page_state(nr_pagecache);
} else {
page_cache_release(page);
}
--- anobjrmap1/mm/fremap.c Thu Mar 20 17:09:50 2003
+++ anobjrmap2/mm/fremap.c Thu Mar 20 17:10:01 2003
@@ -57,21 +57,11 @@
pgd_t *pgd;
pmd_t *pmd;
struct pte_chain *pte_chain;
- unsigned long pgidx;
pte_chain = pte_chain_alloc(GFP_KERNEL);
if (!pte_chain)
goto err;
- /*
- * Convert this page to anon for objrmap if it's nonlinear
- */
- pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
- pgidx += vma->vm_pgoff;
- pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
- if (!PageAnon(page) && (page->index != pgidx))
- page_convert_anon(page);
-
pgd = pgd_offset(mm, addr);
spin_lock(&mm->page_table_lock);
--- anobjrmap1/mm/memory.c Thu Mar 20 17:09:50 2003
+++ anobjrmap2/mm/memory.c Thu Mar 20 17:10:01 2003
@@ -419,8 +419,8 @@
if (!PageReserved(page)) {
if (pte_dirty(pte))
set_page_dirty(page);
- if (page->mapping && pte_young(pte) &&
- !PageSwapCache(page))
+ if (pte_young(pte) &&
+ page_mapping(page))
mark_page_accessed(page);
tlb->freed++;
page_remove_rmap(page, ptep);
@@ -1329,6 +1329,7 @@
struct page * new_page;
pte_t entry;
struct pte_chain *pte_chain;
+ int anon = 0;
int ret;
if (!vma->vm_ops || !vma->vm_ops->nopage)
@@ -1349,10 +1350,6 @@
if (!pte_chain)
goto oom;
- /* See if nopage returned an anon page */
- if (!new_page->mapping || PageSwapCache(new_page))
- SetPageAnon(new_page);
-
/*
* Should we do an early C-O-W break?
*/
@@ -1365,8 +1362,8 @@
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
lru_cache_add_active(page);
- SetPageAnon(page);
new_page = page;
+ anon = 1;
}
spin_lock(&mm->page_table_lock);
@@ -1391,6 +1388,8 @@
if (write_access)
entry = pte_mkwrite(pte_mkdirty(entry));
set_pte(page_table, entry);
+ if (anon)
+ SetPageAnon(new_page);
pte_chain = page_add_rmap(new_page, page_table, pte_chain);
pte_unmap(page_table);
} else {
--- anobjrmap1/mm/mmap.c Wed Mar 19 11:05:16 2003
+++ anobjrmap2/mm/mmap.c Thu Mar 20 17:10:01 2003
@@ -82,14 +82,6 @@
free += nr_swap_pages;
/*
- * This double-counts: the nrpages are both in the
- * page-cache and in the swapper space. At the same time,
- * this compensates for the swap-space over-allocation
- * (ie "nr_swap_pages" being too small).
- */
- free += total_swapcache_pages;
-
- /*
* The code below doesn't account for free space in the
* inode and dentry slab cache, slab cache fragmentation,
* inodes and dentries which will become freeable under
--- anobjrmap1/mm/page-writeback.c Tue Mar 18 07:38:45 2003
+++ anobjrmap2/mm/page-writeback.c Thu Mar 20 17:10:01 2003
@@ -477,21 +477,12 @@
* FIXME: may need to call ->reservepage here as well. That's rather up to the
* address_space though.
*
- * For now, we treat swapper_space specially. It doesn't use the normal
- * block a_ops.
- *
* FIXME: this should move over to fs/buffer.c - buffer_heads have no business in mm/
*/
#include <linux/buffer_head.h>
int __set_page_dirty_buffers(struct page *page)
{
struct address_space * const mapping = page->mapping;
- int ret = 0;
-
- if (mapping == NULL) {
- SetPageDirty(page);
- goto out;
- }
if (!PageUptodate(page))
buffer_error();
@@ -523,8 +514,7 @@
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
-out:
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -566,6 +556,24 @@
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
+ * If the mapping doesn't provide a set_page_dirty a_op, then
+ * just fall through and assume that it wants buffer_heads.
+ */
+int set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ int (*spd)(struct page *);
+
+ if (!mapping) {
+ SetPageDirty(page);
+ return 0;
+ }
+ spd = mapping->a_ops->set_page_dirty;
+ return spd? (*spd)(page): __set_page_dirty_buffers(page);
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+/*
* set_page_dirty() is racy if the caller has no reference against
* page->mapping->host, and if the page is unlocked. This is because another
* CPU could truncate the page off the mapping and then free the mapping.
@@ -592,7 +600,7 @@
int test_clear_page_dirty(struct page *page)
{
if (TestClearPageDirty(page)) {
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
if (mapping && !mapping->backing_dev_info->memory_backed)
dec_page_state(nr_dirty);
--- anobjrmap1/mm/page_alloc.c Wed Mar 19 11:05:16 2003
+++ anobjrmap2/mm/page_alloc.c Thu Mar 20 17:10:01 2003
@@ -80,6 +80,10 @@
1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
+ 1 << PG_chainlock |
+ 1 << PG_direct |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
1 << PG_writeback);
set_page_count(page, 0);
page->mapping = NULL;
@@ -216,12 +220,14 @@
1 << PG_locked |
1 << PG_active |
1 << PG_reclaim |
+ 1 << PG_chainlock |
+ 1 << PG_direct |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
1 << PG_writeback )))
bad_page(function, page);
if (PageDirty(page))
ClearPageDirty(page);
- if (PageAnon(page))
- ClearPageAnon(page);
}
/*
@@ -322,6 +328,10 @@
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
+ 1 << PG_chainlock |
+ 1 << PG_direct |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);
@@ -869,7 +879,7 @@
struct page_state ps;
get_page_state(&ps);
- return ps.nr_pagecache;
+ return ps.nr_pagecache + ps.nr_swapcache;
}
void si_meminfo(struct sysinfo *val)
@@ -1442,6 +1452,7 @@
"nr_dirty",
"nr_writeback",
"nr_pagecache",
+ "nr_swapcache",
"nr_page_table_pages",
"nr_reverse_maps",
"nr_mapped",
--- anobjrmap1/mm/page_io.c Mon Dec 16 10:37:02 2002
+++ anobjrmap2/mm/page_io.c Thu Mar 20 17:10:01 2003
@@ -16,8 +16,6 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
-#include <linux/buffer_head.h> /* for block_sync_page() */
-#include <linux/mpage.h>
#include <linux/writeback.h>
#include <asm/pgtable.h>
@@ -32,7 +30,7 @@
swp_entry_t entry;
BUG_ON(!PageSwapCache(page));
- entry.val = page->index;
+ entry.val = page->private;
sis = get_swap_info_struct(swp_type(entry));
bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
@@ -130,13 +128,6 @@
return ret;
}
-struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
- .readpage = swap_readpage,
- .sync_page = block_sync_page,
- .set_page_dirty = __set_page_dirty_nobuffers,
-};
-
/*
* A scruffy utility function to read or write an arbitrary swap page
* and wait on the I/O.
@@ -149,10 +140,8 @@
};
lock_page(page);
-
- BUG_ON(page->mapping);
- page->mapping = &swapper_space;
- page->index = entry.val;
+ SetPageSwapCache(page);
+ page->private = entry.val;
if (rw == READ) {
ret = swap_readpage(NULL, page);
@@ -161,7 +150,7 @@
ret = swap_writepage(page, &swap_wbc);
wait_on_page_writeback(page);
}
- page->mapping = NULL;
+ ClearPageSwapCache(page);
if (ret == 0 && (!PageUptodate(page) || PageError(page)))
ret = -EIO;
return ret;
--- anobjrmap1/mm/rmap.c Thu Mar 20 17:09:50 2003
+++ anobjrmap2/mm/rmap.c Thu Mar 20 17:10:01 2003
@@ -37,6 +37,20 @@
/* #define DEBUG_RMAP */
/*
+ * Something oopsable to put for now in the page->mapping
+ * of an anonymous page, to test that it is ignored.
+ */
+#define ANON_MAPPING_DEBUG ((struct address_space *) 1)
+
+static inline void
+clear_page_anon(struct page *page)
+{
+ BUG_ON(page->mapping != ANON_MAPPING_DEBUG);
+ page->mapping = NULL;
+ ClearPageAnon(page);
+}
+
+/*
* Shared pages have a chain of pte_chain structures, used to locate
* all the mappings to this page. We only need a pointer to the pte
* here, the page struct for the page table page contains the process
@@ -185,7 +199,7 @@
return 0;
if (!mapping)
- BUG();
+ return 0;
if (PageSwapCache(page))
BUG();
@@ -298,8 +312,6 @@
* find the mappings by walking the object vma chain for that object.
*/
if (!PageAnon(page)) {
- if (!page->mapping)
- BUG();
if (PageSwapCache(page))
BUG();
if (!page->pte.mapcount)
@@ -331,6 +343,8 @@
}
#endif
+ page->mapping = ANON_MAPPING_DEBUG;
+
if (page->pte.direct == 0) {
page->pte.direct = pte_paddr;
SetPageDirect(page);
@@ -392,8 +406,6 @@
BUG();
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return;
- if (!page_mapped(page))
- return; /* remap_page_range() from a driver? */
pte_chain_lock(page);
@@ -402,8 +414,6 @@
* find the mappings by walking the object vma chain for that object.
*/
if (!PageAnon(page)) {
- if (!page->mapping)
- BUG();
if (PageSwapCache(page))
BUG();
if (!page->pte.mapcount)
@@ -472,10 +482,11 @@
#endif
out:
- pte_chain_unlock(page);
- if (!page_mapped(page))
+ if (!page_mapped(page)) {
dec_page_state(nr_mapped);
- return;
+ clear_page_anon(page);
+ }
+ pte_chain_unlock(page);
}
/**
@@ -565,8 +576,9 @@
goto out;
}
+ /* We lose track of sys_remap_file pages (for now) */
if (page->pte.mapcount)
- BUG();
+ ret = SWAP_FAIL;
out:
up(&mapping->i_shared_sem);
@@ -628,12 +640,13 @@
pte = ptep_get_and_clear(ptep);
flush_tlb_page(vma, address);
- if (PageSwapCache(page)) {
+ if (PageAnon(page)) {
+ swp_entry_t entry = { .val = page->private };
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- swp_entry_t entry = { .val = page->index };
+ BUG_ON(!PageSwapCache(page));
swap_duplicate(entry);
set_pte(ptep, swp_entry_to_pte(entry));
BUG_ON(pte_file(*ptep));
@@ -690,7 +703,7 @@
if (!PageLocked(page))
BUG();
/* We need backing store to swap out a page. */
- if (!page->mapping)
+ if (!page_mapping(page) && !PageSwapCache(page))
BUG();
/*
@@ -757,118 +770,12 @@
}
}
out:
- if (!page_mapped(page))
+ if (!page_mapped(page)) {
dec_page_state(nr_mapped);
- return ret;
-}
-
-/**
- * page_convert_anon - Convert an object-based mapped page to pte_chain-based.
- * @page: the page to convert
- *
- * Find all the mappings for an object-based page and convert them
- * to 'anonymous', ie create a pte_chain and store all the pte pointers there.
- *
- * This function takes the address_space->i_shared_sem and the pte_chain_lock
- * for the page. It jumps through some hoops to preallocate the correct number
- * of pte_chain structures to ensure that it can complete without releasing
- * the lock.
- */
-void page_convert_anon(struct page *page)
-{
- struct address_space *mapping = page->mapping;
- struct vm_area_struct *vma;
- struct pte_chain *pte_chain = NULL, *ptec;
- pte_t *pte;
- pte_addr_t pte_paddr;
- int mapcount;
- int index = 0;
-
- if (PageAnon(page))
- goto out;
-
-retry:
- /*
- * Preallocate the pte_chains outside the lock.
- */
- mapcount = page->pte.mapcount;
- if (mapcount > 1) {
- for (; index < mapcount; index += NRPTE) {
- ptec = pte_chain_alloc(GFP_KERNEL);
- ptec->next = pte_chain;
- pte_chain = ptec;
- }
- }
- down(&mapping->i_shared_sem);
- pte_chain_lock(page);
-
- /*
- * Check to make sure the number of mappings didn't change. If they
- * did, either retry or free enough pte_chains to compensate.
- */
- if (mapcount < page->pte.mapcount) {
- pte_chain_unlock(page);
- up(&mapping->i_shared_sem);
- goto retry;
- } else if ((mapcount > page->pte.mapcount) && (mapcount > 1)) {
- mapcount = page->pte.mapcount;
- while ((index - NRPTE) > mapcount) {
- index -= NRPTE;
- ptec = pte_chain->next;
- pte_chain_free(pte_chain);
- pte_chain = ptec;
- }
- if (mapcount <= 1)
- pte_chain_free(pte_chain);
- }
- SetPageAnon(page);
-
- if (mapcount == 0)
- goto out_unlock;
- else if (mapcount == 1) {
- SetPageDirect(page);
- page->pte.direct = 0;
- } else
- page->pte.chain = pte_chain;
-
- index = NRPTE-1;
- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- pte = find_pte(vma, page, NULL);
- if (pte) {
- pte_paddr = ptep_to_paddr(pte);
- pte_unmap(pte);
- if (PageDirect(page)) {
- page->pte.direct = pte_paddr;
- goto out_unlock;
- }
- pte_chain->ptes[index] = pte_paddr;
- if (!--index) {
- pte_chain = pte_chain->next;
- index = NRPTE-1;
- }
- }
+ if (PageAnon(page))
+ clear_page_anon(page);
}
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- pte = find_pte(vma, page, NULL);
- if (pte) {
- pte_paddr = ptep_to_paddr(pte);
- pte_unmap(pte);
- if (PageDirect(page)) {
- page->pte.direct = pte_paddr;
- goto out_unlock;
- }
- pte_chain->ptes[index] = pte_paddr;
- if (!--index) {
- pte_chain = pte_chain->next;
- index = NRPTE-1;
- }
- }
- }
-out_unlock:
- pte_chain_unlock(page);
- up(&mapping->i_shared_sem);
-out:
- return;
+ return ret;
}
/**
--- anobjrmap1/mm/swap_state.c Wed Mar 5 07:26:34 2003
+++ anobjrmap2/mm/swap_state.c Thu Mar 20 17:10:01 2003
@@ -13,40 +13,35 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
-#include <linux/buffer_head.h> /* block_sync_page() */
+#include <linux/buffer_head.h>
#include <asm/pgtable.h>
/*
- * swapper_inode doesn't do anything much. It is really only here to
- * avoid some special-casing in other parts of the kernel.
+ * Try to avoid special-casing swapcache in shrink_list.
*/
-static struct inode swapper_inode = {
- .i_mapping = &swapper_space,
-};
-
static struct backing_dev_info swap_backing_dev_info = {
.ra_pages = 0, /* No readahead */
.memory_backed = 1, /* Does not contribute to dirty memory */
};
-extern struct address_space_operations swap_aops;
+static struct address_space_operations swap_aops = {
+ .writepage = swap_writepage,
+ .readpage = swap_readpage,
+ /*
+ * sync_page and set_page_dirty are special-cased.
+ */
+};
+/*
+ * Only a few fields of swapper_space, those initialized below,
+ * are ever used: leave most fields null to oops if ever used.
+ */
struct address_space swapper_space = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC),
.page_lock = RW_LOCK_UNLOCKED,
- .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages),
- .dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages),
- .io_pages = LIST_HEAD_INIT(swapper_space.io_pages),
- .locked_pages = LIST_HEAD_INIT(swapper_space.locked_pages),
- .host = &swapper_inode,
.a_ops = &swap_aops,
.backing_dev_info = &swap_backing_dev_info,
- .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap),
- .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared),
- .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
- .private_lock = SPIN_LOCK_UNLOCKED,
- .private_list = LIST_HEAD_INIT(swapper_space.private_list),
};
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -68,30 +63,50 @@
swap_cache_info.noent_race, swap_cache_info.exist_race);
}
-int add_to_swap_cache(struct page *page, swp_entry_t entry)
+static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+{
+ int error;
+
+ BUG_ON(PageSwapCache(page));
+ BUG_ON(PagePrivate(page));
+ error = radix_tree_preload(GFP_ATOMIC);
+ if (!error) {
+ page_cache_get(page);
+ write_lock(&swapper_space.page_lock);
+ error = radix_tree_insert(&swapper_space.page_tree,
+ entry.val, page);
+ if (!error) {
+ SetPageLocked(page);
+ SetPageSwapCache(page);
+ page->private = entry.val;
+ inc_page_state(nr_swapcache);
+ } else
+ page_cache_release(page);
+ write_unlock(&swapper_space.page_lock);
+ radix_tree_preload_end();
+ }
+ return error;
+}
+
+static int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;
- if (page->mapping)
- BUG();
if (!swap_duplicate(entry)) {
INC_CACHE_INFO(noent_race);
return -ENOENT;
}
- error = add_to_page_cache(page, &swapper_space, entry.val, GFP_ATOMIC);
+ error = __add_to_swap_cache(page, entry);
/*
- * Anon pages are already on the LRU, we don't run lru_cache_add here.
+ * Anon pages are already on the LRU,
+ * we don't run lru_cache_add here.
*/
- if (error != 0) {
+ if (error) {
swap_free(entry);
if (error == -EEXIST)
INC_CACHE_INFO(exist_race);
return error;
}
- if (!PageLocked(page))
- BUG();
- if (!PageSwapCache(page))
- BUG();
INC_CACHE_INFO(add_total);
return 0;
}
@@ -105,7 +120,10 @@
BUG_ON(!PageLocked(page));
BUG_ON(!PageSwapCache(page));
BUG_ON(PageWriteback(page));
- __remove_from_page_cache(page);
+
+ radix_tree_delete(&swapper_space.page_tree, page->private);
+ ClearPageSwapCache(page);
+ dec_page_state(nr_swapcache);
INC_CACHE_INFO(del_total);
}
@@ -149,8 +167,7 @@
/*
* Add it to the swap cache and mark it dirty
*/
- err = add_to_page_cache(page, &swapper_space,
- entry.val, GFP_ATOMIC);
+ err = __add_to_swap_cache(page, entry);
if (pf_flags & PF_MEMALLOC)
current->flags |= PF_MEMALLOC;
@@ -158,8 +175,7 @@
switch (err) {
case 0: /* Success */
SetPageUptodate(page);
- ClearPageDirty(page);
- set_page_dirty(page);
+ SetPageDirty(page);
INC_CACHE_INFO(add_total);
return 1;
case -EEXIST:
@@ -185,11 +201,12 @@
{
swp_entry_t entry;
+ BUG_ON(!PageSwapCache(page));
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
BUG_ON(page_has_buffers(page));
- entry.val = page->index;
+ entry.val = page->private;
write_lock(&swapper_space.page_lock);
__delete_from_swap_cache(page);
@@ -201,27 +218,12 @@
int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
- struct address_space *mapping = page->mapping;
- int err;
-
- write_lock(&swapper_space.page_lock);
- write_lock(&mapping->page_lock);
-
- err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
- if (!err) {
- __remove_from_page_cache(page);
- ___add_to_page_cache(page, &swapper_space, entry.val);
- }
-
- write_unlock(&mapping->page_lock);
- write_unlock(&swapper_space.page_lock);
-
+ int err = __add_to_swap_cache(page, entry);
if (!err) {
+ remove_from_page_cache(page);
if (!swap_duplicate(entry))
BUG();
- /* shift page from clean_pages to dirty_pages list */
- BUG_ON(PageDirty(page));
- set_page_dirty(page);
+ SetPageDirty(page);
INC_CACHE_INFO(add_total);
} else if (err == -EEXIST)
INC_CACHE_INFO(exist_race);
@@ -231,29 +233,9 @@
int move_from_swap_cache(struct page *page, unsigned long index,
struct address_space *mapping)
{
- swp_entry_t entry;
- int err;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
- BUG_ON(page_has_buffers(page));
-
- entry.val = page->index;
-
- write_lock(&swapper_space.page_lock);
- write_lock(&mapping->page_lock);
-
- err = radix_tree_insert(&mapping->page_tree, index, page);
- if (!err) {
- __delete_from_swap_cache(page);
- ___add_to_page_cache(page, mapping, index);
- }
-
- write_unlock(&mapping->page_lock);
- write_unlock(&swapper_space.page_lock);
-
+ int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
if (!err) {
- swap_free(entry);
+ delete_from_swap_cache(page);
/* shift page from clean_pages to dirty_pages list */
ClearPageDirty(page);
set_page_dirty(page);
@@ -261,7 +243,6 @@
return err;
}
-
/*
* If we are the only user, then try to free up the swap cache.
*
@@ -319,9 +300,15 @@
*/
struct page * lookup_swap_cache(swp_entry_t entry)
{
- struct page *found;
+ struct page *page;
- found = find_get_page(&swapper_space, entry.val);
+ read_lock(&swapper_space.page_lock);
+ page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+ if (page) {
+ page_cache_get(page);
+ INC_CACHE_INFO(find_success);
+ }
+ read_unlock(&swapper_space.page_lock);
/*
* Unsafe to assert PageSwapCache and mapping on page found:
* if SMP nothing prevents swapoff from deleting this page from
@@ -329,9 +316,7 @@
* that, but no need to change: we _have_ got the right page.
*/
INC_CACHE_INFO(find_total);
- if (found)
- INC_CACHE_INFO(find_success);
- return found;
+ return page;
}
/*
@@ -352,7 +337,12 @@
* that would confuse statistics: use find_get_page()
* directly.
*/
- found_page = find_get_page(&swapper_space, entry.val);
+ read_lock(&swapper_space.page_lock);
+ found_page = radix_tree_lookup(&swapper_space.page_tree,
+ entry.val);
+ if (found_page)
+ page_cache_get(found_page);
+ read_unlock(&swapper_space.page_lock);
if (found_page)
break;
--- anobjrmap1/mm/swapfile.c Thu Mar 20 17:09:50 2003
+++ anobjrmap2/mm/swapfile.c Thu Mar 20 17:10:02 2003
@@ -242,7 +242,7 @@
struct swap_info_struct * p;
swp_entry_t entry;
- entry.val = page->index;
+ entry.val = page->private;
p = swap_info_get(entry);
if (p) {
/* Is the only swap cache user the cache itself? */
@@ -310,7 +310,7 @@
if (page_count(page) != 2) /* 2: us + cache */
return 0;
- entry.val = page->index;
+ entry.val = page->private;
p = swap_info_get(entry);
if (!p)
return 0;
@@ -348,8 +348,14 @@
p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, swp_offset(entry)) == 1)
- page = find_trylock_page(&swapper_space, entry.val);
+ if (swap_entry_free(p, swp_offset(entry)) == 1) {
+ read_lock(&swapper_space.page_lock);
+ page = radix_tree_lookup(&swapper_space.page_tree,
+ entry.val);
+ if (page && TestSetPageLocked(page))
+ page = NULL;
+ read_unlock(&swapper_space.page_lock);
+ }
swap_info_put(p);
}
if (page) {
@@ -959,15 +965,14 @@
struct backing_dev_info *bdi;
BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
-
- bdi = page->mapping->backing_dev_info;
if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page->index };
+ swp_entry_t entry = { .val = page->private };
struct swap_info_struct *sis;
sis = get_swap_info_struct(swp_type(entry));
bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
- }
+ } else
+ bdi = page->mapping->backing_dev_info;
return bdi_write_congested(bdi);
}
#endif
--- anobjrmap1/mm/truncate.c Mon Feb 10 20:12:55 2003
+++ anobjrmap2/mm/truncate.c Thu Mar 20 17:10:02 2003
@@ -54,7 +54,6 @@
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
remove_from_page_cache(page);
- page_cache_release(page); /* pagecache ref */
}
/*
--- anobjrmap1/mm/vmscan.c Thu Mar 20 17:09:50 2003
+++ anobjrmap2/mm/vmscan.c Thu Mar 20 17:10:02 2003
@@ -176,20 +176,20 @@
/* Must be called with page's pte_chain_lock held. */
static inline int page_mapping_inuse(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping;
/* Page is in somebody's page tables. */
if (page_mapped(page))
return 1;
- /* XXX: does this happen ? */
- if (!mapping)
- return 0;
-
/* Be more reluctant to reclaim swapcache than pagecache */
if (PageSwapCache(page))
return 1;
+ mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+
/* File is mmap'd by somebody. */
if (!list_empty(&mapping->i_mmap))
return 1;
@@ -261,22 +261,25 @@
goto activate_locked;
}
- mapping = page->mapping;
+ mapping = page_mapping(page);
#ifdef CONFIG_SWAP
/*
- * Anonymous process memory without backing store. Try to
- * allocate it some swap space here.
+ * Anonymous process memory has backing store?
+ * Try to allocate it some swap space here.
*
* XXX: implement swap clustering ?
*/
- if (page_mapped(page) && !mapping && !PagePrivate(page)) {
+ if (PageSwapCache(page))
+ mapping = &swapper_space;
+ else if (PageAnon(page)) {
pte_chain_unlock(page);
if (!add_to_swap(page))
goto activate_locked;
pte_chain_lock(page);
- mapping = page->mapping;
+ mapping = &swapper_space;
}
+#endif /* CONFIG_SWAP */
/*
* The page is mapped into the page tables of one or more
@@ -294,7 +297,6 @@
; /* try to free the page below */
}
}
-#endif /* CONFIG_SWAP */
pte_chain_unlock(page);
/*
@@ -335,7 +337,9 @@
.for_reclaim = 1,
};
- list_move(&page->list, &mapping->locked_pages);
+ if (!PageSwapCache(page))
+ list_move(&page->list,
+ &mapping->locked_pages);
write_unlock(&mapping->page_lock);
SetPageReclaim(page);
@@ -399,7 +403,7 @@
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page->index };
+ swp_entry_t swap = { .val = page->private };
__delete_from_swap_cache(page);
write_unlock(&mapping->page_lock);
swap_free(swap);
@@ -641,7 +645,7 @@
* FIXME: need to consider page_count(page) here if/when we
* reap orphaned pages via the LRU (Daniel's locking stuff)
*/
- if (total_swap_pages == 0 && !page->mapping &&
+ if (total_swap_pages == 0 && !page_mapping(page) &&
!PagePrivate(page)) {
list_add(&page->lru, &l_active);
continue;
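To spell out the two recurring idioms in the hunks above as standalone
helpers (a sketch only: these hypothetical functions are not in the
patch, which open-codes both at each site):

/*
 * 1. A swapcache page's swap entry now lives in page->private
 * (previously page->index), freeing ->index for other use.
 */
static inline swp_entry_t page_swp_entry(struct page *page)
{
	swp_entry_t entry;

	BUG_ON(!PageSwapCache(page));
	entry.val = page->private;
	return entry;
}

/*
 * 2. find_trylock_page() on swapper_space is open-coded: look the
 * page up in the swapcache radix tree and trylock it, all under a
 * read hold of the mapping's page_lock.
 */
static struct page *swapcache_trylock_page(unsigned long val)
{
	struct page *page;

	read_lock(&swapper_space.page_lock);
	page = radix_tree_lookup(&swapper_space.page_tree, val);
	if (page && TestSetPageLocked(page))
		page = NULL;	/* already locked by somebody else */
	read_unlock(&swapper_space.page_lock);
	return page;
}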
Hugh Dickins <[email protected]> wrote:
>
> First of a sequence of six patches, extending Dave McCracken's
> objrmap to handle anonymous memory too, eliminating pte_chains.
>
> Based upon 2.5.65-mm2, the aggregate has
> 81 files changed, 1140 insertions(+), 1634 deletions(-)
>
> anobjrmap 1/6 create include/linux/rmap.h
> anobjrmap 2/6 free page->mapping for use by anon
> anobjrmap 3/6 remove pte-pointer-based rmap
> anobjrmap 4/6 add anonmm to track anonymous pages
> anonjrmap 5/6 virtual address chains for odd cases
> anonjrmap 6/6 updates to arches other than i386
>
> I've not done any timings, hope others can do that better than
> I would. My guess is that Dave has already covered the worst
> cases, but this should cut the rmap overhead when forking.
Initial indications are that it offers no performance advantage over objrmap.
This needs a lot more work. Timings were taken on a 2.7GHz P4-HT. The
workload is applying and removing the 125 patches in 2.5.65-mm. Tons of
bash forking.
Note that on uniprocessor kernels we're almost equal to 2.4. But on SMP, 2.5
is way slower.
The profiles are all over the place because the readprofile -M option (which
I use to boost the profiler interrupt rate by a factor of ten) isn't working
on UP+APIC 2.5 kernels for some reason.
This all needs to be redone with oprofile, find out what on earth is going
on.
objrmap+Hugh's stuff
====================
UP:
pushpatch 999 3.90s user 4.33s system 99% cpu 8.271 total
poppatch 999 2.62s user 2.96s system 99% cpu 5.599 total
SMP:
pushpatch 9999 4.11s user 8.39s system 97% cpu 12.758 total
poppatch 9999 2.82s user 5.42s system 99% cpu 8.269 total
c011c438 copy_mm 694 0.6968
c01590a4 link_path_walk 696 0.3141
c01f9b30 __copy_to_user_ll 696 6.6923
c0114dec flush_tlb_page 766 5.3194
c013dfa0 clear_page_tables 792 2.3294
c013fc20 do_no_page 833 1.0735
c013d500 install_page 987 2.2847
c0117c5c pte_alloc_one 1056 7.5429
c0162510 d_lookup 1254 3.4076
c0108f88 system_call 1292 29.3636
c013fa50 do_anonymous_page 1294 2.7888
c0133e28 find_get_page 2526 27.4565
c0144280 page_add_rmap 2793 8.4127
c013b274 release_pages 4032 10.6105
c0117db0 do_page_fault 4546 3.9530
c013e534 zap_pte_range 5153 10.0645
c01443cc page_dup_rmap 6065 63.1771
c013e2ec copy_page_range 6878 11.7774
c013f220 do_wp_page 8339 10.4761
c01445b0 page_remove_rmap 9415 40.5819
c0106f94 default_idle 131970 2537.8846
00000000 total 217986 0.1157
objrmap
=======
UP:
pushpatch 999 3.91s user 4.64s system 99% cpu 8.584 total
poppatch 999 2.66s user 3.09s system 99% cpu 5.772 total
SMP:
pushpatch 999 4.00s user 8.30s system 100% cpu 12.270 total
poppatch 9999 2.75s user 5.66s system 99% cpu 8.412 total
c011c404 copy_mm 718 0.7671
c013df50 clear_page_tables 770 2.1875
c013ff38 do_no_page 805 1.0705
c0114dec flush_tlb_page 808 5.6111
c013d450 install_page 1035 1.9602
c0117c5c pte_alloc_one 1150 8.2143
c0162a60 d_lookup 1180 3.2065
c0108f88 system_call 1229 27.9318
c013fc9c do_anonymous_page 1413 2.1153
c0133d6c find_get_page 2415 25.1562
c013b1d4 release_pages 4175 10.9868
c0117db0 do_page_fault 4392 3.8191
c013e704 zap_pte_range 5790 10.6434
c013e2cc copy_page_range 6242 5.7796
c013f410 do_wp_page 8462 9.9319
c0144010 page_add_rmap 11357 24.6891
c01441dc page_remove_rmap 11581 18.5593
c0106f94 default_idle 126736 2437.2308
00000000 total 218819 0.1160
100% pte_chains
===============
UP:
pushpatch 999 3.97s user 5.97s system 99% cpu 9.947 total
poppatch 999 2.74s user 3.95s system 99% cpu 6.719 total
c01da5bc radix_tree_lookup 55 0.7237
c013c0d8 free_page_and_swap_cache 57 0.6786
c0149614 link_path_walk 59 0.0322
c01dbac8 __copy_from_user_ll 64 0.5818
c01dba60 __copy_to_user_ll 65 0.6250
c01334dc pte_alloc_map 74 0.4111
c0134c20 do_no_page 74 0.1267
c01512ac d_lookup 93 0.3633
c0114f5c pte_alloc_one 97 0.6929
c0134a4c do_anonymous_page 105 0.2244
c0108b54 system_call 125 2.8409
c0133624 copy_page_range 202 0.2644
c01150b0 do_page_fault 409 0.3658
c0133920 zap_pte_range 421 0.9656
c013836c page_add_rmap 570 2.7404
c013431c do_wp_page 846 1.1371
c013843c page_remove_rmap 850 2.7597
00000000 total 5952 0.0036
SMP:
pushpatch 9999 3.89s user 10.38s system 100% cpu 14.152 total
poppatch 999 2.83s user 6.20s system 99% cpu 9.045 total
c0138478 __set_page_dirty_buffers 654 1.6188
c01f9560 __copy_to_user_ll 676 6.5000
c01f95c8 __copy_from_user_ll 727 6.6091
c011c404 copy_mm 809 0.8643
c013def0 clear_page_tables 817 2.3210
c0114dec flush_tlb_page 854 5.9306
c013fea8 do_no_page 909 1.2625
c013d420 install_page 1068 2.2250
c0117c5c pte_alloc_one 1070 7.6429
c0108f88 system_call 1230 27.9545
c013fc20 do_anonymous_page 1316 2.0309
c0161f40 d_lookup 1355 3.6821
c0133d6c find_get_page 2543 26.4896
c013b1a4 release_pages 4227 11.1237
c0117db0 do_page_fault 4550 3.9565
c013e6a4 zap_pte_range 6399 11.7629
c013e26c copy_page_range 7228 6.6926
c013f3b0 do_wp_page 9082 10.7607
c0143d90 page_add_rmap 13941 41.9910
c0143edc page_remove_rmap 18275 36.8448
c0106f94 default_idle 137863 2651.2115
00000000 total 242308 0.1287
2.4.21-pre5
===========
UP:
pushpatch 9999 3.98s user 3.66s system 99% cpu 7.656 total
poppatch 9999 2.52s user 2.59s system 99% cpu 5.136 total
SMP:
pushpatch 9999 4.34s user 6.10s system 122% cpu 8.522 total
poppatch 999 2.93s user 3.90s system 118% cpu 5.761 total
c0258e70 atomic_dec_and_lock 53 0.7067
c0129b90 unlock_page 63 0.5833
c0116afc copy_mm 71 0.0970
c01275ac handle_mm_fault 73 0.3967
c0129cd8 __find_get_page 77 1.1324
c011b2ec exit_notify 90 0.1257
c0127410 do_no_page 91 0.2209
c0131554 rmqueue 92 0.1586
c014aa94 d_lookup 93 0.3185
c0127324 do_anonymous_page 100 0.4237
c0111b08 flush_tlb_page 117 0.9750
c0106f60 system_call 136 2.4286
c0131b8c __free_pages 240 6.6667
c0126080 copy_page_range 257 0.5949
c0126230 zap_page_range 278 0.3341
c0113ec8 do_page_fault 954 0.7762
c0126de0 do_wp_page 972 1.8692
c01052c0 default_idle 6414 114.5357
00000000 total 12240 0.0088
Hugh Dickins <[email protected]> wrote:
>
> It is likely that I've screwed up on the "Morton pages", those
> ext3 journal pages locked at truncate time which then turn into
> fish with wings: please check them out, I never manage to wrap
> my head around them. Certainly don't want a page using private
> for both bufferheads and swp_entry_t.
It goes BUG in try_to_free_buffers().
We really should fix this up for other reasons, probably by making ext3's
per-page truncate operations wait on commit, and be more aggressive about
pulling the page's buffers off the transaction at truncate time.
The same thing _could_ happen with other filesystems; not too sure about
that.
Still. I suggest you look at freeing up page->list from anon/swapcache
pages. It really doesn't do much.
Meanwhile, I backed out that bit - I don't actually see where the failure is
anyway. The page is page_mapped(), !PageAnon and ->mapping == NULL.
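That state, as a predicate (an illustrative sketch, not code from the
patch):

/*
 * A page that is in somebody's page tables, yet is neither
 * anonymous nor in any file mapping.
 */
static inline int page_is_awol(struct page *page)
{
	return page_mapped(page) && !PageAnon(page) &&
		page->mapping == NULL;
}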
On Thu, Mar 20, 2003 at 10:48:13PM -0800, Andrew Morton wrote:
> This all needs to be redone with oprofile, find out what on earth is going
> on.
How about this?
plain -mm:
vma samples % symbol name
c0106ff4 3819286 51.7395 default_idle
c01df3f0 381341 5.16599 __copy_to_user_ll
c0119760 163634 2.21674 load_balance
c0119cfc 136901 1.85459 rebalance_tick
c011fe3c 122794 1.66348 profile_hook
c014442c 115477 1.56436 do_anonymous_page
c0118db8 114318 1.54866 try_to_wake_up
c024c418 114230 1.54746 increment_tail
c0114888 104490 1.41552 smp_apic_timer_interrupt
c011a2f4 103599 1.40345 schedule
c0139b78 100032 1.35512 __get_page_state
c0135948 93468 1.2662 find_get_page
c0119e70 77881 1.05505 scheduler_tick
c024c440 71306 0.965976 sync_buffer
c0149718 62369 0.844907 page_add_rmap
c0113730 59338 0.803847 mark_offset_tsc
c016b87c 52394 0.709777 d_lookup
c0144e6c 51196 0.693548 handle_mm_fault
c0142614 47403 0.642164 zap_pte_range
c01189c0 46848 0.634646 kmap_atomic
c0149964 44502 0.602865 page_remove_rmap
c01448a4 44499 0.602824 do_no_page
anon:
vma samples % symbol name
c0106ff4 4665308 52.2771 default_idle
c01de800 438006 4.90808 __copy_to_user_ll
c01197e0 184918 2.0721 load_balance
c011ff2c 180525 2.02287 profile_hook
c0118e38 179582 2.01231 try_to_wake_up
c0119d7c 150162 1.68264 rebalance_tick
c011a374 146226 1.63854 schedule
c01440ac 130928 1.46711 do_anonymous_page
c024b828 120327 1.34832 increment_tail
c0139cb8 114392 1.28182 __get_page_state
c0135ae4 111511 1.24954 find_get_page
c0114888 103236 1.15681 smp_apic_timer_interrupt
c0119ef0 82291 0.922112 scheduler_tick
c024b850 73072 0.818808 sync_buffer
c0113730 67964 0.761571 mark_offset_tsc
c01d6e5c 66398 0.744023 ipc_lock
c016acac 64492 0.722665 d_lookup
c014496c 60306 0.675759 handle_mm_fault
c01422e4 58951 0.660576 zap_pte_range
c0149780 56439 0.632427 page_add_rmap
c0141f44 55806 0.625334 copy_page_range
c0118a30 53581 0.600402 kmap_atomic
Multiplicative differential profiling says (functions with 0 hits in either
profile are omitted; a worked example of the ratio follows the table):
function (anon fraction hits)/(-mm fraction hits)
---------------------------------------------------------------
task_curr 9.09879
read_pages 5.79014
group_send_sig_info 3.89948
pid_revalidate 3.72223
.text.lock.exec 3.54498
queue_work 3.51544
ext2_count_dirs 3.44207
vmtruncate 3.38385
datagram_poll 3.30865
move_vma 3.30865
net_rx_action 3.30865
init_once 3.30865
init_once 3.30865
ipc_unlock 3.18818
kstat_read_proc 2.82614
add_to_page_cache_lru 2.67237
write_null 2.64692
restore_i387_fxsave 2.61935
__wake_up_locked 2.58488
collect_sigign_sigcatch 2.58236
proc_calc_metrics 2.56421
end_lapic_irq 2.48149
get_sample_stats 2.48149
name_to_int 2.48149
d_unhash 2.2747
create_buffers 2.16535
block_read_full_page 2.0771
tty_write 2.06791
free_hot_page 2.06791
task_dumpable 2.03031
sys_select 1.98519
sys_ftruncate 1.98519
__getblk_slow 1.93005
move_one_page 1.93005
proc_root_link 1.93005
proc_pid_stat 1.90668
set_bh_page 1.89651
.text.lock.sys_i386 1.86112
proc_pid_status 1.81188
sys_exit_group 1.80472
page_cache_read 1.80472
task_vsize 1.73187
get_wchan 1.72952
ext2_destroy_inode 1.72952
expand_files 1.7205
__get_free_pages 1.71341
do_select 1.71341
sys_msgrcv 1.70893
create_empty_buffers 1.67713
do_fcntl 1.66993
fcntl_setlk 1.65433
__find_get_block_slow 1.65433
task_statm 1.65433
unix_sock_destructor 1.65433
__kfree_skb 1.65433
flock_to_posix_lock 1.65433
locks_copy_lock 1.65433
eth_header 1.65433
kfree_skbmem 1.65433
ll_merge_requests_fn 1.65433
credit_entropy_store 1.65433
write_chan 1.65433
ipc_addid 1.65433
init_once 1.65433
init_once 1.65433
seq_printf 1.65433
lookup_create 1.65433
.text.lock.signal 1.65433
.text.lock.signal 1.65433
do_mremap 1.65433
sock_create 1.65433
sys_readlink 1.65433
bio_hw_segments 1.65433
__switch_to 1.63754
block_truncate_page 1.63415
convert_fxsr_from_user 1.62848
pmd_ctor 1.61586
.text.lock.char_dev 1.61079
unmap_underlying_metadata 1.58902
pty_close 1.57161
ext2_rename 1.56242
mm_notify 1.56242
__write_lock_failed 1.56154
ext2_empty_dir 1.55701
mempool_alloc_slab 1.55093
clear_inode 1.53616
pipe_ioctl 1.53616
restore_sigcontext 1.52319
tty_fasync 1.52273
default_llseek 1.52091
mpage_readpage 1.51647
lru_cache_add 1.51647
vfs_rename_other 1.51253
ext2_find_near 1.50836
handle_ra_miss 1.48889
.text.lock.tty_io 1.48513
tty_release 1.46073
cpu_buffer_reset 1.4538
sys_sysctl 1.44754
get_chrfops 1.43988
testmsg 1.43043
sys_rename 1.42874
send_signal 1.42874
remove_from_page_cache 1.4278
lock_rename 1.42272
send_sig_info 1.41799
iput 1.40548
drop_buffers 1.40407
pipe_release 1.39699
ext2_preread_inode 1.38621
setup_sigcontext 1.37861
__bforget 1.37861
nr_running 1.37861
unix_stream_connect 1.37861
sock_poll 1.37861
sock_alloc 1.37861
ext2_write_inode 1.37861
si_meminfo 1.37861
vfs_create 1.37037
.text.lock.inode 1.36307
do_IRQ 1.35899
filp_dtor 1.35514
vfs_readdir 1.35443
get_signal_to_deliver 1.34771
get_pid_list 1.34414
parse_table 1.34414
read_cache_page 1.33028
proc_read_inode 1.32991
sys_munmap 1.3247
cpu_idle 1.32205
sys_fcntl64 1.31594
get_exec_dcookie 1.31383
remove_suid 1.30668
__block_prepare_write 1.30319
.text.lock.semaphore 1.30284
save_i387_fxsave 1.29983
pipefs_delete_dentry 1.29983
sys_mremap 1.29983
try_to_wake_up 1.29939
do_gettimeofday 1.29589
update_wall_time 1.2958
sys_access 1.29537
__generic_file_aio_read 1.29341
pty_open 1.29186
try_atomic_semop 1.28717
mpage_alloc 1.2867
.text.lock.ioctl 1.2867
sys_fork 1.2867
change_protection 1.2847
cache_grow 1.28443
sys_sigreturn 1.28326
free_uid 1.2821
sys_getrlimit 1.2821
worker_thread 1.27834
release_x86_irqs 1.27464
recalc_bh_state 1.26659
vfs_getattr 1.26275
ipc_lock 1.26266
sk_alloc 1.26251
sys_chdir 1.26142
__set_page_buffers 1.26108
igrab 1.25386
__read_lock_failed 1.25354
__bread 1.25108
convert_fxsr_to_user 1.248
find_group_other 1.24724
ext2_delete_inode 1.24074
work_notifysig 1.24074
__down_failed 1.24074
ip_route_input_slow 1.24074
task_mem 1.24074
max_select_fd 1.24074
as_antic_stop 1.24074
as_find_first_arq 1.24074
elv_may_queue 1.24074
uart_write 1.24074
do_getitimer 1.24074
hugetlb_report_meminfo 1.24074
elv_try_last_merge 1.24074
d_free 1.23032
add_kernel_ctx_switch 1.22593
__down 1.22447
__unhash_process 1.22427
init_new_context 1.22177
wake_up_process 1.22097
ext2_put_inode 1.21898
sys_msgsnd 1.21686
profile_hook 1.21605
sys_newuname 1.21575
__rb_rotate_right 1.21479
__exit_sighand 1.21278
dupfd 1.20893
sys_msgget 1.20643
ack_lapic_irq 1.2057
sys_getppid 1.20529
reap_timer_fnc 1.20383
copy_one_pte 1.20315
vfs_follow_link 1.19977
ipc_findkey 1.19939
destroy_context 1.19479
get_one_pte_map_nested 1.19479
free_pages_and_swap_cache 1.19047
dentry_open 1.18975
generic_drop_inode 1.1856
wait_task_zombie 1.18542
kmap_high 1.18276
load_elf_interp 1.18197
proc_pid_readlink 1.18166
vfs_rmdir 1.18166
sys_getgid 1.18166
ksoftirqd 1.18166
inode_sub_bytes 1.17735
load_elf_binary 1.17704
ext2_alloc_inode 1.17181
ext2_free_inode 1.16916
alloc_inode 1.16856
schedule 1.16751
__dequeue_signal 1.16544
ext2_new_inode 1.16451
detach_pid 1.16449
__insert_inode_hash 1.16399
cap_bprm_compute_creds 1.1626
d_move 1.1591
flush_old_exec 1.15845
filp_ctor 1.15803
sys_umask 1.15803
__unix_insert_socket 1.15803
mpage_bio_submit 1.15803
smp_send_reschedule 1.15803
filemap_getpage 1.15684
eventpoll_init_file 1.15425
setup_frame 1.15407
generic_file_open 1.15376
__wake_up_sync 1.15347
add_cpu_switch 1.15027
profile_exit_mmap 1.1502
__brelse 1.14943
sched_best_cpu 1.14599
__copy_from_user_ll 1.14494
__vma_link_rb 1.14359
check_highmem_ptes 1.14309
unmap_region 1.14208
.text.lock.mmap 1.14014
d_callback 1.13996
eligible_child 1.139
kmem_flagcheck 1.13735
unix_release_sock 1.13735
sock_fasync 1.13735
.text.lock.fork 1.13735
expand_stack 1.13593
copy_thread 1.13512
copy_namespace 1.13508
read_inode_bitmap 1.13258
sys_ioctl 1.13138
mprotect_fixup 1.13137
d_path 1.13046
ext2_setattr 1.12795
do_signal 1.12614
recalc_sigpending 1.12412
lookup_hash 1.12409
flush_signal_handlers 1.11807
inode_change_ok 1.11762
clear_page_tables 1.11716
cache_flusharray 1.11529
free_buffer_head 1.11528
find_dcookie 1.11487
alloc_pidmap 1.11417
copy_mm 1.11406
do_group_exit 1.11402
add_user_ctx_switch 1.11355
tty_open 1.11207
rm_from_queue 1.11076
__free_pages_ok 1.11029
destroy_compound_page 1.10995
group_release_blocks 1.10824
mmput 1.10794
vfs_rename 1.10756
fput 1.10624
ext2_add_link 1.1051
pipe_write 1.10345
get_pipe_inode 1.10288
sprintf 1.10288
grow_dev_page 1.10288
mpage_readpages 1.10288
add_disk_randomness 1.10288
sem_revalidate 1.10288
ext2_write_super 1.10288
release_dev 1.10288
handle_IRQ_event 1.10214
remove_shared_vm_struct 1.10116
group_reserve_blocks 1.101
may_open 1.0957
add_wait_queue 1.09563
vfs_lstat 1.09537
setup_arg_pages 1.09501
vma_link 1.09406
ext2_reserve_inode 1.09374
sigprocmask 1.09251
mm_alloc 1.09234
deny_write_access 1.09157
prepare_binprm 1.09115
add_to_page_cache 1.08975
update_wall_time_one_tick 1.08786
sync_cpu_buffers 1.08767
__exit_signal 1.08758
init_fpu 1.08757
radix_tree_insert 1.08415
flush_all_zero_pkmaps 1.08415
timer_interrupt 1.08394
ext2_follow_link 1.08387
generic_fillattr 1.08236
read_block_bitmap 1.08207
release_task 1.08195
ext2_new_block 1.08167
complete 1.08167
flush_sigqueue 1.08167
pipe_new 1.08167
ext2_alloc_branch 1.08052
inode_update_time 1.08025
permission 1.07975
sys_rt_sigprocmask 1.07815
old_mmap 1.07604
alloc_skb 1.07531
radix_tree_node_alloc 1.07531
sys_time 1.07475
inode_add_bytes 1.07386
nr_blockdev_pages 1.07348
do_timer 1.07189
ext2_free_blocks 1.07163
get_jiffies_64 1.06996
compute_creds 1.06907
search_binary_handler 1.06903
attach_pid 1.06859
flush_thread 1.06779
check_tty_count 1.06731
ext2_count_free_blocks 1.06612
exec_mmap 1.06602
down_tty_sem 1.06593
.text.lock.sched 1.06539
set_page_address 1.06474
serial_in 1.0635
.text.lock.timer 1.0635
d_rehash 1.06277
kill_fasync 1.06229
pgd_alloc 1.06086
vm_acct_memory 1.05966
end_level_ioapic_irq 1.05877
sys_wait4 1.05803
remove_wait_queue 1.05742
next_thread 1.0574
ipc_checkid 1.05544
prep_compound_page 1.0554
kmalloc 1.055
invalidate_vcache 1.05463
ext2_find_entry 1.05368
generic_file_llseek 1.05355
.text.lock.rcupdate 1.0535
proc_get_inode 1.053
copy_page_range 1.05286
vfs_fstat 1.0525
padzero 1.05148
__might_sleep 1.04971
copy_files 1.04923
do_page_fault 1.04879
sys_semget 1.04878
cap_bprm_set_security 1.04852
.text.lock.inode 1.04852
vfs_write 1.04813
unmap_vma 1.04774
.text.lock.buffer_sync 1.04774
vfs_stat 1.04761
wait_for_completion 1.04748
can_vma_merge_before 1.04676
__vma_link 1.04672
dup_task_struct 1.04642
inode_times_differ 1.04585
try_to_release_page 1.04566
file_move 1.04555
pipe_read_fasync 1.04484
ext2_block_to_path 1.04432
sys_stat64 1.0438
proc_alloc_inode 1.04307
sys_lstat64 1.04191
find_vma_prepare 1.04176
ext2_release_file 1.04161
__block_commit_write 1.0414
mark_buffer_dirty_inode 1.04108
prepare_to_copy 1.03986
rwsem_down_write_failed 1.03603
do_wp_page 1.036
__fput 1.03586
path_lookup 1.0356
split_vma 1.03514
sys_lookup_dcookie 1.03395
sys_geteuid 1.03395
proc_exe_link 1.03395
sys_fchdir 1.03395
__pdflush 1.03395
take_task_mm 1.03155
__rb_erase_color 1.03012
do_mmap_pgoff 1.02971
notify_change 1.02914
sys_llseek 1.02896
file_kill 1.02868
zap_pte_range 1.02867
exit_itimers 1.02836
si_swapinfo 1.02824
free_pages_bulk 1.02808
do_munmap 1.02776
pte_alloc_one 1.02756
filp_close 1.02755
do_page_cache_readahead 1.02599
kmem_cache_free 1.02594
restore_fpu 1.02496
mm_release 1.02307
__pagevec_lru_add 1.0222
get_empty_filp 1.02081
do_brk 1.02077
copy_strings_kernel 1.02075
do_generic_mapping_read 1.02011
copy_process 1.01847
d_lookup 1.01816
blk_recount_segments 1.01805
sys_brk 1.0174
__free_pages 1.01634
exit_aio 1.01563
sched_balance_exec 1.01538
file_read_actor 1.01537
sys_close 1.01532
destroy_inode 1.01477
ext2_get_group_desc 1.01373
hash_vcache 1.01256
get_offset_tsc 1.01243
reschedule_interrupt 1.01236
do_writepages 1.01098
default_idle 1.01039
free_msg 1.00914
migration_thread 1.00852
page_cache_readahead 1.00658
ext2_commit_chunk 1.00587
free_pages 1.00505
__wake_up_common 1.00433
smp_reschedule_interrupt 1.00184
unmap_page_range 1.00175
current_kernel_time 1.00159
rcu_do_batch 1.0013
file_ra_state_init 1.00099
__lookup 0.999648
mm_init 0.999333
handle_signal 0.998504
flush_tlb_mm 0.998044
smp_call_function_interrupt 0.997789
__mark_inode_dirty 0.997741
fd_install 0.997179
copy_strings 0.996865
elf_map 0.996761
proc_delete_dentry 0.996582
get_unused_fd 0.995773
sched_exit 0.99544
strnlen_user 0.995277
update_atime 0.993595
file_ioctl 0.992596
get_dcookie 0.992596
schedule_timeout 0.992596
ext2_get_branch 0.992596
smp_call_function 0.992596
create_elf_tables 0.992404
syscall_call 0.991656
alloc_buffer_head 0.991035
atomic_dec_and_lock 0.990849
sys_remap_file_pages 0.990708
__tasklet_schedule 0.99054
free_block 0.989915
do_fork 0.989352
release_mm 0.98927
find_get_page 0.986838
generic_delete_inode 0.986474
rcu_start_batch 0.986233
send_IPI_mask_sequence 0.985556
nr_filled_slots 0.985058
rwsem_wake 0.984278
release_thread 0.983653
filemap_nopage 0.983273
pipe_read 0.983104
release_pages 0.982928
vfs_read 0.982926
unmap_vma_list 0.982828
supplemental_group_member 0.982683
build_mmap_rb 0.982256
__posix_lock_file 0.982256
link_path_walk 0.97905
.text.lock.swap 0.978936
kunmap_atomic 0.977014
block_invalidatepage 0.976441
fget 0.976335
find_vma 0.9763
kmap 0.976256
add_cookie_switch 0.976116
sys_unlink 0.975111
do_invalidatepage 0.974871
update_one_process 0.974771
find_busiest_node 0.974726
handle_mm_fault 0.974351
page_address 0.974014
do_pipe 0.973563
do_lookup 0.97356
do_truncate 0.973133
unmap_vmas 0.97254
do_exit 0.972067
vfs_unlink 0.972054
nr_free_pages 0.972008
ext2_get_block 0.971733
free_pgtables 0.970872
get_write_access 0.968651
slab_destroy 0.968569
exit_mmap 0.968157
sys_fstat64 0.967667
generic_make_request 0.96736
apic_timer_interrupt 0.966854
cached_lookup 0.966814
find_vma_prev 0.966762
free_hot_cold_page 0.966525
m_start 0.965548
ext2_free_branches 0.965529
.text.lock.util 0.965024
locate_fd 0.965024
smp_invalidate_interrupt 0.965024
get_zone_counts 0.965023
try_to_free_buffers 0.964726
do_sigaction 0.964593
pgd_free 0.964479
cache_alloc_refill 0.963764
vfs_permission 0.963387
__set_page_dirty_nobuffers 0.963266
page_cache_readaround 0.963002
effective_prio 0.962562
generic_forget_inode 0.961298
init_dev 0.96114
tasklet_action 0.960979
do_execve 0.959841
open_namei 0.959727
get_unmapped_area 0.959157
wake_up_forked_process 0.958575
get_tty_driver 0.958459
sys_mprotect 0.957651
__rmqueue 0.957625
lru_cache_add_active 0.956839
radix_tree_delete 0.955954
strncpy_from_user 0.955211
common_interrupt 0.954965
rcu_process_callbacks 0.954943
do_softirq 0.95493
device_not_available 0.954419
blk_remove_plug 0.954419
__alloc_pages 0.952787
vma_merge 0.952397
irq_entries_start 0.952003
radix_tree_lookup 0.951638
bh_waitq_head 0.951237
.text.lock.page_alloc 0.950563
ext2_delete_entry 0.950357
add_sample 0.95034
__copy_to_user_ll 0.950074
autoremove_wake_function 0.949874
inode_setattr 0.948804
syscall_exit 0.948399
mark_page_accessed 0.947564
mark_offset_tsc 0.947408
kmap_atomic 0.946043
__get_page_state 0.945906
open_exec 0.945135
generic_file_read 0.944971
generic_file_aio_write_nolock 0.944026
cp_new_stat64 0.944024
ext2_inode_by_name 0.942581
run_timer_softirq 0.941371
restore_all 0.941273
__get_user_4 0.941105
set_cpus_allowed 0.940637
add_event_entry 0.940402
schedule_tail 0.939958
render_sigset_t 0.939321
generic_file_write 0.939175
filp_open 0.938787
rcu_check_quiescent_state 0.938045
do_anonymous_page 0.937839
count_open_files 0.936963
notifier_call_chain 0.936895
resume_userspace 0.93609
buffered_rmqueue 0.935896
mod_timer 0.93579
__pagevec_lru_add_active 0.935714
page_fault 0.935649
.text.lock.file_table 0.935417
load_balance 0.934753
generic_file_write_nolock 0.934505
get_page_cache_size 0.933775
do_mpage_readpage 0.933137
new_inode 0.930292
pte_alloc_map 0.929736
ret_from_exception 0.929286
bad_range 0.928216
clear_user 0.928191
ext2_alloc_block 0.92757
do_no_page 0.927484
vm_enough_memory 0.925605
kunmap_high 0.925301
balance_dirty_pages_ratelimited 0.923248
scsi_io_completion 0.923066
idle_cpu 0.922739
ext2_release_inode 0.922605
vsnprintf 0.922524
update_process_times 0.922382
fasync_helper 0.920139
event_buffer_read 0.920008
ext2_prepare_write 0.919862
d_alloc 0.919581
unix_create1 0.91907
n_tty_ioctl 0.91907
follow_page 0.91907
mark_buffer_dirty 0.918178
__user_walk 0.916517
find_get_pages 0.91515
sync_supers 0.914233
save_i387 0.911759
locks_remove_flock 0.909879
de_put 0.909853
real_lookup 0.909687
proc_lookup 0.909153
__pagevec_free 0.908626
sys_rt_sigaction 0.907711
eventpoll_release 0.907527
rebalance_tick 0.907287
copy_semundo 0.90594
.text.lock.dcache 0.905811
set_brk 0.904709
detach_vmas_to_be_unmapped 0.903867
sys_getpgrp 0.90236
dequeue_signal 0.900148
sys_dup2 0.899615
system_call 0.899586
up_tty_sem 0.898063
sys_write 0.897262
free_task_struct 0.896924
getrusage 0.896093
pipe_read_release 0.896093
lookup_dcookie 0.892876
rb_insert_color 0.891897
page_waitqueue 0.891785
ext2_truncate 0.891498
error_code 0.891106
page_remove_rmap 0.890508
call_rcu 0.890118
proc_root_lookup 0.889294
ext2_readdir 0.8892
sys_execve 0.88732
.text.lock.scsi_lib 0.886246
put_unused_fd 0.88525
path_release 0.885245
insert_vm_struct 0.884975
sys_open 0.884771
add_sample_entry 0.884231
pipe_write_fasync 0.884209
filldir64 0.883406
find_lock_page 0.882984
number 0.88162
sys_getpid 0.881108
run_local_timers 0.8806
check_ttfb_buffer 0.880528
sys_read 0.879921
zap_pmd_range 0.879469
sys_pipe 0.878861
internal_add_timer 0.8776
raise_softirq 0.877385
nr_free_highpages 0.877294
chrdev_open 0.876253
pipe_wait 0.875089
kmem_cache_alloc 0.874574
scheduler_tick 0.874001
in_group_p 0.873357
__iget 0.872487
increment_tail 0.871313
.text.lock.root 0.870698
radix_tree_preload 0.870133
.text.lock.namei 0.869401
prep_new_page 0.868889
do_flush_tlb_all 0.868869
sys_vfork 0.868521
__mmdrop 0.868521
dnotify_flush 0.868297
proc_pid_lookup 0.866552
__find_get_block 0.86641
generic_file_mmap 0.864466
test_clear_page_dirty 0.864399
locks_remove_posix 0.864385
get_page_state 0.862991
kfree 0.861918
follow_mount 0.861344
.text.lock.dec_and_lock 0.861283
.text.lock.highmem 0.860662
find_inode_fast 0.859958
can_vma_merge_after 0.857848
rb_erase 0.856007
kunmap 0.855686
lru_add_drain 0.851885
generic_commit_write 0.851333
rcu_check_callbacks 0.850151
ext2_create 0.847842
sync_buffer 0.847649
ext2_update_inode 0.847091
block_prepare_write 0.846626
__wake_up 0.84635
m_stop 0.841343
seq_read 0.839309
page_slot 0.837905
dput 0.837362
modules_open 0.836253
seq_release 0.833888
__getblk 0.832907
d_delete 0.832116
getname 0.831301
count 0.830045
ext2_discard_prealloc 0.829132
batch_entropy_process 0.827163
proc_pid_cmdline 0.827163
__remove_from_page_cache 0.827163
flush_tlb_others 0.827163
proc_pid_make_inode 0.827163
ext2_check_page 0.827163
select_bits_free 0.827163
pdflush_operation 0.827163
ext2_ioctl 0.827163
mempool_free_slab 0.827163
__netdev_rx 0.827163
n_tty_receive_buf 0.827163
attempt_merge 0.827163
ext2_find_shared 0.827163
__pollwait 0.827163
ip_route_input 0.827163
__blk_run_queue 0.827163
serial8250_interrupt 0.827163
uart_write_room 0.827163
n_tty_chars_in_buffer 0.827163
locks_delete_lock 0.827163
get_dirty_limits 0.827163
__unix_remove_socket 0.827163
arp_rcv 0.827163
tcp_transmit_skb 0.827163
tcp_ack 0.827163
qdisc_restart 0.827163
process_backlog 0.827163
sys_socket 0.827163
nmi_cpu_stop 0.827163
exit_as_io_context 0.827163
kblockd_schedule_work 0.827163
ll_front_merge_fn 0.827163
receive_chars 0.827163
normal_poll 0.827163
semctl_main 0.827163
ext2_commit_super 0.827163
init_once 0.827163
proc_self_follow_link 0.827163
init_once 0.827163
.text.lock.binfmt_elf 0.827163
d_splice_alias 0.827163
assign_type 0.827163
blkdev_writepage 0.827163
end_buffer_io_sync 0.827163
chown_common 0.827163
delayed_work_timer_fn 0.827163
__d_path 0.827163
ext2_last_byte 0.827163
proc_check_root 0.827163
blk_congestion_wait 0.827163
ext2_readpage 0.827163
poll_freewait 0.827163
grab_block 0.822957
is_bad_inode 0.822786
wake_up_inode 0.821691
flush_tlb_page 0.821048
sys_getcwd 0.820438
seq_open 0.820173
smp_apic_timer_interrupt 0.817236
inode_has_buffers 0.814755
access_process_vm 0.81463
__cond_resched 0.813143
lookup_mnt 0.809749
activate_page 0.807376
ext2_set_inode_flags 0.806988
add_timer_randomness 0.805954
math_state_restore 0.805954
cache_init_objs 0.803679
set_close_on_exec 0.803187
__rb_rotate_left 0.802835
__clear_page_buffers 0.801602
balance_node 0.797724
alloc_undo 0.797265
sys_getgroups 0.795349
d_instantiate 0.795207
mempool_free 0.789565
bio_put 0.789565
kernel_read 0.789132
inode_init_once 0.788056
wake_up_buffer 0.787009
install_page 0.786485
sys_setrlimit 0.785805
radix_tree_gang_lookup 0.784379
memory_open 0.783628
ext2_get_inode 0.778301
__kill_fasync 0.775465
rmqueue_bulk 0.772631
set_fs_pwd 0.772384
can_share_swap_page 0.768732
cascade 0.767368
bh_lru_install 0.767006
proc_delete_inode 0.766163
store_msg 0.762287
vsprintf 0.76099
__up_wakeup 0.758233
unlock_page 0.758186
set_binfmt 0.754786
as_add_arq_rb 0.751966
sys_lseek 0.751966
work_resched 0.750574
inode_needs_sync 0.749616
page_add_rmap 0.748517
prepare_to_wait 0.748386
as_remove_request 0.748386
pipe_write_release 0.744447
buffer_insert_list 0.744447
ipcperms 0.739382
add_entropy_words 0.738538
__copy_user_intel 0.731801
ext2_get_page 0.729301
scsi_end_request 0.728886
expand_fd_array 0.727655
add_us_sample 0.724798
sys_getrusage 0.723768
__copy_user_zeroing_intel 0.722689
scsi_alloc_sgtable 0.719272
get_new_inode_fast 0.715814
truncate_inode_pages 0.713972
ext2_unlink 0.713972
sched_migrate_task 0.713072
ext2_make_empty 0.708997
writeback_in_progress 0.708997
unlock_rename 0.708997
sys_quotactl 0.707684
proc_file_read 0.706535
update_queue 0.705372
lookup_chrfops 0.705185
select_parent 0.701518
invalidate_inode_buffers 0.695867
proc_fd_link 0.689303
pagevec_lookup 0.685364
default_wake_function 0.680112
proc_base_lookup 0.677814
next_signal 0.67677
do_notify_resume 0.675622
ext2_count_free_inodes 0.672946
unlock_buffer 0.670673
cpu_raise_softirq 0.668093
unlock_new_inode 0.666326
submit_bio 0.66173
scsi_softirq 0.66173
get_user_pages 0.66173
ext2_lookup 0.659145
__sync_single_inode 0.6565
finish_wait 0.654036
scsi_dispatch_cmd 0.651171
sys_rmdir 0.643349
syscall_exit_work 0.641051
sys_semtimedop 0.639077
scsi_queue_next_request 0.628989
sys_ipc 0.625478
clear_queue_congested 0.622805
scsi_get_request_dev 0.620372
serial_out 0.620372
elv_merge 0.620372
skip_atoi 0.618149
bio_alloc 0.615108
restore_i387 0.61507
sys_getdents64 0.611874
as_find_next_arq 0.611381
work_pending 0.609489
end_page_writeback 0.608556
mempool_alloc 0.604465
as_find_arq_hash 0.598103
find_group_orlov 0.597395
__blk_put_request 0.596642
mpage_writepage 0.596313
ahc_linux_run_complete_queue 0.594523
sys_socketcall 0.590831
ext2_mkdir 0.590831
ext2_writepages 0.590831
__put_task_struct 0.579014
sys_gettimeofday 0.579014
scsi_get_command 0.574661
iget_locked 0.572849
tty_ioctl 0.570457
load_script 0.567197
put_as_io_context 0.565954
filemap_populate 0.564866
scsi_done 0.563562
elv_queue_empty 0.551442
sysctl_string 0.551442
sys_mkdir 0.551442
unix_write_space 0.551442
neigh_update 0.551442
sock_wmalloc 0.551442
hash_dcookie 0.551442
dget_locked 0.551442
sys_chmod 0.551442
do_sysctl_strategy 0.551442
del_timer_sync 0.548712
ahc_done 0.546995
mpage_end_io_write 0.543212
sync_sb_inodes 0.53592
end_buffer_async_write 0.533653
elv_next_request 0.531748
as_insert_request 0.531748
batch_entropy_store 0.526376
sys_getuid 0.526376
ahc_linux_run_device_queue 0.52433
proc_destroy_inode 0.52298
writeback_inodes 0.522419
radix_tree_extend 0.522419
pid_base_iput 0.516977
i_waitq_head 0.512841
load_msg 0.510284
as_remove_dispatched_request 0.509023
proc_info_read 0.509023
remove_arg_zero 0.509023
scsi_init_io 0.501874
scsi_host_busy_dec_and_test 0.499934
scsi_init_cmd_errh 0.497986
.text.lock.inode 0.496298
scsi_free_sgtable 0.496298
intr_handler 0.496298
ll_back_merge_fn 0.496298
locks_init_lock 0.496298
dev_watchdog 0.496298
netif_rx 0.496298
skb_release_data 0.496298
bio_phys_segments 0.496298
scsi_add_timer 0.496298
__up 0.496298
as_add_arq_hash 0.495436
__as_add_arq_rb 0.492359
__make_request 0.490844
writeback_release 0.49069
as_add_request 0.490171
as_choose_req 0.487967
end_that_request_first 0.486566
.text.lock.fs_writeback 0.484889
ahc_run_qoutfifo 0.482512
fn_hash_lookup 0.472665
show_cpuinfo 0.472665
truncate_complete_page 0.471169
bio_add_page 0.469471
prune_dcache 0.468943
scsi_put_command 0.467938
rb_next 0.467527
bio_endio 0.465279
as_next_request 0.462238
del_timer 0.460367
init_page_buffers 0.459535
ahc_linux_isr 0.457226
pid_delete_dentry 0.45118
__block_write_full_page 0.449323
end_that_request_last 0.448462
disk_round_stats 0.44791
as_move_to_dispatch 0.443123
meminfo_read_proc 0.440264
bio_destructor 0.433922
elv_remove_request 0.431563
add_timer 0.430814
as_find_arq_rb 0.430462
sd_rw_intr 0.426923
ahc_linux_queue 0.420474
scsi_finish_command 0.419944
writeback_acquire 0.419896
scsi_prep_fn 0.419407
neigh_lookup 0.413581
uart_start 0.413581
do_setitimer 0.413581
call_function_interrupt 0.413581
sys_connect 0.413581
sockfd_lookup 0.413581
move_addr_to_kernel 0.413581
as_merged_request 0.413581
elv_rq_merge_ok 0.413581
uart_write_wakeup 0.413581
tty_read 0.413581
freeque 0.413581
ext2_readpages 0.413581
sys_symlink 0.413581
kill_proc_info 0.413581
sys_getegid 0.413581
sys_alarm 0.413581
scsi_decide_disposition 0.407409
as_update_arq 0.401417
ahc_linux_queue_cmd_complete 0.396769
blk_rq_map_sg 0.395778
write_inode 0.3956
sd_init_command 0.389812
.text.lock.inode 0.381768
drive_stat_acct 0.375983
find_task_by_pid 0.375983
scsi_delete_timer 0.367628
__writeback_single_inode 0.367628
as_merge 0.367628
__end_that_request_first 0.363778
mpage_writepages 0.362196
as_queue_notready 0.354498
proc_lookupfd 0.354498
scsi_request_fn 0.35021
as_dispatch_request 0.340056
mark_buffer_async_write 0.330865
sock_init_data 0.330865
locks_insert_lock 0.330865
wb_kupdate 0.330865
second_overflow 0.330865
rb_prev 0.328744
ksize 0.327419
as_remove_queued_request 0.326863
__scsi_get_command 0.321674
.text.lock.ll_rw_blk 0.320244
get_request 0.316929
sock_map_fd 0.310186
bio_get_nr_vecs 0.310186
get_as_io_context 0.30376
setattr_mask 0.295415
submit_bh 0.275721
ret_from_fork 0.275721
ip_queue_xmit 0.275721
.text.lock.hosts 0.275721
do_sysctl 0.275721
put_fs_struct 0.275721
io_schedule 0.275721
end_bio_bh_io_sync 0.261935
blk_run_queues 0.261209
set_queue_congested 0.25734
as_fifo_expired 0.254512
__pagevec_release 0.248149
as_complete_arq 0.248149
as_update_iohist 0.243283
mask_and_ack_level_ioapic_irq 0.236332
read_chan 0.229767
radix_tree_node_ctor 0.227243
io_schedule_timeout 0.22559
transmit_chars 0.206791
schedule_work 0.206791
.text.lock.scsi 0.179077
unix_find_other 0.165433
ext2_set_link 0.165433
queue_delayed_work 0.165433
shrink_dcache_parent 0.137861
block_write_full_page 0.137861
blk_queue_bounce 0.118166
blk_plug_device 0.118166
init_buffer_head 0.103785
ss_wakeup 0.0973133
mpage_end_io_read 0.0580465
__lock_page 0.0516977
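To make the metric concrete: a function's "fraction hits" is its share of
that kernel's total samples, and each row is the ratio of the anon share
to the -mm share.  Checking against the raw profiles above, page_add_rmap
took 0.632427% of samples under anon and 0.844907% under plain -mm, and
0.632427 / 0.844907 = 0.748517, the value in the table.  As a sketch:

/* Ratio reported per function above: >1.0 means relatively
 * hotter in the anon kernel, <1.0 relatively hotter in -mm. */
double differential(double anon_fraction, double mm_fraction)
{
	return anon_fraction / mm_fraction;
}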
On Thu, Mar 20, 2003 at 10:48:13PM -0800, Andrew Morton wrote:
>> This all needs to be redone with oprofile, find out what on earth is going
>> on.
On Thu, Mar 20, 2003 at 11:07:46PM -0800, William Lee Irwin III wrote:
> How about this?
These were profiles of kernel compiles on 16x/16GB NUMA-Q.
The profiles leave me more confused than when I started. Cache effects
are potentially responsible.
-- wli
Hi,
On Fri, 2003-03-21 at 06:48, Andrew Morton wrote:
> It goes BUG in try_to_free_buffers().
>
> We really should fix this up for other reasons, probably by making ext3's
> per-page truncate operations wait on commit, and be more aggressive about
> pulling the page's buffers off the transaction at truncate time.
Ouch.
> The same thing _could_ happen with other filesystems; not too sure about
> that.
XFS used to have synchronous truncates, for similar sorts of reasons.
It was dog slow for unlinks. They worked pretty hard to fix that; I'd
really like to avoid adding extra synchronicity to ext3 in this case.
Pulling buffers off the transaction more aggressively would certainly be
worth looking at. Trouble is, if a truncate transaction on disk gets
interrupted by a crash, you really do have to be able to undo it, so you
simply don't have the luxury of throwing the buffers away until a commit
has occurred (unless you're in writeback mode.)
--Stephen
"Stephen C. Tweedie" <[email protected]> wrote:
>
> Hi,
>
> On Fri, 2003-03-21 at 06:48, Andrew Morton wrote:
>
> > It goes BUG in try_to_free_buffers().
> >
> > We really should fix this up for other reasons, probably by making ext3's
> > per-page truncate operations wait on commit, and be more aggressive about
> > pulling the page's buffers off the transaction at truncate time.
>
> Ouch.
But this is specifically for the case where truncate finds the page's buffers
are attached to the committing transaction.
At present we just give up; this can result in an alarming number of pages
floating about on the LRU with no references at all except for their buffers.
These pages cause the overcommit accounting to make grossly wrong decisions.
I have a patch in -mm which liberates the pages when the commit has
completed, but I don't like it - that freeing really should happen in the
context of the truncate, not at some time in the future. Doing it this way
means that the pages are either pagecache or free, and there is not a time
window in which they are simply AWOL.
> > The same thing _could_ happen with other filesystems; not too sure about
> > that.
>
> XFS used to have synchronous truncates, for similar sorts of reasons.
> It was dog slow for unlinks. They worked pretty hard to fix that; I'd
> really like to avoid adding extra synchronicity to ext3 in this case.
I doubt it will matter much - usually the pages will be attached to the
current transaction and we just zap them (I think - the "memory leak" isn't
too hard to trigger).
I haven't looked too closely lately, but I think journal_unmap_buffer() is
being a bit silly - if it sees the buffer is on the committing transaction it
just gives up. But it doesn't need to do that for ordered-data buffers. We
could just grab journal_datalist_lock and pull those buffers off the
transaction even during commit.
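Something like this, perhaps (a sketch, not real jbd code: the lock and
helper names follow 2.5's fs/jbd but should be treated as assumptions):

/*
 * In journal_unmap_buffer(): a buffer on the committing transaction
 * is currently left alone.  Ordered-data buffers (BJ_SyncData) never
 * need undoing after a crash, so they could be unfiled from the
 * transaction even while commit is in progress.
 */
static int steal_ordered_buffer(journal_t *journal, struct journal_head *jh)
{
	if (jh->b_transaction != journal->j_committing_transaction)
		return 0;
	if (jh->b_jlist != BJ_SyncData)
		return 0;	/* metadata must wait for commit */

	spin_lock(&journal_datalist_lock);
	__journal_unfile_buffer(jh);
	spin_unlock(&journal_datalist_lock);
	return 1;
}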
> Pulling buffers off the transaction more aggressively would certainly be
> worth looking at. Trouble is, if a truncate transaction on disk gets
> interrupted by a crash, you really do have to be able to undo it, so you
> simply don't have the luxury of throwing the buffers away until a commit
> has occurred (unless you're in writeback mode.)
For metadata, yes. But for ordered-data pages this doesn't matter.
btw, I have a vague feeling that I've forgotten something here, but I've
forgotten what it was. I'll have a play with it.