Date: Wed, 02 Oct 2002 09:57:35 -0500
From: Dave McCracken
To: Linux Memory Management , Linux Kernel
Subject: [PATCH] Snapshot of shared page tables
Message-ID: <45850000.1033570655@baldur.austin.ibm.com>

Ok, here it is.  This patch works for my simple tests, both under UP and
SMP, including under memory pressure.  I'd appreciate it if anyone who's
interested would take it and beat on it.  Please let me know of any
problems you find.

The patch is against this morning's 2.5 BK tree.  (A minimal sketch of the
kind of fork-heavy test this is aimed at is appended after the diff.)

Dave McCracken

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059

Content-Disposition: attachment; filename="shpte-2.5.40-1.diff"; size=37266

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas: # ChangeSet 1.674 -> 1.675 =20 # include/linux/mm.h 1.83.1.2 -> 1.86 =20 # include/asm-i386/pgalloc.h 1.16.1.2 -> 1.19 =20 # kernel/fork.c 1.70.1.15 -> 1.76 =20 # Makefile 1.293.1.22 -> 1.302 =20 # include/linux/rmap-locking.h 1.1 -> 1.2 =20 # init/main.c 1.64.1.6 -> 1.69 =20 # mm/swapfile.c 1.54.1.2 -> 1.58 =20 # fs/exec.c 1.45.1.4 -> 1.49 =20 # mm/memory.c 1.84.1.6 -> 1.92 =20 # include/asm-generic/rmap.h 1.2.1.1 -> 1.6 =20 # include/linux/page-flags.h 1.24.1.1 -> 1.27 =20 # include/asm-i386/pgtable.h 1.17.1.3 -> 1.20 =20 # mm/rmap.c 1.12.1.4 -> 1.20 =20 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 02/10/02 dmc@baldur.austin.ibm.com 1.663.2.2 # Check in working snapshot # -------------------------------------------- # 02/10/02 dmc@baldur.austin.ibm.com 1.675 # Merge baldur.austin.ibm.com:/home/dmc/linux/bk/linux-2.5 # into baldur.austin.ibm.com:/home/dmc/linux/bk/linux-2.5-shpte # -------------------------------------------- # diff -Nru a/Makefile b/Makefile --- a/Makefile Wed Oct 2 09:47:17 2002 +++ b/Makefile Wed Oct 2 09:47:17 2002 @@ -1,7 +1,7 @@ VERSION =3D 2 PATCHLEVEL =3D 5 SUBLEVEL =3D 40 -EXTRAVERSION =3D +EXTRAVERSION =3D-shpte =20 # *DOCUMENTATION* # Too see a list of typical targets execute "make help" diff -Nru a/fs/exec.c b/fs/exec.c --- a/fs/exec.c Wed Oct 2 09:47:17 2002 +++ b/fs/exec.c Wed Oct 2 09:47:17 2002 @@ -308,8 +308,8 @@ flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); page_add_rmap(page, pte); + increment_rss(virt_to_page(pte)); pte_unmap(pte); - tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); =20 /* no need for flush_tlb */ diff -Nru a/include/asm-generic/rmap.h b/include/asm-generic/rmap.h --- a/include/asm-generic/rmap.h Wed Oct 2 09:47:17 2002 +++ b/include/asm-generic/rmap.h Wed Oct 2 09:47:17 2002 @@ -26,33 +26,6 @@ */ #include =20 -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * = mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping =3D (void *)mm; - page->index =3D address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping =3D NULL; - page->index =3D 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page =3D kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - static inline unsigned long ptep_to_address(pte_t * ptep) { struct page * page =3D kmap_atomic_to_page(ptep); @@ -86,5 +59,11 @@ return; } #endif + +extern void pgtable_add_rmap(struct page * page, struct mm_struct * mm, = unsigned long address); +extern void pgtable_add_rmap_locked(struct page * page, struct mm_struct * = mm, unsigned long address); +extern void pgtable_remove_rmap(struct page * page, struct mm_struct *mm); +extern void pgtable_remove_rmap_locked(struct page * page, struct = mm_struct *mm); +extern void increment_rss(struct page *ptepage); =20 #endif /* _GENERIC_RMAP_H */ diff -Nru a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h --- a/include/asm-i386/pgalloc.h Wed Oct 2 09:47:17 2002 +++ b/include/asm-i386/pgalloc.h Wed Oct 2 09:47:17 2002 @@ -16,6 +16,13 @@ ((unsigned long long)page_to_pfn(pte) << (unsigned long long) PAGE_SHIFT))); } + +static inline void pmd_populate_rdonly(struct mm_struct *mm, pmd_t *pmd, = struct page *page) +{ + set_pmd(pmd, __pmd(_PAGE_TABLE_RDONLY + + ((unsigned long long)page_to_pfn(page) << + (unsigned long long) PAGE_SHIFT))); +} /* * Allocate and free page tables. */ diff -Nru a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h --- a/include/asm-i386/pgtable.h Wed Oct 2 09:47:17 2002 +++ b/include/asm-i386/pgtable.h Wed Oct 2 09:47:17 2002 @@ -124,6 +124,7 @@ #define _PAGE_PROTNONE 0x080 /* If not present */ =20 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | = _PAGE_ACCESSED | _PAGE_DIRTY) +#define _PAGE_TABLE_RDONLY (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | = _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | = _PAGE_DIRTY) #define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) =20 @@ -184,8 +185,8 @@ #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) !=3D = _KERNPG_TABLE) - +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_RW)) = !=3D \ + (_KERNPG_TABLE & ~_PAGE_RW)) =20 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) =20 @@ -209,6 +210,8 @@ static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |=3D = _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |=3D = _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |=3D _PAGE_RW; = return pte; } +static inline int pmd_write(pmd_t pmd) { return (pmd).pmd & _PAGE_RW; } +static inline pmd_t pmd_wrprotect(pmd_t pmd) { (pmd).pmd &=3D ~_PAGE_RW; = return pmd; } =20 static inline int ptep_test_and_clear_dirty(pte_t *ptep) { return = test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } static inline int ptep_test_and_clear_young(pte_t *ptep) { return = test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } @@ -263,6 +266,10 @@ ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + __pte_offset(address)) #define pte_offset_map_nested(dir, address) \ ((pte_t 
*)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + __pte_offset(address)) +#define pte_page_map(__page, address) \ + ((pte_t *)kmap_atomic(__page,KM_PTE0) + __pte_offset(address)) +#define pte_page_map_nested(__page, address) \ + ((pte_t *)kmap_atomic(__page,KM_PTE1) + __pte_offset(address)) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) =20 diff -Nru a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h Wed Oct 2 09:47:17 2002 +++ b/include/linux/mm.h Wed Oct 2 09:47:17 2002 @@ -163,6 +163,8 @@ struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + struct mm_chain *mmchain;/* Reverse mm_struct mapping pointer */ + struct mm_struct *mmdirect; } pte; unsigned long private; /* mapping-private opaque data */ =20 diff -Nru a/include/linux/rmap-locking.h b/include/linux/rmap-locking.h --- a/include/linux/rmap-locking.h Wed Oct 2 09:47:17 2002 +++ b/include/linux/rmap-locking.h Wed Oct 2 09:47:17 2002 @@ -31,3 +31,6 @@ #endif preempt_enable(); } + +#define pte_page_lock pte_chain_lock +#define pte_page_unlock pte_chain_unlock diff -Nru a/init/main.c b/init/main.c --- a/init/main.c Wed Oct 2 09:47:17 2002 +++ b/init/main.c Wed Oct 2 09:47:17 2002 @@ -511,9 +511,7 @@ static void do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); -#ifdef CONFIG_SMP - extern int migration_init(void); - +#if CONFIG_SMP migration_init(); #endif spawn_ksoftirqd(); diff -Nru a/kernel/fork.c b/kernel/fork.c --- a/kernel/fork.c Wed Oct 2 09:47:17 2002 +++ b/kernel/fork.c Wed Oct 2 09:47:17 2002 @@ -205,6 +205,7 @@ struct vm_area_struct * mpnt, *tmp, **pprev; int retval; unsigned long charge =3D 0; + pmd_t *prev_pmd =3D 0; =20 flush_cache_mm(current->mm); mm->locked_vm =3D 0; @@ -267,7 +268,7 @@ *pprev =3D tmp; pprev =3D &tmp->vm_next; mm->map_count++; - retval =3D copy_page_range(mm, current->mm, tmp); + retval =3D share_page_range(mm, current->mm, tmp, &prev_pmd); spin_unlock(&mm->page_table_lock); =20 if (tmp->vm_ops && tmp->vm_ops->open) diff -Nru a/mm/memory.c b/mm/memory.c --- a/mm/memory.c Wed Oct 2 09:47:17 2002 +++ b/mm/memory.c Wed Oct 2 09:47:17 2002 @@ -36,6 +36,18 @@ * (Gerhard.Wichert@pdb.siemens.de) */ =20 +/* + * A note on locking of the page table structure: + * + * The top level lock that protects the page table is the = mm->page_table_lock. + * This lock protects the pgd and pmd layer. However, with the advent of = shared + * pte pages, this lock is not sufficient. The pte layer is now = protected by the + * pte_page_lock, set in the struct page of the pte page. Note that with = this + * locking scheme, once the pgd and pmd layers have been set in the page = fault + * path and the pte_page_lock has been taken, the page_table_lock can be = released. 
+ *=20 + */ + #include #include #include @@ -44,6 +56,7 @@ #include #include #include +#include =20 #include #include @@ -83,7 +96,7 @@ */ static inline void free_one_pmd(mmu_gather_t *tlb, pmd_t * dir) { - struct page *page; + struct page *ptepage; =20 if (pmd_none(*dir)) return; @@ -92,10 +105,13 @@ pmd_clear(dir); return; } - page =3D pmd_page(*dir); + ptepage =3D pmd_page(*dir); pmd_clear(dir); - pgtable_remove_rmap(page); - pte_free_tlb(tlb, page); + pgtable_remove_rmap(ptepage, tlb->mm); + if (page_count(ptepage) =3D=3D 1) { + dec_page_state(nr_page_table_pages); + } + pte_free_tlb(tlb, ptepage); } =20 static inline void free_one_pgd(mmu_gather_t *tlb, pgd_t * dir) @@ -136,6 +152,190 @@ } while (--nr); } =20 +/* + * This function makes the decision whether a pte page needs to be = unshared + * or not. Note that page_count() =3D=3D 1 isn't even tested here. The = assumption + * is that if the pmd entry is marked writeable, then the page is either = already + * unshared or doesn't need to be unshared. This catches the situation = where + * task B unshares the pte page, then task A faults and needs to unprotect = the + * pmd entry. This is actually done in pte_unshare. + * + * This function should be called with the page_table_lock held. + */ +static inline int pte_needs_unshare(struct mm_struct *mm, struct = vm_area_struct *vma, + pmd_t *pmd, unsigned long address, int write_access) +{ + struct page *ptepage; + + /* It's not even there */ + if (!pmd_present(*pmd)) + return 0; + + /* If it's already writable, then it doesn't need to be unshared. */ + if (pmd_write(*pmd)) + return 0; + + /* If this isn't a write fault we don't need to unshare. */ + if (!write_access) + return 0; + + /* + * If this page fits entirely inside a shared region, don't unshare it. + */ + ptepage =3D pmd_page(*pmd); + pte_page_lock(ptepage); + if (((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) =3D=3D VM_MAYWRITE) + && (vma->vm_start <=3D ptepage->index) + && (vma->vm_end >=3D (ptepage->index + PGDIR_SIZE))) { + pte_page_unlock(ptepage); + return 0; + } + return 1; +} + +/* + * Here is where a pte page is actually unshared. It actually covers a = couple of + * possible conditions. If the page_count() is already 1, then that means = it just + * needs to be set writeable. Otherwise, a new page needs to be = allocated. + * + * When each pte entry is copied, it is evaluated for COW protection, as = well as + * checking whether the swap count needs to be incremented. + * + * This function must be called with the page_table_lock held. It will = release + * the lock when it allocates a new page. + * + * When it returns, it has taken the pte_page_lock if there was no error. 
+ */ + +static pte_t *pte_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long = address) +{ + pte_t *src_ptb, *dst_ptb; + struct page *oldpage, *newpage; + struct vm_area_struct *vma; + int base, addr; + int end, page_end; + int src_unshare; + + oldpage =3D pmd_page(*pmd); + + /* If it's already unshared, we just need to set it writeable */ + if (page_count(oldpage) =3D=3D 1) { + pmd_populate(mm, pmd, oldpage); + flush_tlb_mm(mm); + goto out_map; + } + + pte_page_unlock(oldpage); + spin_unlock(&mm->page_table_lock); + newpage =3D pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (unlikely(!newpage)) + return NULL; + + pte_page_lock(oldpage); + + /* See if it got unshared while we dropped the lock */ + if (page_count(oldpage) =3D=3D 1) { + pte_free(newpage); + goto out_map; + } + + pte_page_lock(newpage); + + base =3D addr =3D oldpage->index; + page_end =3D base + PGDIR_SIZE; + vma =3D find_vma(mm, base); + if (!vma || (page_end <=3D vma->vm_start)) + BUG(); /* No valid pages in this pte page */ + + src_unshare =3D page_count(oldpage) =3D=3D 2; + dst_ptb =3D pte_page_map(newpage, base); + src_ptb =3D pte_page_map_nested(oldpage, base); + + if (vma->vm_start > addr) + addr =3D vma->vm_start; + + if (vma->vm_end < page_end) + end =3D vma->vm_end; + else + end =3D page_end; + + do { + unsigned int cow =3D (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) =3D=3D = VM_MAYWRITE; + pte_t *src_pte =3D src_ptb + __pte_offset(addr); + pte_t *dst_pte =3D dst_ptb + __pte_offset(addr); + + do { + pte_t pte =3D *src_pte; + struct page *page; + + if (pte_none(pte)) + goto unshare_skip_set; + + if (!pte_present(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + set_pte(dst_pte, pte); + goto unshare_skip_set; + } + page =3D pte_page(pte); + if (!PageReserved(page)) { + /* COW mappings require write protecting both sides */ + if (cow) { + pte =3D pte_wrprotect(pte); + if (src_unshare) + set_pte(src_pte, pte); + } + /* If it's a shared mapping, + * mark it clean in the new mapping + */ + if (vma->vm_flags & VM_SHARED) + pte =3D pte_mkclean(pte); + pte =3D pte_mkold(pte); + get_page(page); + } + set_pte(dst_pte, pte); + page_add_rmap(page, dst_pte); +unshare_skip_set: + src_pte++; + dst_pte++; + addr +=3D PAGE_SIZE; + } while (addr < end); + + if (addr >=3D page_end) + break; + + vma =3D vma->vm_next; + if (!vma) + break; + + if (page_end <=3D vma->vm_start) + break; + + addr =3D vma->vm_start; + if (vma->vm_end < page_end) + end =3D vma->vm_end; + else + end =3D page_end; + } while (1); + + pte_unmap_nested(src_ptb); + + pgtable_remove_rmap_locked(oldpage, mm); + pgtable_add_rmap_locked(newpage, mm, base); + pmd_populate(mm, pmd, newpage); + inc_page_state(nr_page_table_pages); + + flush_tlb_mm(mm); + + put_page(oldpage); + pte_page_unlock(oldpage); + + return dst_ptb + __pte_offset(address); + +out_map: + return pte_offset_map(pmd, address); +} + pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long = address) { if (!pmd_present(*pmd)) { @@ -157,11 +357,10 @@ } pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); + inc_page_state(nr_page_table_pages); } out: - if (pmd_present(*pmd)) - return pte_offset_map(pmd, address); - return NULL; + return pte_offset_map(pmd, address); } =20 pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long = address) @@ -183,7 +382,6 @@ pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); pmd_populate_kernel(mm, pmd, new); } out: @@ -192,28 +390,14 @@ #define PTE_TABLE_MASK 
((PTRS_PER_PTE-1) * sizeof(pte_t)) #define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) =20 -/* - * copy one vm_area from one task to the other. Assumes the page tables - * already present in the new task to be cleared in the whole range - * covered by this vma. - * - * 08Jan98 Merged into one routine from several inline routines to reduce - * variable count and make things faster. -jj - * - * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc_map(). - */ -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) +int share_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd) { - pgd_t * src_pgd, * dst_pgd; + pgd_t *src_pgd, *dst_pgd; unsigned long address =3D vma->vm_start; unsigned long end =3D vma->vm_end; unsigned long cow =3D (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) =3D=3D = VM_MAYWRITE; - - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst, src, vma); - + src_pgd =3D pgd_offset(src, address)-1; dst_pgd =3D pgd_offset(dst, address)-1; =20 @@ -222,14 +406,12 @@ =20 src_pgd++; dst_pgd++; - /* copy_pmd_range */ - if (pgd_none(*src_pgd)) - goto skip_copy_pmd_range; + goto skip_share_pmd_range; if (pgd_bad(*src_pgd)) { pgd_ERROR(*src_pgd); pgd_clear(src_pgd); -skip_copy_pmd_range: address =3D (address + PGDIR_SIZE) & PGDIR_MASK; +skip_share_pmd_range: address =3D (address + PGDIR_SIZE) & PGDIR_MASK; if (!address || (address >=3D end)) goto out; continue; @@ -240,84 +422,47 @@ if (!dst_pmd) goto nomem; =20 + spin_lock(&src->page_table_lock); + do { - pte_t * src_pte, * dst_pte; - - /* copy_pte_range */ - - if (pmd_none(*src_pmd)) - goto skip_copy_pte_range; - if (pmd_bad(*src_pmd)) { + pmd_t pmdval =3D *src_pmd; + struct page *ptepage =3D pmd_page(pmdval); + + if (pmd_none(pmdval)) + goto skip_share_pte_range; + if (pmd_bad(pmdval)) { pmd_ERROR(*src_pmd); pmd_clear(src_pmd); -skip_copy_pte_range: address =3D (address + PMD_SIZE) & PMD_MASK; - if (address >=3D end) - goto out; - goto cont_copy_pmd_range; + goto skip_share_pte_range; } =20 - dst_pte =3D pte_alloc_map(dst, dst_pmd, address); - if (!dst_pte) - goto nomem; - spin_lock(&src->page_table_lock); - src_pte =3D pte_offset_map_nested(src_pmd, address); - do { - pte_t pte =3D *src_pte; - struct page *ptepage; - unsigned long pfn; - - /* copy_one_pte */ - - if (pte_none(pte)) - goto cont_copy_pte_range_noset; - /* pte contains position in swap, so copy. 
*/ - if (!pte_present(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; - } - ptepage =3D pte_page(pte); - pfn =3D pte_pfn(pte); - if (!pfn_valid(pfn)) - goto cont_copy_pte_range; - ptepage =3D pfn_to_page(pfn); - if (PageReserved(ptepage)) - goto cont_copy_pte_range; - - /* If it's a COW mapping, write protect it both in the parent and the = child */ - if (cow) { - ptep_set_wrprotect(src_pte); - pte =3D *src_pte; - } + if (cow) { + pmdval =3D pmd_wrprotect(pmdval); + set_pmd(src_pmd, pmdval); + } + set_pmd(dst_pmd, pmdval); =20 - /* If it's a shared mapping, mark it clean in the child */ - if (vma->vm_flags & VM_SHARED) - pte =3D pte_mkclean(pte); - pte =3D pte_mkold(pte); + /* Only do this if we haven't seen this pte page before */ + if (src_pmd !=3D *prev_pmd) { get_page(ptepage); - dst->rss++; + pgtable_add_rmap(ptepage, dst, address); + *prev_pmd =3D src_pmd; + dst->rss +=3D ptepage->private; + } =20 -cont_copy_pte_range: set_pte(dst_pte, pte); - page_add_rmap(ptepage, dst_pte); -cont_copy_pte_range_noset: address +=3D PAGE_SIZE; - if (address >=3D end) { - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - goto out_unlock; - } - src_pte++; - dst_pte++; - } while ((unsigned long)src_pte & PTE_TABLE_MASK); - pte_unmap_nested(src_pte-1); - pte_unmap(dst_pte-1); - spin_unlock(&src->page_table_lock); - -cont_copy_pmd_range: src_pmd++; +skip_share_pte_range: address =3D (address + PMD_SIZE) & PMD_MASK; + if (address >=3D end) + goto out_unlock; + + src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + spin_unlock(&src->page_table_lock); } + out_unlock: spin_unlock(&src->page_table_lock); + out: return 0; nomem: @@ -326,6 +471,7 @@ =20 static void zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long = address, unsigned long size) { + struct page *ptepage; unsigned long offset; pte_t *ptep; =20 @@ -336,11 +482,34 @@ pmd_clear(pmd); return; } - ptep =3D pte_offset_map(pmd, address); + offset =3D address & ~PMD_MASK; if (offset + size > PMD_SIZE) size =3D PMD_SIZE - offset; size &=3D PAGE_MASK; + + /* + * Check to see if the pte page is shared. If it is and we're unmapping + * the entire page, just decrement the reference count and we're done. + * If we're only unmapping part of the page we'll have to unshare it the + * slow way. + */ + ptepage =3D pmd_page(*pmd); + pte_page_lock(ptepage); + if (page_count(ptepage) > 1) { + if ((offset =3D=3D 0) && (size =3D=3D PMD_SIZE)) { + pmd_clear(pmd); + pgtable_remove_rmap_locked(ptepage, tlb->mm); + tlb->mm->rss -=3D ptepage->private; + put_page(ptepage); + pte_page_unlock(ptepage); + return; + } + ptep =3D pte_unshare(tlb->mm, pmd, address); + ptepage =3D pmd_page(*pmd); + } else { + ptep =3D pte_offset_map(pmd, address); + } for (offset=3D0; offset < size; ptep++, offset +=3D PAGE_SIZE) { pte_t pte =3D *ptep; if (pte_none(pte)) @@ -365,6 +534,7 @@ pte_clear(ptep); } } + pte_page_unlock(ptepage); pte_unmap(ptep-1); } =20 @@ -460,6 +630,19 @@ spin_unlock(&mm->page_table_lock); } =20 +void unmap_all_pages(mmu_gather_t *tlb, struct mm_struct *mm, unsigned = long address, unsigned long end) +{ + pgd_t * dir; + + if (address >=3D end) + BUG(); + dir =3D pgd_offset(mm, address); + do { + zap_pmd_range(tlb, dir, address, end - address); + address =3D (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); +} /* * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. 
@@ -1005,6 +1188,7 @@ unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) { struct page *old_page, *new_page; + struct page *ptepage =3D pmd_page(*pmd); unsigned long pfn =3D pte_pfn(pte); =20 if (!pfn_valid(pfn)) @@ -1018,7 +1202,7 @@ flush_cache_page(vma, address); establish_pte(vma, address, page_table, = pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); return VM_FAULT_MINOR; } } @@ -1028,7 +1212,7 @@ * Ok, we need to copy. Oh, well.. */ page_cache_get(old_page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); =20 new_page =3D alloc_page(GFP_HIGHUSER); if (!new_page) @@ -1038,11 +1222,11 @@ /* * Re-check the pte - we dropped the lock */ - spin_lock(&mm->page_table_lock); + pte_page_lock(ptepage); page_table =3D pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) - ++mm->rss; + increment_rss(ptepage); page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); page_add_rmap(new_page, page_table); @@ -1052,14 +1236,14 @@ new_page =3D old_page; } pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); page_cache_release(new_page); page_cache_release(old_page); return VM_FAULT_MINOR; =20 bad_wp_page: pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address); /* * This should really halt the system so it can be debugged or @@ -1188,12 +1372,13 @@ pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) { struct page *page; + struct page *ptepage =3D pmd_page(*pmd); swp_entry_t entry =3D pte_to_swp_entry(orig_pte); pte_t pte; int ret =3D VM_FAULT_MINOR; =20 pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); page =3D lookup_swap_cache(entry); if (!page) { swapin_readahead(entry); @@ -1203,14 +1388,14 @@ * Back out if somebody else faulted in this pte while * we released the page table lock. */ - spin_lock(&mm->page_table_lock); + pte_page_lock(ptepage); page_table =3D pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret =3D VM_FAULT_OOM; else ret =3D VM_FAULT_MINOR; pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); return ret; } =20 @@ -1226,11 +1411,11 @@ * Back out if somebody else faulted in this pte while we * released the page table lock. */ - spin_lock(&mm->page_table_lock); + pte_page_lock(ptepage); page_table =3D pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); unlock_page(page); page_cache_release(page); return VM_FAULT_MINOR; @@ -1242,7 +1427,7 @@ if (vm_swap_full()) remove_exclusive_swap_page(page); =20 - mm->rss++; + increment_rss(ptepage); pte =3D mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte =3D pte_mkdirty(pte_mkwrite(pte)); @@ -1256,7 +1441,7 @@ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); return ret; } =20 @@ -1269,6 +1454,7 @@ { pte_t entry; struct page * page =3D ZERO_PAGE(addr); + struct page *ptepage =3D pmd_page(*pmd); =20 /* Read-only mapping of ZERO_PAGE. 
*/ entry =3D pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1277,23 +1463,23 @@ if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); =20 page =3D alloc_page(GFP_HIGHUSER); if (!page) goto no_mem; clear_user_highpage(page, addr); =20 - spin_lock(&mm->page_table_lock); + pte_page_lock(ptepage); page_table =3D pte_offset_map(pmd, addr); =20 if (!pte_none(*page_table)) { pte_unmap(page_table); page_cache_release(page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); return VM_FAULT_MINOR; } - mm->rss++; + increment_rss(ptepage); flush_page_to_ram(page); entry =3D pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); @@ -1306,7 +1492,7 @@ =20 /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); return VM_FAULT_MINOR; =20 no_mem: @@ -1329,12 +1515,13 @@ unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) { struct page * new_page; + struct page *ptepage =3D pmd_page(*pmd); pte_t entry; =20 if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, pmd, write_access, = address); pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); =20 new_page =3D vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); =20 @@ -1359,7 +1546,7 @@ new_page =3D page; } =20 - spin_lock(&mm->page_table_lock); + pte_page_lock(ptepage); page_table =3D pte_offset_map(pmd, address); =20 /* @@ -1374,7 +1561,7 @@ */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { - ++mm->rss; + increment_rss(ptepage); flush_page_to_ram(new_page); flush_icache_page(vma, new_page); entry =3D mk_pte(new_page, vma->vm_page_prot); @@ -1387,13 +1574,13 @@ /* One of our sibling threads was faster, back out. 
*/ pte_unmap(page_table); page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); return VM_FAULT_MINOR; } =20 /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); return VM_FAULT_MAJOR; } =20 @@ -1445,7 +1632,7 @@ entry =3D pte_mkyoung(entry); establish_pte(vma, address, pte, entry); pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(pmd_page(*pmd)); return VM_FAULT_MINOR; } =20 @@ -1470,9 +1657,19 @@ pmd =3D pmd_alloc(mm, pgd, address); =20 if (pmd) { - pte_t * pte =3D pte_alloc_map(mm, pmd, address); - if (pte) + pte_t * pte; + + if (pte_needs_unshare(mm, vma, pmd, address, write_access)) + pte =3D pte_unshare(mm, pmd, address); + else { + pte =3D pte_alloc_map(mm, pmd, address); + pte_page_lock(pmd_page(*pmd)); + } + + if (pte) { + spin_unlock(&mm->page_table_lock); return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + } } spin_unlock(&mm->page_table_lock); return VM_FAULT_OOM; diff -Nru a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c Wed Oct 2 09:47:17 2002 +++ b/mm/rmap.c Wed Oct 2 09:47:17 2002 @@ -45,11 +45,17 @@ */ #define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t)) =20 +struct mm_chain { + struct mm_chain *next; + struct mm_struct *mm; +}; + struct pte_chain { struct pte_chain *next; pte_addr_t ptes[NRPTE]; }; =20 +static kmem_cache_t *mm_chain_cache; static kmem_cache_t *pte_chain_cache; =20 /* @@ -102,6 +108,25 @@ kmem_cache_free(pte_chain_cache, pte_chain); } =20 +static inline struct mm_chain *mm_chain_alloc(void) +{ + struct mm_chain *ret; + + ret =3D kmem_cache_alloc(mm_chain_cache, GFP_ATOMIC); + return ret; +} + +static void mm_chain_free(struct mm_chain *mc, + struct mm_chain *prev_mc, struct page *page) +{ + if (prev_mc) + prev_mc->next =3D mc->next; + else if (page) + page->pte.mmchain =3D mc->next; + + kmem_cache_free(mm_chain_cache, mc); +} + /** ** VM stuff below this comment **/ @@ -161,6 +186,94 @@ return referenced; } =20 +void pgtable_add_rmap_locked(struct page * page, struct mm_struct * mm, + unsigned long address) +{ + struct mm_chain *mc; + +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif +#ifdef RMAP_DEBUG + BUG_ON(mm =3D=3D NULL); +#endif + + if (PageDirect(page)) { + mc =3D mm_chain_alloc(); + mc->mm =3D page->pte.mmdirect; + mc->next =3D NULL; + page->pte.mmchain =3D mc; + ClearPageDirect(page); + } + if (page->pte.mmchain) { + /* Hook up the mm_chain to the page. 
*/ + mc =3D mm_chain_alloc(); + mc->mm =3D mm; + mc->next =3D page->pte.mmchain; + page->pte.mmchain =3D mc; + } else { + page->pte.mmdirect =3D mm; + SetPageDirect(page); + page->index =3D address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); + } +} + +void pgtable_remove_rmap_locked(struct page * page, struct mm_struct *mm) +{ + struct mm_chain * mc, * prev_mc =3D NULL; + +#ifdef DEBUG_RMAP + BUG_ON(mm =3D=3D NULL); +#endif + + if (PageDirect(page)) { + if (page->pte.mmdirect =3D=3D mm) { + page->pte.mmdirect =3D NULL; + ClearPageDirect(page); + page->index =3D 0; + goto out; + } + } else { +#ifdef DEBUG_RMAP + BUG_ON(page->pte.mmchain->next =3D=3D NULL); +#endif + for (mc =3D page->pte.mmchain; mc; prev_mc =3D mc, mc =3D mc->next) { + if (mc->mm =3D=3D mm) { + mm_chain_free(mc, prev_mc, page); + /* Check whether we can convert to direct */ + mc =3D page->pte.mmchain; + if (!mc->next) { + page->pte.mmdirect =3D mc->mm; + SetPageDirect(page); + mm_chain_free(mc, NULL, NULL); + } + goto out; + } + } + } + BUG(); +out: +} + +void pgtable_add_rmap(struct page * page, struct mm_struct * mm, + unsigned long address) +{ + pte_page_lock(page); + pgtable_add_rmap_locked(page, mm, address); + pte_page_unlock(page); +} + +void pgtable_remove_rmap(struct page * page, struct mm_struct *mm) +{ + pte_page_lock(page); + pgtable_remove_rmap_locked(page, mm); + pte_page_unlock(page); +} + /** * page_add_rmap - add reverse mapping entry to a page * @page: the page to add the mapping to @@ -180,8 +293,6 @@ BUG(); if (!pte_present(*ptep)) BUG(); - if (!ptep_to_mm(ptep)) - BUG(); #endif =20 if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) @@ -199,12 +310,15 @@ if (page->pte.direct =3D=3D pte_paddr) BUG(); } else { + int count =3D 0; for (pc =3D page->pte.chain; pc; pc =3D pc->next) { - for (i =3D 0; i < NRPTE; i++) { + for (i =3D 0; i < NRPTE; i++, count++) { pte_addr_t p =3D pc->ptes[i]; =20 - if (p && p =3D=3D pte_paddr) + if (p && p =3D=3D pte_paddr) { + printk(KERN_ERR "page_add_rmap: page %08lx (count %d), ptep %08lx, = rmap count %d\n", page, page_count(page), ptep, count); BUG(); + } } } } @@ -342,6 +456,98 @@ return; } =20 +static inline int pgtable_check_mlocked_mm(struct mm_struct *mm, unsigned = long address) +{ + struct vm_area_struct *vma; + int ret =3D SWAP_SUCCESS; + + /* During mremap, it's possible pages are not in a VMA. */ + vma =3D find_vma(mm, address); + if (!vma) { + ret =3D SWAP_FAIL; + goto out; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret =3D SWAP_FAIL; + } +out: + return ret; +} + +static inline int pgtable_check_mlocked(struct page *ptepage, unsigned = long address) +{ + struct mm_chain *mc; + int ret =3D SWAP_SUCCESS; + + if (PageDirect(ptepage)) { + ret =3D pgtable_check_mlocked_mm(ptepage->pte.mmdirect, address); + goto out; + } + + for (mc =3D ptepage->pte.mmchain; mc; mc =3D mc->next) { +#ifdef DEBUG_RMAP + BUG_ON(mc->mm =3D=3D NULL); +#endif + ret =3D pgtable_check_mlocked_mm(mc->mm, address); + if (ret !=3D SWAP_SUCCESS) + goto out; + } +out: + return ret; +} + +static inline int pgtable_unmap_one_mm(struct mm_struct *mm, unsigned long = address) +{ + struct vm_area_struct *vma; + int ret =3D SWAP_SUCCESS; + + /* During mremap, it's possible pages are not in a VMA. 
*/ + vma =3D find_vma(mm, address); + if (!vma) { + ret =3D SWAP_FAIL; + goto out; + } + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + mm->rss--; + +out: + return ret; +} + +static inline int pgtable_unmap_one(struct page *ptepage, unsigned long = address) +{ + struct mm_chain *mc; + int ret =3D SWAP_SUCCESS; + + if (PageDirect(ptepage)) { + ret =3D pgtable_unmap_one_mm(ptepage->pte.mmdirect, address); + if (ret !=3D SWAP_SUCCESS) + goto out; + } else for (mc =3D ptepage->pte.mmchain; mc; mc =3D mc->next) { + ret =3D pgtable_unmap_one_mm(mc->mm, address); + if (ret !=3D SWAP_SUCCESS) + goto out; + } + ptepage->private--; +out: + return ret; +} + +void increment_rss(struct page *ptepage) +{ + struct mm_chain *mc; + + if (PageDirect(ptepage)) + ptepage->pte.mmdirect->rss++; + else for (mc =3D ptepage->pte.mmchain; mc; mc =3D mc->next) + mc->mm->rss++; + + ptepage->private++; +} + /** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap @@ -360,42 +566,24 @@ static int try_to_unmap_one(struct page * page, pte_addr_t paddr) { pte_t *ptep =3D rmap_ptep_map(paddr); - unsigned long address =3D ptep_to_address(ptep); - struct mm_struct * mm =3D ptep_to_mm(ptep); - struct vm_area_struct * vma; pte_t pte; + struct page *ptepage =3D kmap_atomic_to_page(ptep); + unsigned long address =3D ptep_to_address(ptep); int ret; =20 - if (!mm) - BUG(); + pte_page_lock(ptepage); =20 - /* - * We need the page_table_lock to protect us from page faults, - * munmap, fork, etc... - */ - if (!spin_trylock(&mm->page_table_lock)) { - rmap_ptep_unmap(ptep); - return SWAP_AGAIN; - } - - - /* During mremap, it's possible pages are not in a VMA. */ - vma =3D find_vma(mm, address); - if (!vma) { - ret =3D SWAP_FAIL; + ret =3D pgtable_check_mlocked(ptepage, address); + if (ret !=3D SWAP_SUCCESS) goto out_unlock; - } + pte =3D ptep_get_and_clear(ptep); =20 - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { - ret =3D SWAP_FAIL; + ret =3D pgtable_unmap_one(ptepage, address); + if (ret !=3D SWAP_SUCCESS) { + set_pte(ptep, pte); goto out_unlock; } - - /* Nuke the page table entry. */ - pte =3D ptep_get_and_clear(ptep); - flush_tlb_page(vma, address); - flush_cache_page(vma, address); + pte_page_unlock(ptepage); =20 /* Store the swap location in the pte. See handle_pte_fault() ... 
 */
 	if (PageSwapCache(page)) {
@@ -408,13 +596,15 @@
 	if (pte_dirty(pte))
 		set_page_dirty(page);
 
-	mm->rss--;
 	page_cache_release(page);
 	ret = SWAP_SUCCESS;
+	goto out;
 
 out_unlock:
+	pte_page_unlock(ptepage);
+
+out:
 	rmap_ptep_unmap(ptep);
-	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
 
@@ -523,6 +713,17 @@
 
 void __init pte_chain_init(void)
 {
+
+	mm_chain_cache = kmem_cache_create(	"mm_chain",
+						sizeof(struct mm_chain),
+						0,
+						0,
+						NULL,
+						NULL);
+
+	if (!mm_chain_cache)
+		panic("failed to create mm_chain cache!\n");
+
 	pte_chain_cache = kmem_cache_create(	"pte_chain",
 						sizeof(struct pte_chain),
 						0,
diff -Nru a/mm/swapfile.c b/mm/swapfile.c
--- a/mm/swapfile.c	Wed Oct  2 09:47:17 2002
+++ b/mm/swapfile.c	Wed Oct  2 09:47:17 2002
@@ -371,7 +371,7 @@
  */
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
-	pte_t *dir, swp_entry_t entry, struct page* page)
+	pte_t *dir, swp_entry_t entry, struct page* page, pmd_t *pmd)
 {
 	pte_t pte = *dir;
 
@@ -383,7 +383,7 @@
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	page_add_rmap(page, dir);
 	swap_free(entry);
-	++vma->vm_mm->rss;
+	increment_rss(pmd_page(*pmd));
 }
 
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
@@ -408,7 +408,7 @@
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
-		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
+		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page, dir);
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
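To make the target workload concrete, below is a minimal user-space sketch
(not part of the patch) of the kind of fork-heavy test that should benefit
from sharing pte pages: map and touch a large anonymous region so the pte
pages are populated, then time repeated fork()/exit() pairs, where most of
the child setup cost is page table copying.  The 256MB mapping size and the
100 iterations are arbitrary choices for illustration.

/* forktest.c -- illustrative only, not part of the patch above.
 * Maps a large anonymous region, touches every page so the parent's pte
 * pages are fully populated, then times a series of fork()/exit() pairs.
 * With shared pte pages the per-fork cost should drop, since the parent's
 * pte pages are write-protected and shared rather than copied entry by
 * entry at fork time.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define MAPSIZE		(256UL << 20)	/* 256MB of anonymous memory -- arbitrary */
#define ITERATIONS	100		/* number of fork()/exit() pairs -- arbitrary */

int main(void)
{
	char *mem;
	struct timeval start, end;
	long usecs;
	int i;

	mem = mmap(NULL, MAPSIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}

	/* Touch every page so the pte pages are fully populated. */
	memset(mem, 1, MAPSIZE);

	gettimeofday(&start, NULL);
	for (i = 0; i < ITERATIONS; i++) {
		pid_t pid = fork();

		if (pid < 0) {
			perror("fork");
			exit(1);
		}
		if (pid == 0)
			_exit(0);	/* child exits immediately */
		waitpid(pid, NULL, 0);
	}
	gettimeofday(&end, NULL);

	usecs = (end.tv_sec - start.tv_sec) * 1000000L +
		(end.tv_usec - start.tv_usec);
	printf("%d forks of a %lu MB mapping: %ld usec per fork\n",
	       ITERATIONS, MAPSIZE >> 20, usecs / ITERATIONS);
	return 0;
}

Comparing the reported per-fork time with and without the patch applied
(same kernel config otherwise) should show whether the pte-page sharing in
share_page_range() is paying off for this kind of mapping.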