Content-Type: text/plain; charset=US-ASCII
From: Daniel Phillips <phillips@bonn-fries.net>
To: Hugh Dickins <hugh@veritas.com>
Subject: Re: [RFC] Page table sharing
Date: Tue, 19 Feb 2002 00:37:56 +0100
Cc: Linus Torvalds <torvalds@transmeta.com>, dmccr@us.ibm.com,
        Kernel Mailing List <linux-kernel@vger.kernel.org>, linux-mm@kvack.org,
        Robert Love <rml@tech9.net>, Rik van Riel <riel@conectiva.com.br>,
        mingo@redhat.co, Andrew Morton <akpm@zip.com.au>,
        manfred@colorfullife.com, wli@holomorphy.com
In-Reply-To: <Pine.LNX.4.21.0202181759100.1248-100000@localhost.localdomain>
In-Reply-To: <Pine.LNX.4.21.0202181759100.1248-100000@localhost.localdomain>
MIME-Version: 1.0
Content-Transfer-Encoding: 7BIT
Message-Id: <E16cxM4-0000xB-00@starship.berlin>
Sender: linux-kernel-owner@vger.kernel.org

On February 18, 2002 08:04 pm, Hugh Dickins wrote:
> (By the way, the patch you appended to your next mail did not apply
> - I think you'd hand-edited an incidental irrelevant cleanup out of
> the patch to memory.c, without adjusting its line counts; and also
> had to edit "public_html/" out of the URL you gave.)

Thanks, here it is again.  This time I left the gratuitous whitespace
cleanup in as the route of least resistance ;-)

  nl.linux.org/~phillips/patches/ptab-2.4.17-2  

-- 
Daniel

--- ../2.4.17.clean/fs/exec.c	Fri Dec 21 12:41:55 2001
+++ ./fs/exec.c	Mon Feb 18 22:57:10 2002
@@ -860,6 +860,7 @@
 	int retval;
 	int i;
 
+	ptab(printk(">>> execve %s\n", filename));
 	file = open_exec(filename);
 
 	retval = PTR_ERR(file);
--- ../2.4.17.clean/include/linux/mm.h	Fri Dec 21 12:42:03 2001
+++ ./include/linux/mm.h	Mon Feb 18 22:57:10 2002
@@ -411,7 +411,7 @@
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int write));
 extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
@@ -424,6 +424,16 @@
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+
+static inline pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	return __pte_alloc(mm, pmd, address, 1);
+}
+
+#define nil do { } while (0)
+#define ptab_off(cmd) nil
+#define ptab_on(cmd) do { if (current->flags & PF_SHARE_TABLES) cmd; } while (0)
+#define ptab ptab_off
 
 /*
  * On a two-level page table, this ends up being trivial. Thus the
--- ../2.4.17.clean/include/linux/sched.h	Fri Dec 21 12:42:03 2001
+++ ./include/linux/sched.h	Mon Feb 18 22:57:10 2002
@@ -427,7 +427,7 @@
 #define PF_MEMDIE	0x00001000	/* Killed for out-of-memory */
 #define PF_FREE_PAGES	0x00002000	/* per process page freeing */
 #define PF_NOIO		0x00004000	/* avoid generating further I/O */
-
+#define PF_SHARE_TABLES 0x00008000    /* share page tables (testing) */
 #define PF_USEDFPU	0x00100000	/* task used FPU this quantum (SMP) */
 
 /*
--- ../2.4.17.clean/kernel/fork.c	Wed Nov 21 13:18:42 2001
+++ ./kernel/fork.c	Mon Feb 18 22:57:10 2002
@@ -140,6 +140,7 @@
 	mm->swap_address = 0;
 	pprev = &mm->mmap;
 
+	ptab(printk(">>> dup_mmap\n"));
 	/*
 	 * Add it to the mmlist after the parent.
 	 * Doing it this way means that we can order the list,
@@ -566,9 +567,10 @@
 	struct task_struct *p;
 	struct completion vfork;
 
+	ptab(printk(">>> fork, stack=%lx\n", stack_start));
 	retval = -EPERM;
 
-	/* 
+	/*
 	 * CLONE_PID is only allowed for the initial SMP swapper
 	 * calls
 	 */
--- ../2.4.17.clean/kernel/sys.c	Tue Sep 18 17:10:43 2001
+++ ./kernel/sys.c	Mon Feb 18 22:57:10 2002
@@ -514,6 +514,11 @@
 	current->uid = new_ruid;
 	current->user = new_user;
 	free_uid(old_user);
+
+	if (current->uid == 9999)
+		current->flags |= PF_SHARE_TABLES;
+	printk(">>> user: uid=%i pid=%i pf=%x\n", current->uid, current->pid, current->flags);
+	
 	return 0;
 }
 
--- ../2.4.17.clean/mm/memory.c	Fri Dec 21 12:42:05 2001
+++ ./mm/memory.c	Mon Feb 18 23:14:44 2002
@@ -34,6 +34,9 @@
  *
  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
  *		(Gerhard.Wichert@pdb.siemens.de)
+ *
+ * Feb 2002  -  Shared page tables added by Daniel Phillips
+ *		(phillips@nl.linux.org)
  */
 
 #include <linux/mm.h>
@@ -49,6 +52,8 @@
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
 
+#define assert(cond) do { if (!(cond)) BUG(); } while (0)
+
 unsigned long max_mapnr;
 unsigned long num_physpages;
 void * high_memory;
@@ -100,8 +105,11 @@
 		return;
 	}
 	pte = pte_offset(dir, 0);
-	pmd_clear(dir);
-	pte_free(pte);
+	ptab(printk(">>> free page table %p (%i--)\n", pte, page_count(virt_to_page(pte))));
+	if (put_page_testzero(virt_to_page(pte))) {
+		pmd_clear(dir);
+		pte_free_slow(pte);
+	}
 }
 
 static inline void free_one_pgd(pgd_t * dir)
@@ -143,8 +151,8 @@
  */
 void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
 {
-	pgd_t * page_dir = mm->pgd;
-
+	pgd_t *page_dir = mm->pgd;
+	ptab(printk(">>> clear_page_tables\n"));
 	spin_lock(&mm->page_table_lock);
 	page_dir += first;
 	do {
@@ -171,13 +179,23 @@
  * dst->page_table_lock is held on entry and exit,
  * but may be dropped within pmd_alloc() and pte_alloc().
  */
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
-			struct vm_area_struct *vma)
+spinlock_t page_table_share_lock = SPIN_LOCK_UNLOCKED;
+
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma)
 {
-	pgd_t * src_pgd, * dst_pgd;
+	pgd_t *src_pgd, *dst_pgd;
 	unsigned long address = vma->vm_start;
 	unsigned long end = vma->vm_end;
 	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+	int share_page_tables = !!(current->flags & PF_SHARE_TABLES);
+
+#if 0
+	static int teststart = 2, testcount = 99, tests = 0;
+	if (share_page_tables && (tests++ < teststart || tests > teststart + testcount))
+		share_page_tables = 0;
+	if (share_page_tables)
+		printk(">>> copy_page_range test %i\n", tests - 1);
+#endif
 
 	src_pgd = pgd_offset(src, address)-1;
 	dst_pgd = pgd_offset(dst, address)-1;
@@ -186,15 +204,15 @@
 		pmd_t * src_pmd, * dst_pmd;
 
 		src_pgd++; dst_pgd++;
-		
+
 		/* copy_pmd_range */
-		
+
 		if (pgd_none(*src_pgd))
-			goto skip_copy_pmd_range;
+			goto skip_pmd_range;
 		if (pgd_bad(*src_pgd)) {
 			pgd_ERROR(*src_pgd);
 			pgd_clear(src_pgd);
-skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
+skip_pmd_range:		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 			if (!address || (address >= end))
 				goto out;
 			continue;
@@ -204,34 +222,58 @@
 		dst_pmd = pmd_alloc(dst, dst_pgd, address);
 		if (!dst_pmd)
 			goto nomem;
-
 		do {
-			pte_t * src_pte, * dst_pte;
-		
-			/* copy_pte_range */
-		
+			pte_t *src_ptb, *dst_ptb;
+
 			if (pmd_none(*src_pmd))
-				goto skip_copy_pte_range;
+				goto skip_ptb_range;
 			if (pmd_bad(*src_pmd)) {
 				pmd_ERROR(*src_pmd);
 				pmd_clear(src_pmd);
-skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
+skip_ptb_range:			address = (address + PMD_SIZE) & PMD_MASK;
 				if (address >= end)
 					goto out;
-				goto cont_copy_pmd_range;
+				goto cont_pmd_range;
 			}
 
-			src_pte = pte_offset(src_pmd, address);
-			dst_pte = pte_alloc(dst, dst_pmd, address);
-			if (!dst_pte)
+			src_ptb = pte_offset(src_pmd, address);
+
+			if (!share_page_tables) goto no_share;
+
+			if (pmd_none(*dst_pmd)) {
+				spin_lock(&src->page_table_lock);
+				get_page(virt_to_page(src_ptb));
+				pmd_populate(dst, dst_pmd, ((ulong) src_ptb & PAGE_MASK));
+				ptab(printk(">>> share %p @ %p (++%i)\n", src_ptb, address, page_count(virt_to_page(src_ptb))));
+				goto share;
+			}
+			if ((pmd_val(*src_pmd) & PAGE_MASK) != ((pmd_val(*dst_pmd) & PAGE_MASK)))
+				goto no_share;
+			spin_lock(&src->page_table_lock);
+share:
+			do {
+				pte_t pte = *src_ptb;
+				if (!pte_none(pte) && pte_present(pte)) {
+					struct page *page = pte_page(pte);
+					if (VALID_PAGE(page) && !PageReserved(page) && cow)
+						ptep_set_wrprotect(src_ptb);
+				}
+				if ((address += PAGE_SIZE) >= end)
+					goto out_unlock;
+				src_ptb++;
+			} while ((ulong) src_ptb & PTE_TABLE_MASK);
+			spin_unlock(&src->page_table_lock);
+
+			goto cont_pmd_range;
+no_share:
+			dst_ptb = __pte_alloc(dst, dst_pmd, address, 0);
+			if (!dst_ptb)
 				goto nomem;
 
-			spin_lock(&src->page_table_lock);			
+			spin_lock(&src->page_table_lock);
 			do {
-				pte_t pte = *src_pte;
+				pte_t pte = *src_ptb;
 				struct page *ptepage;
-				
-				/* copy_one_pte */
 
 				if (pte_none(pte))
 					goto cont_copy_pte_range_noset;
@@ -240,14 +282,14 @@
 					goto cont_copy_pte_range;
 				}
 				ptepage = pte_page(pte);
-				if ((!VALID_PAGE(ptepage)) || 
+				if ((!VALID_PAGE(ptepage)) ||
 				    PageReserved(ptepage))
 					goto cont_copy_pte_range;
 
 				/* If it's a COW mapping, write protect it both in the parent and the child */
 				if (cow) {
-					ptep_set_wrprotect(src_pte);
-					pte = *src_pte;
+					ptep_set_wrprotect(src_ptb);
+					pte = *src_ptb;
 				}
 
 				/* If it's a shared mapping, mark it clean in the child */
@@ -257,16 +299,16 @@
 				get_page(ptepage);
 				dst->rss++;
 
-cont_copy_pte_range:		set_pte(dst_pte, pte);
+cont_copy_pte_range:		set_pte(dst_ptb, pte);
 cont_copy_pte_range_noset:	address += PAGE_SIZE;
 				if (address >= end)
 					goto out_unlock;
-				src_pte++;
-				dst_pte++;
-			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
+				src_ptb++;
+				dst_ptb++;
+			} while ((ulong) src_ptb & PTE_TABLE_MASK);
 			spin_unlock(&src->page_table_lock);
-		
-cont_copy_pmd_range:	src_pmd++;
+
+cont_pmd_range:		src_pmd++;
 			dst_pmd++;
 		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
 	}
@@ -288,7 +330,6 @@
 		BUG();
 	}
 }
-
 static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
 {
 	unsigned long offset;
@@ -299,10 +340,21 @@
 		return 0;
 	if (pmd_bad(*pmd)) {
 		pmd_ERROR(*pmd);
-		pmd_clear(pmd);
+		pmd_clear(pmd);	
 		return 0;
 	}
+
 	ptep = pte_offset(pmd, address);
+
+	spin_lock(&page_table_share_lock);
+	if (page_count(virt_to_page(ptep)) > 1) {
+		ptab(printk(">>> zap table!!! %p (%i)\n", ptep, page_count(virt_to_page(ptep))));
+		tlb_remove_page(tlb, (pte_t *) pmd, pmd_val(*pmd));
+		spin_unlock(&page_table_share_lock);
+		return 0;
+	}
+	spin_unlock(&page_table_share_lock);
+
 	offset = address & ~PMD_MASK;
 	if (offset + size > PMD_SIZE)
 		size = PMD_SIZE - offset;
@@ -352,6 +404,24 @@
 	return freed;
 }
 
+static int unshare_one(struct mm_struct *mm, pgd_t *dir, unsigned long address)
+{
+	if ((address & PMD_MASK)) {
+		address &= PMD_MASK;
+		if (!pgd_none(*dir)) {
+			pmd_t *pmd = pmd_offset(dir, address);
+			if (!pmd_none(*pmd)) {
+				pte_t *ptb = pte_offset(pmd, address);
+				if (page_count(virt_to_page(ptb)) > 1) {
+					__pte_alloc(mm, pmd, address, 1);
+					return 1;
+				}
+			}
+		}
+	}
+	return 0;
+}
+
 /*
  * remove user pages in a given range.
  */
@@ -374,6 +444,14 @@
 	if (address >= end)
 		BUG();
 	spin_lock(&mm->page_table_lock);
+
+	/*
+	 * Ensure first and last partial page tables are unshared
+	 */
+	do {
+		unshare_one(mm, dir, address);
+	} while (unshare_one(mm, pgd_offset(mm, end & PMD_MASK), end));
+
 	flush_cache_range(mm, address, end);
 	tlb = tlb_gather_mmu(mm);
 
@@ -517,7 +595,7 @@
 
 	mm = current->mm;
 	dprintk ("map_user_kiobuf: begin\n");
-	
+		
 	pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE;
 	/* mapping 0 bytes is not permitted */
 	if (!pgcount) BUG();
@@ -1398,33 +1476,73 @@
  * We've already handled the fast-path in-line, and we own the
  * page table lock.
  */
-pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t *__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int unshare)
 {
-	if (pmd_none(*pmd)) {
-		pte_t *new;
+	struct page *ptb_page;
+	pte_t *new, *src_ptb, *dst_ptb;
 
-		/* "fast" allocation can happen without dropping the lock.. */
-		new = pte_alloc_one_fast(mm, address);
-		if (!new) {
-			spin_unlock(&mm->page_table_lock);
-			new = pte_alloc_one(mm, address);
-			spin_lock(&mm->page_table_lock);
-			if (!new)
-				return NULL;
+	if (pmd_none(*pmd)) {
+		spin_unlock(&mm->page_table_lock);
+		new = pte_alloc_one(mm, address);
+		spin_lock(&mm->page_table_lock);
+		if (!new) return NULL;
 
-			/*
-			 * Because we dropped the lock, we should re-check the
-			 * entry, as somebody else could have populated it..
-			 */
-			if (!pmd_none(*pmd)) {
-				pte_free(new);
-				goto out;
-			}
-		}
+		/* Populated while unlocked? */
+		if (!pmd_none(*pmd))
+			goto unshared_free;
 		pmd_populate(mm, pmd, new);
+unshared:	return pte_offset(pmd, address);
 	}
-out:
-	return pte_offset(pmd, address);
+	if (!unshare || page_count(virt_to_page(pmd_page(*pmd))) == 1)
+		goto unshared;
+	spin_unlock(&mm->page_table_lock);
+	new = pte_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (!new) return NULL;
+
+	assert(page_count(virt_to_page(new)) == 1);
+	spin_lock(&page_table_share_lock);
+	ptb_page = virt_to_page(pmd_page(*pmd));
+	if (page_count(ptb_page) == 1)
+		goto unshared_unlock;
+
+	ptab(printk(">>> make page table %p @ %p %s\n", new, address,
+		unshare == 2? "write fault":
+		unshare == 1? "unshared":
+		"bogus"));
+
+	src_ptb = pte_offset(pmd, 0);
+	dst_ptb = new;
+	ptab(printk(">>> unshare %p (%i--)\n", *pmd, page_count(ptb_page)));
+	// assert(ptb_page->flags & PG_shared_debug);
+	do {
+		pte_t pte = *src_ptb;
+		if (!pte_none(pte)) {
+			if (pte_present(pte)) {
+				struct page *page = pte_page(pte);
+				if (VALID_PAGE(page) && !PageReserved(page)) {
+					get_page(page);
+					pte = pte_mkold(pte_mkclean(pte));
+					mm->rss++;
+				}
+			} else
+				swap_duplicate(pte_to_swp_entry(pte));
+			set_pte(dst_ptb, pte);
+		}
+		src_ptb++;
+		dst_ptb++;
+	} while ((ulong) dst_ptb & PTE_TABLE_MASK);
+	pmd_populate(mm, pmd, new);
+	put_page(ptb_page);
+	spin_unlock(&page_table_share_lock);
+	goto unshared;
+
+unshared_unlock:
+	get_page(ptb_page);
+	spin_unlock(&page_table_share_lock);
+unshared_free:
+	pte_free_slow(new);
+        goto unshared;
 }
 
 int make_pages_present(unsigned long addr, unsigned long end)
--- ../2.4.17.clean/mm/mremap.c	Thu Sep 20 23:31:26 2001
+++ ./mm/mremap.c	Mon Feb 18 22:57:54 2002
@@ -92,6 +92,7 @@
 {
 	unsigned long offset = len;
 
+	ptab(printk(">>> mremap\n"));
 	flush_cache_range(mm, old_addr, old_addr + len);
 
 	/*
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/