2002-02-16 18:03:45

by Daniel Phillips

Subject: [RFC] Page table sharing

Hi,

I think this patch is ready to look at now. It's been pretty stable, though
I haven't gone as far as booting with it - page table sharing is still
restricted to uid 9999. I'm running it on a 2 way under moderate load without
apparent problems. The speedup on forking from a parent with large vm is
*way* more than I expected.

I haven't fully analyzed the locking yet, but I'm beginning to suspect it
just works as is, i.e., I haven't exposed any new critical regions. I'd be
happy to be corrected on that though.

Changed from the previous version:

- Debug tracing is in macros now, and off by default
- TLB flushing in zap_pte_range is hopefully correct now

--
Daniel

--- ../2.4.17.clean/fs/exec.c Fri Dec 21 12:41:55 2001
+++ ./fs/exec.c Sat Feb 16 17:41:52 2002
@@ -860,6 +860,7 @@
int retval;
int i;

+ ptab(printk(">>> execve %s\n", filename));
file = open_exec(filename);

retval = PTR_ERR(file);
--- ../2.4.17.clean/include/linux/mm.h Fri Dec 21 12:42:03 2001
+++ ./include/linux/mm.h Sat Feb 16 17:41:52 2002
@@ -411,7 +411,7 @@

extern int vmtruncate(struct inode * inode, loff_t offset);
extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int write));
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
@@ -424,6 +424,19 @@

int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+
+static inline pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+ return __pte_alloc(mm, pmd, address, 1);
+}
+
+#define nil do { } while (0)
+
+#if 0
+# define ptab(cmd) cmd
+#else
+# define ptab(cmd) nil
+#endif

/*
* On a two-level page table, this ends up being trivial. Thus the
--- ../2.4.17.clean/include/linux/sched.h Fri Dec 21 12:42:03 2001
+++ ./include/linux/sched.h Sat Feb 16 17:41:52 2002
@@ -427,7 +427,7 @@
#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */
#define PF_FREE_PAGES 0x00002000 /* per process page freeing */
#define PF_NOIO 0x00004000 /* avoid generating further I/O */
-
+#define PF_SHARE_TABLES 0x00008000 /* share page tables (testing) */
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */

/*
--- ../2.4.17.clean/kernel/fork.c Wed Nov 21 13:18:42 2001
+++ ./kernel/fork.c Sat Feb 16 17:41:52 2002
@@ -566,9 +566,10 @@
struct task_struct *p;
struct completion vfork;

+ ptab(printk(">>> fork, stack=%li\n", stack_start));
retval = -EPERM;

- /*
+ /*
* CLONE_PID is only allowed for the initial SMP swapper
* calls
*/
--- ../2.4.17.clean/kernel/sys.c Tue Sep 18 17:10:43 2001
+++ ./kernel/sys.c Sat Feb 16 17:41:52 2002
@@ -514,6 +514,11 @@
current->uid = new_ruid;
current->user = new_user;
free_uid(old_user);
+
+ if (current->uid == 9999)
+ current->flags |= PF_SHARE_TABLES;
+ printk(">>> user: uid=%i pid=%i pf=%x\n", current->uid, current->pid, current->flags);
+
return 0;
}

--- ../2.4.17.clean/mm/memory.c Fri Dec 21 12:42:05 2001
+++ ./mm/memory.c Sat Feb 16 17:41:52 2002
@@ -34,6 +34,9 @@
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* ([email protected])
+ *
+ * Feb 2002 - Shared page tables added by Daniel Phillips
+ * ([email protected])
*/

#include <linux/mm.h>
@@ -100,8 +103,12 @@
return;
}
pte = pte_offset(dir, 0);
- pmd_clear(dir);
- pte_free(pte);
+ if (current->uid == 9999 || page_count(virt_to_page(pte)) > 1)
+ ptab(printk(">>> free page table %p (%i)\n", pte, page_count(virt_to_page(pte))));
+ if (put_page_testzero(virt_to_page(pte))) {
+ pmd_clear(dir);
+ pte_free(pte);
+ }
}

static inline void free_one_pgd(pgd_t * dir)
@@ -143,8 +150,8 @@
*/
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
- pgd_t * page_dir = mm->pgd;
-
+ pgd_t *page_dir = mm->pgd;
+ ptab(printk(">>> clear_page_tables\n"));
spin_lock(&mm->page_table_lock);
page_dir += first;
do {
@@ -171,13 +178,21 @@
* dst->page_table_lock is held on entry and exit,
* but may be dropped within pmd_alloc() and pte_alloc().
*/
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma)
{
- pgd_t * src_pgd, * dst_pgd;
+ pgd_t *src_pgd, *dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+ int share_page_tables = !!(current->flags & PF_SHARE_TABLES);
+
+#if 0
+ static int teststart = 0, testcount = 999, tests = 0;
+ if (share_page_tables && (tests++ < teststart || tests > teststart + testcount))
+ share_page_tables = 0;
+ if (share_page_tables)
+ printk(">>> copy_page_range test %i\n", tests - 1);
+#endif

src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -186,15 +201,15 @@
pmd_t * src_pmd, * dst_pmd;

src_pgd++; dst_pgd++;
-
+
/* copy_pmd_range */
-
+
if (pgd_none(*src_pgd))
- goto skip_copy_pmd_range;
+ goto skip_pmd_range;
if (pgd_bad(*src_pgd)) {
pgd_ERROR(*src_pgd);
pgd_clear(src_pgd);
-skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
+skip_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
if (!address || (address >= end))
goto out;
continue;
@@ -204,34 +219,56 @@
dst_pmd = pmd_alloc(dst, dst_pgd, address);
if (!dst_pmd)
goto nomem;
-
do {
- pte_t * src_pte, * dst_pte;
-
- /* copy_pte_range */
-
+ pte_t *src_ptb, *dst_ptb;
+
if (pmd_none(*src_pmd))
- goto skip_copy_pte_range;
+ goto skip_ptb_range;
if (pmd_bad(*src_pmd)) {
pmd_ERROR(*src_pmd);
pmd_clear(src_pmd);
-skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
+skip_ptb_range: address = (address + PMD_SIZE) & PMD_MASK;
if (address >= end)
goto out;
- goto cont_copy_pmd_range;
+ goto cont_pmd_range;
}

- src_pte = pte_offset(src_pmd, address);
- dst_pte = pte_alloc(dst, dst_pmd, address);
- if (!dst_pte)
+ src_ptb = pte_offset(src_pmd, address);
+
+ if (!share_page_tables) goto no_share;
+
+ if (pmd_none(*dst_pmd)) {
+ get_page(virt_to_page(src_ptb));
+ pmd_populate(dst, dst_pmd, ((unsigned long) src_ptb & PAGE_MASK));
+ ptab(printk(">>> share %p @ %p (%i)\n", src_ptb, address,
+ page_count(virt_to_page(src_ptb))));
+ } else if (page_count(virt_to_page(src_ptb)) == 1) // should test for ptbs !=
+ goto no_share;
+
+ spin_lock(&src->page_table_lock);
+ do {
+ pte_t pte = *src_ptb;
+ if (!pte_none(pte) && pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page) && cow)
+ ptep_set_wrprotect(src_ptb);
+ }
+ if ((address += PAGE_SIZE) >= end)
+ goto out_unlock;
+ src_ptb++;
+ } while ((unsigned) src_ptb & PTE_TABLE_MASK);
+ spin_unlock(&src->page_table_lock);
+
+ goto cont_pmd_range;
+no_share:
+ dst_ptb = __pte_alloc(dst, dst_pmd, address, 0);
+ if (!dst_ptb)
goto nomem;

- spin_lock(&src->page_table_lock);
+ spin_lock(&src->page_table_lock);
do {
- pte_t pte = *src_pte;
+ pte_t pte = *src_ptb;
struct page *ptepage;
-
- /* copy_one_pte */

if (pte_none(pte))
goto cont_copy_pte_range_noset;
@@ -240,14 +277,14 @@
goto cont_copy_pte_range;
}
ptepage = pte_page(pte);
- if ((!VALID_PAGE(ptepage)) ||
+ if ((!VALID_PAGE(ptepage)) ||
PageReserved(ptepage))
goto cont_copy_pte_range;

/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow) {
- ptep_set_wrprotect(src_pte);
- pte = *src_pte;
+ ptep_set_wrprotect(src_ptb);
+ pte = *src_ptb;
}

/* If it's a shared mapping, mark it clean in the child */
@@ -257,16 +294,16 @@
get_page(ptepage);
dst->rss++;

-cont_copy_pte_range: set_pte(dst_pte, pte);
+cont_copy_pte_range: set_pte(dst_ptb, pte);
cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out_unlock;
- src_pte++;
- dst_pte++;
- } while ((unsigned long)src_pte & PTE_TABLE_MASK);
+ src_ptb++;
+ dst_ptb++;
+ } while ((unsigned) src_ptb & PTE_TABLE_MASK);
spin_unlock(&src->page_table_lock);
-
-cont_copy_pmd_range: src_pmd++;
+
+cont_pmd_range: src_pmd++;
dst_pmd++;
} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
}
@@ -302,7 +339,18 @@
pmd_clear(pmd);
return 0;
}
+
ptep = pte_offset(pmd, address);
+
+ if (page_count(virt_to_page(ptep)) > 1) {
+ ptab(printk(">>> zap table!!! %p (%i)\n",
+ ptep, page_count(virt_to_page(ptep))));
+ // pmd_clear(pmd);
+ // put_page(virt_to_page(ptep));
+ tlb_remove_page(tlb, (pte_t *) pmd, pmd_val(*pmd));
+ return 0;
+ }
+
offset = address & ~PMD_MASK;
if (offset + size > PMD_SIZE)
size = PMD_SIZE - offset;
@@ -346,12 +394,30 @@
freed = 0;
do {
freed += zap_pte_range(tlb, pmd, address, end - address);
- address = (address + PMD_SIZE) & PMD_MASK;
+ address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
return freed;
}

+static int unshare_one(struct mm_struct *mm, pgd_t *dir, unsigned long address)
+{
+ if ((address & PMD_MASK)) {
+ address &= PMD_MASK;
+ if (!pgd_none(*dir)) {
+ pmd_t *pmd = pmd_offset(dir, address);
+ if (!pmd_none(*pmd)) {
+ pte_t *ptb = pte_offset(pmd, address);
+ if (page_count(virt_to_page(ptb)) > 1) {
+ __pte_alloc(mm, pmd, address, 1);
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
/*
* remove user pages in a given range.
*/
@@ -361,6 +427,7 @@
pgd_t * dir;
unsigned long start = address, end = address + size;
int freed = 0;
+ ptab(printk(">>> zap_page_range %lx+%lx\n", address, size));

dir = pgd_offset(mm, address);

@@ -374,6 +441,14 @@
if (address >= end)
BUG();
spin_lock(&mm->page_table_lock);
+
+ /*
+ * Ensure first and last partial page tables are unshared
+ */
+ do {
+ unshare_one(mm, dir, address);
+ } while (unshare_one(mm, pgd_offset(mm, end & PMD_MASK), end));
+
flush_cache_range(mm, address, end);
tlb = tlb_gather_mmu(mm);

@@ -1348,7 +1423,8 @@
pmd = pmd_alloc(mm, pgd, address);

if (pmd) {
- pte_t * pte = pte_alloc(mm, pmd, address);
+ pte_t *pte = __pte_alloc(mm, pmd, address,
+ write_access /*&& !(vma->vm_flags & VM_SHARED)*/);
if (pte)
return handle_pte_fault(mm, vma, address, write_access, pte);
}
@@ -1398,28 +1474,50 @@
* We've already handled the fast-path in-line, and we own the
* page table lock.
*/
-pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t *__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int write)
{
- if (pmd_none(*pmd)) {
- pte_t *new;
-
- /* "fast" allocation can happen without dropping the lock.. */
- new = pte_alloc_one_fast(mm, address);
+ if (pmd_none(*pmd) || (write && page_count(virt_to_page(pmd_page(*pmd))) > 1)) {
+ pte_t *new = pte_alloc_one_fast(mm, address);
if (!new) {
spin_unlock(&mm->page_table_lock);
new = pte_alloc_one(mm, address);
spin_lock(&mm->page_table_lock);
if (!new)
return NULL;
-
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (!pmd_none(*pmd)) {
+ /* Recheck in case populated while unlocked */
+ if (!pmd_none(*pmd) && !(write && page_count(virt_to_page(pmd_page(*pmd))) > 1)) {
pte_free(new);
goto out;
}
+ }
+ ptab(printk(">>> make page table %p @ %p %s\n", new, address,
+ write == 2? "write fault":
+ write == 1? "unshared":
+ write == 0? "sharable":
+ "bogus"));
+ if (!page_count(virt_to_page(new)) == 1) BUG();
+ if (!pmd_none(*pmd)) {
+ pte_t *src_ptb = pte_offset(pmd, 0);
+ pte_t *dst_ptb = new;
+ ptab(printk(">>> unshare %p (%i--)\n", *pmd, page_count(virt_to_page(pmd_page(*pmd)))));
+ do {
+ pte_t pte = *src_ptb;
+ if (!pte_none(pte)) {
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page)) {
+ get_page(page);
+ pte = pte_mkold(pte_mkclean(pte));
+ mm->rss++;
+ }
+ } else
+ swap_duplicate(pte_to_swp_entry(pte));
+ set_pte(dst_ptb, pte);
+ }
+ src_ptb++;
+ dst_ptb++;
+ } while ((unsigned) dst_ptb & PTE_TABLE_MASK);
+ put_page(virt_to_page(pmd_page(*pmd)));
}
pmd_populate(mm, pmd, new);
}
--- ../2.4.17.clean/mm/mremap.c Thu Sep 20 23:31:26 2001
+++ ./mm/mremap.c Sat Feb 16 17:41:52 2002
@@ -92,6 +92,7 @@
{
unsigned long offset = len;

+ ptab(printk(">>> mremap\n"));
flush_cache_range(mm, old_addr, old_addr + len);

/*


2002-02-16 20:22:49

by Linus Torvalds

Subject: Re: [RFC] Page table sharing



On Sat, 16 Feb 2002, Daniel Phillips wrote:
>
> I think this patch is ready to look at now. It's been pretty stable, though
> I haven't gone as far as booting with it - page table sharing is still
> restricted to uid 9999. I'm running it on a 2 way under moderate load without
> apparent problems. The speedup on forking from a parent with large vm is
> *way* more than I expected.

I'd really like to hear what happens when you enable it unconditionally,
and then run various real loads along with things like "lmbench".

Also, when you test forking over a parent, do you test just the fork, or
do you test the "fork+wait" combination that waits for the child to exit
too? The latter is the only really meaningful thing to test.
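A minimal userspace sketch of that fork+wait timing loop (hypothetical, not from any patch in this thread) might look like:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <unistd.h>

/* Time n fork()+wait() pairs, so the child's exit and teardown cost is
 * part of the measurement instead of running in the background. */
int main(void)
{
	struct timeval start, stop;
	long usec;
	int i, n = 100;

	gettimeofday(&start, NULL);
	for (i = 0; i < n; i++) {
		pid_t pid = fork();
		if (pid == 0)
			_exit(0);		/* child exits immediately */
		if (pid < 0)
			exit(1);		/* fork failed */
		waitpid(pid, NULL, 0);		/* wait for this child */
	}
	gettimeofday(&stop, NULL);

	usec = (stop.tv_sec - start.tv_sec) * 1000000L
	     + (stop.tv_usec - start.tv_usec);
	printf("%ld usec per fork+wait\n", usec / n);
	return 0;
}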

Anyway, the patch certainly looks pretty simple and small. Great.

> I haven't fully analyzed the locking yet, but I'm beginning to suspect it
> just works as is, i.e., I haven't exposed any new critical regions. I'd be
> happy to be corrected on that though.

What's the protection against two different MM's doing a
"zap_page_range()" concurrently, both thinking that they can just drop the
page table directory entry, and neither actually freeing it? I don't see
any such logic there..

I suspect that the only _good_ way to handle it is to do

pmd_page = ..

if (put_page_testzero(pmd_page)) {
.. free the actual page table entries ..
__free_pages_ok(pmd_page, 0);
}

instead of using the free_page() logic. Maybe you do that already, I
didn't go through the patches _that_ closely.

Ie you'd do a "two-phase" page free - first do the count handling, and if
that indicates you should really free the pmd, you free the lower page
tables before you physically free the pmd page (ie the page is "live" even
though it has a count of zero).

Linus

2002-02-16 21:03:54

by Daniel Phillips

Subject: Re: [RFC] Page table sharing

On February 16, 2002 09:21 pm, Linus Torvalds wrote:
> On Sat, 16 Feb 2002, Daniel Phillips wrote:
> >
> > I think this patch is ready to look at now. It's been pretty stable,
> > though I haven't gone as far as booting with it - page table sharing is
> > still restricted to uid 9999. I'm running it on a 2 way under moderate
> > load without apparent problems. The speedup on forking from a parent
> > with large vm is *way* more than I expected.
>
> I'd really like to hear what happens when you enable it unconditionally,
> and then run various real loads along with things like "lmbench".

I'm running on it unconditionally now. I'm still up though I haven't run
really heavy stress tests. I've had one bug report, curiously with the
version keying on uid 9999 while *not* running as uid 9999. This makes me
think that there are some uninitialized page use counts on page tables
somewhere in the system.

> Also, when you test forking over a parent, do you test just the fork, or
> do you test the "fork+wait" combination that waits for the child to exit
> too? The latter is the only really meaningful thing to test.

Will do. Does this do the trick:

wait();
gettimeofday(&etime, NULL);

If so, it doesn't affect the timings at all; in other words, I haven't just
pushed the work into the child's exit.

> Anyway, the patch certainly looks pretty simple and small. Great.
>
> > I haven't fully analyzed the locking yet, but I'm beginning to suspect it
> > just works as is, i.e., I haven't exposed any new critical regions. I'd
> > be happy to be corrected on that though.
>
> What's the protection against two different MM's doing a
> "zap_page_range()" concurrently, both thinking that they can just drop the
> page table directory entry, and neither actually freeing it? I don't see
> any such logic there..

Nothing prevents that, duh.

> I suspect that the only _good_ way to handle it is to do
>
> pmd_page = ..
>
> if (put_page_testzero(pmd_page)) {
> .. free the actual page table entries ..
> __free_pages_ok(pmd_page, 0);
> }
>
> instead of using the free_page() logic. Maybe you do that already, I
> didn't go through the patches _that_ closely.

I do something similar in clear_page_tables->free_one_pmd, after the entries
are all gone. I have to do something different in zap_page_range - it wants
to free the pmd only if the count is *greater* than one, and can't tolerate
two mms thinking that at the same time. I think I'd better lock the pmd page
there.

> Ie you'd do a "two-phase" page free - first do the count handling, and if
> that indicates you should really free the pmd, you free the lower page
> tables before you physically free the pmd page (ie the page is "live" even
> though it has a count of zero).

Actually, I'm not freeing the pmd, I'm freeing *pmd, a page table. So, if the
count is > 1 it can be freed without doing anything to the ptes on it. This
is the entire source of the speedup, by the way.
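As a sketch, that fast path is roughly the following (hypothetical; it matches
the commented-out lines in zap_pte_range in the patch, ignores TLB flushing,
and ignores the concurrent-zap race raised above):

	pte_t *ptb = pte_offset(pmd, 0);
	struct page *ptb_page = virt_to_page(ptb);

	if (page_count(ptb_page) > 1) {
		/* Shared: detach the whole pte page from this mm by
		 * dropping one reference, without visiting its ptes. */
		pmd_clear(pmd);
		put_page(ptb_page);
		return 0;	/* nothing zapped here; other mms keep the table */
	}
	/* count == 1: exclusive owner, fall through to the normal
	 * pte-by-pte zap under this mm's page_table_lock */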

--
Daniel

2002-02-17 22:15:40

by Hugh Dickins

Subject: Re: [RFC] Page table sharing

On Sun, 17 Feb 2002, Daniel Phillips wrote:
>
> Note that we have to hold the page_table_share_lock until we're finished
> copying in the ptes, otherwise the source could go away. This can turn
> into a lock on the page table itself eventually, so whatever contention
> there might be will be eliminated.
>
> Fixing up copy_page_range to bring the pmd populate inside the
> mm->page_table_lock is trivial, I won't go into it here. With that plus
> changes as above, I think it's tight. Though I would not bet my life on
> it ;-)

Sorry, I didn't really try to follow your preceding discussion of
zap_page_range. (I suspect you need to step back and think again if it
gets that messy; but that may be unfair, I haven't thought it through).

You need your "page_table_share_lock" (better, per-page-table spinlock)
much more than you seem to realize. If mm1 and mm2 share a page table,
mm1->page_table_lock and mm2->page_table_lock give no protection against
each other. Consider copy_page_range from mm1 or __pte_alloc in mm1
while try_to_swap_out is acting on shared page table in mm2. In fact,
I think even the read faults are vulnerable to races (mm1 and mm2
bringing page in at the same time so double-counting it), since your
__pte_alloc doesn't regard a read fault as reason to break the share.

I'm also surprised that your copy_page_range appears to be setting
write protect on each pte, including expensive pte_page, VALID_PAGE
stuff on each. You avoid actually copying pte and incrementing counts,
but I'd expect you to want to avoid the whole scan: invalidating entry
for the page table itself, to force fault if needed.

Hugh

2002-02-17 06:24:50

by Linus Torvalds

Subject: Re: [RFC] Page table sharing



On Sat, 16 Feb 2002, Daniel Phillips wrote:
> >
> > if (put_page_testzero(pmd_page)) {
> > .. free the actual page table entries ..
> > __free_pages_ok(pmd_page, 0);
> > }
> >
> > instead of using the free_page() logic. Maybe you do that already, I
> > didn't go through the patches _that_ closely.
>
> I do something similar in clear_page_tables->free_one_pmd, after the entries
> are all gone. I have to do something different in zap_page_range - it wants
> to free the pmd only if the count is *greater* than one, and can't tolerate
> two mms thinking that at the same time. I think I'd better lock the pmd page
> there.

But that's ok.

If you have the logic that

if (put_page_testzero(pmd_page)) {
... do the lower-level free ...
__free_pages_ok(pmd_page, 0);
}

then you automatically have exactly the behaviour you want, with no
locking at all (except for the "local" locking inherent in the atomic
decrement-and-test).

What you have is:
- _if_ the count was > 1, then you do nothing at all (except for
decrementing your count)
- for the last user (and for that _only_), where the count was 1 and goes
to zero, you'll do the inside of the "if ()" statement, and actually
clean up the page table and free the pmd.

So you not only get the optimization you want, you also quite naturally
get the "exclusive last user" case.

Linus

2002-02-18 01:31:40

by Daniel Phillips

Subject: Re: [RFC] Page table sharing

On February 17, 2002 11:16 pm, Hugh Dickins wrote:
> On Sun, 17 Feb 2002, Daniel Phillips wrote:
> >
> > Note that we have to hold the page_table_share_lock until we're finished
> > copying in the ptes, otherwise the source could go away. This can turn
> > into a lock on the page table itself eventually, so whatever contention
> > there might be will be eliminated.
> >
> > Fixing up copy_page_range to bring the pmd populate inside the
> > mm->page_table_lock is trivial, I won't go into it here. With that plus
> > changes as above, I think it's tight. Though I would not bet my life on
> > it ;-)
>
> Sorry, I didn't really try to follow your preceding discussion of
> zap_page_range. (I suspect you need to step back and think again if it
> gets that messy; but that may be unfair, I haven't thought it through).

Oh believe me, it could get a lot messier than what I described. This
problem is hard because it's hard.

> You need your "page_table_share_lock" (better, per-page-table spinlock)
> much more than you seem to realize. If mm1 and mm2 share a page table,
> mm1->page_table_lock and mm2->page_table_lock give no protection against
> each other.

Unless we decrement and find that the count was originally 1, that means
we are the exclusive owner and are protected by the mm->page_table_lock
we hold. Only if that is not the case do we need the extra spinlock.

Please ping me right away if this analysis is wrong.
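Rendered as code, the claim is roughly this (sketch only; ptb_page is the
virt_to_page() of the page table, and our own mm->page_table_lock is held
throughout):

	if (put_page_testzero(ptb_page)) {
		/* Count was 1: this mm was the exclusive owner all along,
		 * so the mm->page_table_lock we hold is all the protection
		 * needed; put the count back and take the fast path. */
		get_page(ptb_page);
		/* ... proceed as if the table had never been shared ... */
	} else {
		/* Count was > 1: another mm still references the table,
		 * so the extra spinlock is needed before deciding more. */
		spin_lock(&page_table_share_lock);
		/* ... re-check page_count(ptb_page) under the lock ... */
		spin_unlock(&page_table_share_lock);
	}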

> Consider copy_page_range from mm1 or __pte_alloc in mm1
> while try_to_swap_out is acting on shared page table in mm2. In fact,
> I think even the read faults are vulnerable to races (mm1 and mm2
> bringing page in at the same time so double-counting it), since your
> __pte_alloc doesn't regard a read fault as reason to break the share.

This is exactly what I've been considering, intensively, for days.
(Sleeping has been optional ;-) Please re-evaluate this in light of the
exclusive owner observation above.

> I'm also surprised that your copy_page_range appears to be setting
> write protect on each pte, including expensive pte_page, VALID_PAGE
> stuff on each.

What do you mean? Look at my copy_page_range inner loop:

do {
pte_t pte = *src_ptb;
if (!pte_none(pte) && pte_present(pte)) {
struct page *page = pte_page(pte);
if (VALID_PAGE(page) && !PageReserved(page) && cow)
ptep_set_wrprotect(src_ptb);
}
if ((address += PAGE_SIZE) >= end)
goto out_unlock;
src_ptb++;
} while ((unsigned) src_ptb & PTE_TABLE_MASK);

The && cow line takes care of it. One thing I'm not doing is taking the
obvious optimization for inherited shared memory, you'll see it's commented
out. For no particular reason, it's a carryover from the early debugging.

> You avoid actually copying pte and incrementing counts,
> but I'd expect you to want to avoid the whole scan: invalidating entry
> for the page table itself, to force fault if needed.

Yes, I'm concentrating on correctness right now, but the results are far
from shabby anyway. I'm forking 2-5 times as fast from large-memory
parents:

time ./test 100 10000
100 forks from parent with 10000 pages of mapped memory...
Old total fork time: 0.488664 seconds in 100 iterations (4886 usec/fork)
New total fork time: 0.218832 seconds in 100 iterations (2188 usec/fork)
0.03user 0.82system 0:01.30elapsed 65%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (889major+20471minor)pagefaults 0swaps

time ./test 10 100000
10 forks from parent with 100000 pages of mapped memory...
Old total fork time: 1.045828 seconds in 10 iterations (104582 usec/fork)
New total fork time: 0.225679 seconds in 10 iterations (22567 usec/fork)
0.05user 64.89system 1:06.56elapsed 97%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (1189major+200386minor)pagefaults 0swaps

I guess the real fork speedup is 5X and the rest is overhead. By the way,
notice how long it takes to set up the memory table for the large-memory
parent; it looks like there's a basic vm problem (2.4.17).

Using unmapped page tables or Linus's suggestion - setting the page table
RO - is a short step from here. This will bring these timings from impressive
to downright silly. However, unless somebody wants to do this hack right now
I'll put it aside for a couple of weeks while the current code stabilizes.
Oh, and I guess I'd better apologize to Linus for calling his suggestion a
'decoration'.

Here's my test script:

echo $1 forks from parent with $2 pages of mapped memory...
echo -n "Old "; sudo -u old ./forktime -n $1 -m $2
echo -n "New "; sudo -u new ./forktime -n $1 -m $2

And the c program 'forktime':

/*
* Time fork() system call.
*
* Original by Dave McCracken
* Reformat by Daniel Phillips
*/

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/wait.h>

int main(int argc, char *argv[])
{
int n=1, m=1, i, pagesize = getpagesize();
struct timeval stime, etime, rtime;
extern char *optarg;
char *mem;

args: switch (getopt(argc, argv, "n:m:")) {
case 'n':
n = atoi(optarg);
goto args;
case 'm':
m = atoi(optarg);
goto args;
case -1:
break;
default:
fprintf(stderr, "Usage: %s [-n iterations] [-m mappedpages]\n", argv[0]);
exit(1);
}

mem = mmap(NULL, m * pagesize, PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0);

for (i = 0; i < m; i++)
mem[i * pagesize] = 0;

gettimeofday (&stime, NULL);

for (i = 0; i < n; i++)
switch (fork()) {
case 0: exit(0);
case -1: exit(2);
}

wait(NULL);
gettimeofday(&etime, NULL);
timersub(&etime, &stime, &rtime);
printf("total fork time: %d.%06d seconds in %d iterations ", rtime.tv_sec, rtime.tv_usec, i);
printf("(%d usec/fork)\n", ((rtime.tv_sec * 1000000) + rtime.tv_usec) / i);
exit(0);
}

--
Daniel

2002-02-17 20:12:06

by Daniel Phillips

Subject: Re: [RFC] Page table sharing

Here's the patch as currently posted. I've been hacking on it to implement
the locking described in the previous mail, but really I think it's better to
go with the simple, incorrect and lockless version for group pondering.

http://people.nl.linux.org/~phillips/patches/ptab-2.4.17
(almost the same as posted to lkml/linus yesterday)

In the posted patch, tracing is configed off, see:

#if 0
# define ptab(cmd) cmd
#else
# define ptab(cmd) nil
#endif

in mm.h. Anybody who actually wants to hack on this will probably want to
turn it on. Sharing is also still restricted to id 9999, even though I find
I'm able to boot and run pretty well with system-wide sharing. It's not fully
correct though, because there are lockups happening in some benchmarks
(unixbench and some crazy things invented by Andrew Morton) and UML fails to
start properly. UML does work properly when sharing is restricted to just
one ID, i.e., something deep in the system doesn't like sharing page tables
at this point.

--
Daniel

2002-02-18 09:36:59

by Daniel Phillips

Subject: Re: [RFC] Page table sharing

In short, you were entirely right about the spinlocks. Linus and I were
tilting at windmills. This gradually became clear as I coded it and
hopefully what I have now is pretty tight. In fact, I just booted it and it
works a lot better: UML boots up now, whereas before it bombed with some weird
error having to do with process ids.

On February 18, 2002 09:09 am, Hugh Dickins wrote:
> On Mon, 18 Feb 2002, Daniel Phillips wrote:
> > On February 17, 2002 11:16 pm, Hugh Dickins wrote:
> > > Consider copy_page_range from mm1 or __pte_alloc in mm1
> > > while try_to_swap_out is acting on shared page table in mm2. In fact,
> > > I think even the read faults are vulnerable to races (mm1 and mm2
> > > bringing page in at the same time so double-counting it), since your
> > > __pte_alloc doesn't regard a read fault as reason to break the share.
> >
> > This is exactly what I've been considering, intensively, for days.
> > (Sleeping has been optional ;-) Please re-evaluate this in light of the
> > exclusive owner observation above.
>
> I only see such page_count code under zap_page_range, and in __pte_alloc
> for write-access case. mm/vmscan.c isn't even in the patch (I'm looking
> at the one you emailed on Saturday night), and there's no code of that
> kind in the header files in the patch.

I don't have to exclude against vmscan.c, which is just unmapping pages, as
opposed to page tables.

> So how is the page_table_lock taken by swap_out effective when it's
> dealing with a page table shared by another mm than the one it is
> locking? And when handling a read-fault, again no such code (but
> when handling a write-fault, __pte_alloc has unshared in advance).
>
> Since copy_page_range would not copy shared page tables, I'm wrong to
> point there. But __pte_alloc does copy shared page tables (to unshare
> them), and needs them to be stable while it does so: so locking against
> swap_out really is required. It also needs locking against read faults,
> and they against each other: but there I imagine it's just a matter of
> dropping the write arg to __pte_alloc, going back to pte_alloc again.

You're right about the read faults, wrong about swap_out. In general you've
been more right than wrong, so thanks. I'll post a new patch pretty soon and
I'd appreciate your comments.

After messing around a lot with braindamaged locking strategies involving
dec-and-test-zero I finally realized it's never going to work if everybody is
dec'ing at the same time, as you obviously knew right away. I switched to a
boring but effective spin lock strategy and was rewarded with an immediate
improvement in stability under smp. It might even be right now, though it
needs considerably more auditing.

I'll write up a detailed rationale of what I ended up doing; it really
needs it. I'm not finished debugging: there's a memory leak, it's pretty
gross and it should be easy to track down. Here are the latest timings:

1000 forks from parent with 100 pages of mapped memory...
Old total fork time: 0.325903 seconds in 1000 iterations (325 usec/fork)
New total fork time: 0.351586 seconds in 1000 iterations (351 usec/fork)

1000 forks from parent with 500 pages of mapped memory...
Old total fork time: 0.471532 seconds in 1000 iterations (471 usec/fork)
New total fork time: 0.533865 seconds in 1000 iterations (533 usec/fork)

1000 forks from parent with 1000 pages of mapped memory...
Old total fork time: 0.671050 seconds in 1000 iterations (671 usec/fork)
New total fork time: 0.637614 seconds in 1000 iterations (637 usec/fork)

100 forks from parent with 10000 pages of mapped memory...
Old total fork time: 0.490882 seconds in 100 iterations (4908 usec/fork)
New total fork time: 0.213267 seconds in 100 iterations (2132 usec/fork)

100 forks from parent with 50000 pages of mapped memory...
Old total fork time: 2.187904 seconds in 100 iterations (21879 usec/fork)
New total fork time: 0.954625 seconds in 100 iterations (9546 usec/fork)

100 forks from parent with 100000 pages of mapped memory...
Old total fork time: 4.277711 seconds in 100 iterations (42777 usec/fork)
New total fork time: 1.940009 seconds in 100 iterations (19400 usec/fork)

100 forks from parent with 200000 pages of mapped memory...
Old total fork time: 15.873854 seconds in 100 iterations (158738 usec/fork)
New total fork time: 2.651530 seconds in 100 iterations (26515 usec/fork)

100 forks from parent with 200000 pages of mapped memory...
Old total fork time: 19.209013 seconds in 100 iterations (192090 usec/fork)
New total fork time: 3.684045 seconds in 100 iterations (36840 usec/fork)

(Look at the last one, the nonshared fork forces the system into swap. I ran
it twice to verify, the second time from a clean reboot. This is another
reason why shared page tables are good.)

--
Daniel

2002-02-18 08:08:04

by Hugh Dickins

Subject: Re: [RFC] Page table sharing

On Mon, 18 Feb 2002, Daniel Phillips wrote:
> On February 17, 2002 11:16 pm, Hugh Dickins wrote:
>
> > You need your "page_table_share_lock" (better, per-page-table spinlock)
> > much more than you seem to realize. If mm1 and mm2 share a page table,
> > mm1->page_table_lock and mm2->page_table_lock give no protection against
> > each other.
>
> Unless we decrement and find that the count was originally 1, that means
> we are the exclusive owner and are protected by the mm->page_table_lock
> we hold. Only if that is not the case do we need the extra spinlock.

Correct (assuming it's coded correctly).

> > Consider copy_page_range from mm1 or __pte_alloc in mm1
> > while try_to_swap_out is acting on shared page table in mm2. In fact,
> > I think even the read faults are vulnerable to races (mm1 and mm2
> > bringing page in at the same time so double-counting it), since your
> > __pte_alloc doesn't regard a read fault as reason to break the share.
>
> This is exactly what I've been considering, intensively, for days.
> (Sleeping has been optional ;-) Please re-evaluate this in light of the
> exclusive owner observation above.

I only see such page_count code under zap_page_range, and in __pte_alloc
for write-access case. mm/vmscan.c isn't even in the patch (I'm looking
at the one you emailed on Saturday night), and there's no code of that
kind in the header files in the patch.

So how is the page_table_lock taken by swap_out effective when it's
dealing with a page table shared by another mm than the one it is
locking? And when handling a read-fault, again no such code (but
when handling a write-fault, __pte_alloc has unshared in advance).

Since copy_page_range would not copy shared page tables, I'm wrong to
point there. But __pte_alloc does copy shared page tables (to unshare
them), and needs them to be stable while it does so: so locking against
swap_out really is required. It also needs locking against read faults,
and they against each other: but there I imagine it's just a matter of
dropping the write arg to __pte_alloc, going back to pte_alloc again.

Hugh

2002-02-17 19:35:05

by Daniel Phillips

Subject: Re: [RFC] Page table sharing

(lots of cc's added because vger is down)

On February 17, 2002 07:23 am, Linus Torvalds wrote:
> On Sat, 16 Feb 2002, Daniel Phillips wrote:
> > >
> > > if (put_page_testzero(pmd_page)) {
> > > .. free the actual page table entries ..
> > > __free_pages_ok(pmd_page, 0);
> > > }
> > >
> > > instead of using the free_page() logic. Maybe you do that already, I
> > > didn't go through the patches _that_ closely.
> >
> > I do something similar in clear_page_tables->free_one_pmd, after the entries
> > are all gone. I have to do something different in zap_page_range - it wants
> > to free the pmd only if the count is *greater* than one, and can't tolerate
> > two mms thinking that at the same time. I think I'd better lock the pmd page
> > there.
>
> But that's ok.

You're right, I gave up trying to find the minimal solution too soon. Now,
the solution I'm looking for is even more minimal than what you had in mind,
which would involve some reorganization of zap_*. Please bear with me as I
circle around this problem a little more...

> If you have the logic that
>
> if (put_page_testzero(pmd_page)) {
> ... do the lower-level free ...
> __free_pages_ok(pmd_page, 0);
> }
>
> then you automatically have exactly the behaviour you want, with no
> locking at all (except for the "local" locking inherent in the atomic
> decrement-and-test).

This is headed in the right direction. The key thing we learn when we
hit zero there is that this mm is the exclusive owner, so we can safely
set the use count back to one and continue with the normal zap, because
anybody who wants to share this pmd will need this mm's page_table_lock,
which we hold. (Note that I have to fix copy_page_range a little to
enforce this exclusion.)

Hey, that lets me keep using tlb_put_page as well, which I want to do
because it's carefully tuned to be optimal for arches with special tlb
purging requirements, and because otherwise I have to fight with the
UP/SMP difference in the tlb parameter (in the first case it's an mm, in
the second case it's a fancier thing that points at an mm, I don't want
to have to add an #ifdef SMP if I can avoid it). Once again, what I
have to do here is re-increment the use count, to set the proper initial
conditions for tlb_put_page, and that's where the exclusion goes:

zap_pte_range:

... ensure the page table exists ...

if (!put_page_testzero(page_table_page)) {
spin_lock(page_table_share_lock);
if (page_count(page_table_page)) {
get_page(page_table_page);
tlb_remove_page(...pmd...);
spin_unlock(page_table_share_lock);
return 0; // no ptes cleared here
}
spin_unlock(page_table_share_lock);
}
get_page(page_table_page); // count was one, we own it

... continue with normal zap ...

Nice huh? I hope it's right :-)

OK, there's another hairy aspect of this that you may have missed in
your quick read-through: we have to worry about the difference between
partial and full zapping of the pmd, because in the first case, it's a
BUG if the pmd is shared. So we have to unshare the partial pmd's
before zapping them. Which I'm doing, and then I can rely on count = 1
to tell me that what's happening is a full zap of the pmd. Once we get
to count = 1, our mm->page_table_lock keeps it there. I think that
approach is ok, if a little terse.
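As a sketch, the partial-table unshare looks something like this (hypothetical
helper; the patch's unshare_one() plays this role for the first and last page
table of the range, called with mm->page_table_lock held before the zap
proper):

	static void unshare_partial(struct mm_struct *mm, unsigned long address)
	{
		pgd_t *pgd = pgd_offset(mm, address);
		pmd_t *pmd;
		pte_t *ptb;

		if (!(address & ~PMD_MASK))
			return;		/* pmd-aligned boundary: no partial table */
		if (pgd_none(*pgd))
			return;
		pmd = pmd_offset(pgd, address);
		if (pmd_none(*pmd))
			return;
		ptb = pte_offset(pmd, address);
		if (page_count(virt_to_page(ptb)) > 1)
			__pte_alloc(mm, pmd, address, 1);	/* force a private copy */
	}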

The last loose end to take care of is the 'page_table_share_lock' (which
doesn't really have to be global as I've shown it, it just simplifies
the discussion a little and anyway it won't be heavily contended). This
lock will protect zap from being surprised by an unshare in __pte_alloc.
I think it goes something like this:

... we have a newly allocated page table for the unshare ...
if (put_page_testzero(page_table_page))
goto unshared_while_sleeping;

spin_lock(page_table_share_lock);
if (!page_count(page_table_page))
goto unshared_while_locking;

... plug in the new page table ...
... copy in the ptes ...
spin_unlock(page_table_share_lock);
out:
return pte_offset(pmd, address);

unshared_while_locking:
spin_unlock(page_table_share_lock);
unshared_while_sleeping:
get_page(page_table_page)
... give back the newly allocate page table ...
goto out;

This gets subtle: any time put_page_testzero(page_table_page) hits zero we
know that it's exclusively owned by the mm->page_table_lock we hold, and
thus protected from all other decrementers. (Is it really as subtle as I
think it is, or is this normal?)

Note that we have to hold the page_table_share_lock until we're finished
copying in the ptes, otherwise the source could go away. This can turn
into a lock on the page table itself eventually, so whatever contention
there might be will be eliminated.

Fixing up copy_page_range to bring the pmd populate inside the
mm->page_table_lock is trivial, I won't go into it here. With that plus
changes as above, I think it's tight. Though I would not bet my life on
it ;-)

--
Daniel

2002-02-18 11:28:08

by Daniel Phillips

Subject: Re: [RFC] Page table sharing

The latest patch is here:

nl.linux.org/~phillips/public_html/patches/ptab-2.4.17-2

I'll make a first cut at describing the locking strategy now. This won't be
that precise and it might even be wrong, but it's a start and I intend to
refine the description until it is accurate.

Rule: the page table use count counts exactly the number of pmds that
reference the page table.

free_one_pmd:

Needs a spinlock if the use count is higher than one - if it's equal to one
then we know we're covered by the mm->page_table_lock that we hold. It
decrements the use count without taking the page_table_share_lock right now,
so that's a bug.

copy_page_range:

The use count increment on the shared page table is now inside the
mm->page_table_lock. This function only increases the use count. If the use
count is equal to one then the parent's mm owns the page table exclusively
and we hold the parent's mm->page_table_lock, so no extra exclusion is
needed: if the use count is two or more it doesn't matter to anyone that we
increment it asynchronously because only transitions from 1 to 2 are
interesting.

zap_pte_range:

We take the page_table_share_lock before checking the page table's use count.
We must be absolutely sure to catch every shared page table here,
because we must exit and not clear any ptes on the page table - that would
modify somebody else's address space, and that's not allowed. If the use
count is equal to one we know this mm owns the page table exclusively and we
can continue the zap holding just the mm->page_table_lock.

__pte_alloc:

If the unshare parameter is passed then once we determine the page table is
shared we have to drop the mm->page_table_lock and allocate a new page, which
can sleep. When we come back we reacquire the mm->page_table_lock, then take
the page_table_share_lock. At this point the page table use count may have
dropped to as low as one while we slept, but not to zero because we still
have a pointer to it from this pmd. If the page table's use count is still
greater than one we know nobody can reduce it to one because all paths that
decrement the use count are excluded by the page_table_share_lock. We want
to keep it that way, so we hold this lock until the entire unshare is
completed to be sure that the page table doesn't go away or get modified
because the other sharer(s) went away and dropped the use count.
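In outline, the protocol just described is (sketch only; the full version,
including the pte copy loop, is in __pte_alloc in the patch below):

	spin_unlock(&mm->page_table_lock);
	new = pte_alloc_one(mm, address);		/* may sleep */
	spin_lock(&mm->page_table_lock);
	if (!new)
		return NULL;

	spin_lock(&page_table_share_lock);
	ptb_page = virt_to_page(pmd_page(*pmd));
	if (page_count(ptb_page) == 1) {
		/* The other sharer(s) went away while we slept: nothing
		 * to unshare after all, give the new page back. */
		spin_unlock(&page_table_share_lock);
		pte_free_slow(new);
		return pte_offset(pmd, address);
	}
	/* Still shared: the count cannot drop to one behind our back,
	 * because every path that decrements it takes
	 * page_table_share_lock.  Copy the ptes into the new table,
	 * pmd_populate() it, put_page(ptb_page), and only then unlock. */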

--
Daniel

--- ../2.4.17.clean/fs/exec.c Fri Dec 21 12:41:55 2001
+++ ./fs/exec.c Mon Feb 18 09:36:20 2002
@@ -860,6 +860,7 @@
int retval;
int i;

+ ptab(printk(">>> execve %s\n", filename));
file = open_exec(filename);

retval = PTR_ERR(file);
--- ../2.4.17.clean/include/linux/mm.h Fri Dec 21 12:42:03 2001
+++ ./include/linux/mm.h Mon Feb 18 09:36:20 2002
@@ -411,7 +411,7 @@

extern int vmtruncate(struct inode * inode, loff_t offset);
extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int write));
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
@@ -424,6 +424,16 @@

int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+
+static inline pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+ return __pte_alloc(mm, pmd, address, 1);
+}
+
+#define nil do { } while (0)
+#define ptab_off(cmd) nil
+#define ptab_on(cmd) do { if (current->flags & PF_SHARE_TABLES) cmd; } while (0)
+#define ptab ptab_off

/*
* On a two-level page table, this ends up being trivial. Thus the
--- ../2.4.17.clean/include/linux/sched.h Fri Dec 21 12:42:03 2001
+++ ./include/linux/sched.h Mon Feb 18 09:36:20 2002
@@ -427,7 +427,7 @@
#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */
#define PF_FREE_PAGES 0x00002000 /* per process page freeing */
#define PF_NOIO 0x00004000 /* avoid generating further I/O */
-
+#define PF_SHARE_TABLES 0x00008000 /* share page tables (testing) */
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */

/*
--- ../2.4.17.clean/kernel/fork.c Wed Nov 21 13:18:42 2001
+++ ./kernel/fork.c Mon Feb 18 09:36:20 2002
@@ -140,6 +140,7 @@
mm->swap_address = 0;
pprev = &mm->mmap;

+ ptab(printk(">>> dup_mmap\n"));
/*
* Add it to the mmlist after the parent.
* Doing it this way means that we can order the list,
@@ -566,9 +567,10 @@
struct task_struct *p;
struct completion vfork;

+ ptab(printk(">>> fork, stack=%lx\n", stack_start));
retval = -EPERM;

- /*
+ /*
* CLONE_PID is only allowed for the initial SMP swapper
* calls
*/
--- ../2.4.17.clean/kernel/sys.c Tue Sep 18 17:10:43 2001
+++ ./kernel/sys.c Mon Feb 18 09:36:20 2002
@@ -514,6 +514,11 @@
current->uid = new_ruid;
current->user = new_user;
free_uid(old_user);
+
+ if (current->uid == 9999)
+ current->flags |= PF_SHARE_TABLES;
+ printk(">>> user: uid=%i pid=%i pf=%x\n", current->uid, current->pid, current->flags);
+
return 0;
}

--- ../2.4.17.clean/mm/memory.c Fri Dec 21 12:42:05 2001
+++ ./mm/memory.c Mon Feb 18 10:52:50 2002
@@ -34,6 +34,9 @@
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* ([email protected])
+ *
+ * Feb 2002 - Shared page tables added by Daniel Phillips
+ * ([email protected])
*/

#include <linux/mm.h>
@@ -49,6 +52,8 @@
#include <asm/uaccess.h>
#include <asm/tlb.h>

+#define assert(cond) do { if (!(cond)) BUG(); } while (0)
+
unsigned long max_mapnr;
unsigned long num_physpages;
void * high_memory;
@@ -100,8 +105,11 @@
return;
}
pte = pte_offset(dir, 0);
- pmd_clear(dir);
- pte_free(pte);
+ ptab(printk(">>> free page table %p (%i--)\n", pte, page_count(virt_to_page(pte))));
+ if (put_page_testzero(virt_to_page(pte))) {
+ pmd_clear(dir);
+ pte_free_slow(pte);
+ }
}

static inline void free_one_pgd(pgd_t * dir)
@@ -143,8 +151,8 @@
*/
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
- pgd_t * page_dir = mm->pgd;
-
+ pgd_t *page_dir = mm->pgd;
+ ptab(printk(">>> clear_page_tables\n"));
spin_lock(&mm->page_table_lock);
page_dir += first;
do {
@@ -171,13 +179,23 @@
* dst->page_table_lock is held on entry and exit,
* but may be dropped within pmd_alloc() and pte_alloc().
*/
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+spinlock_t page_table_share_lock = SPIN_LOCK_UNLOCKED;
+
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma)
{
- pgd_t * src_pgd, * dst_pgd;
+ pgd_t *src_pgd, *dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+ int share_page_tables = !!(current->flags & PF_SHARE_TABLES);
+
+#if 0
+ static int teststart = 2, testcount = 99, tests = 0;
+ if (share_page_tables && (tests++ < teststart || tests > teststart + testcount))
+ share_page_tables = 0;
+ if (share_page_tables)
+ printk(">>> copy_page_range test %i\n", tests - 1);
+#endif

src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -186,15 +204,15 @@
pmd_t * src_pmd, * dst_pmd;

src_pgd++; dst_pgd++;
-
+
/* copy_pmd_range */
-
+
if (pgd_none(*src_pgd))
- goto skip_copy_pmd_range;
+ goto skip_pmd_range;
if (pgd_bad(*src_pgd)) {
pgd_ERROR(*src_pgd);
pgd_clear(src_pgd);
-skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
+skip_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
if (!address || (address >= end))
goto out;
continue;
@@ -204,34 +222,58 @@
dst_pmd = pmd_alloc(dst, dst_pgd, address);
if (!dst_pmd)
goto nomem;
-
do {
- pte_t * src_pte, * dst_pte;
-
- /* copy_pte_range */
-
+ pte_t *src_ptb, *dst_ptb;
+
if (pmd_none(*src_pmd))
- goto skip_copy_pte_range;
+ goto skip_ptb_range;
if (pmd_bad(*src_pmd)) {
pmd_ERROR(*src_pmd);
pmd_clear(src_pmd);
-skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
+skip_ptb_range: address = (address + PMD_SIZE) & PMD_MASK;
if (address >= end)
goto out;
- goto cont_copy_pmd_range;
+ goto cont_pmd_range;
}

- src_pte = pte_offset(src_pmd, address);
- dst_pte = pte_alloc(dst, dst_pmd, address);
- if (!dst_pte)
+ src_ptb = pte_offset(src_pmd, address);
+
+ if (!share_page_tables) goto no_share;
+
+ if (pmd_none(*dst_pmd)) {
+ spin_lock(&src->page_table_lock);
+ get_page(virt_to_page(src_ptb));
+ pmd_populate(dst, dst_pmd, ((ulong) src_ptb & PAGE_MASK));
+ ptab(printk(">>> share %p @ %p (++%i)\n", src_ptb, address, page_count(virt_to_page(src_ptb))));
+ goto share;
+ }
+ if ((pmd_val(*src_pmd) & PAGE_MASK) != ((pmd_val(*dst_pmd) & PAGE_MASK)))
+ goto no_share;
+ spin_lock(&src->page_table_lock);
+share:
+ do {
+ pte_t pte = *src_ptb;
+ if (!pte_none(pte) && pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page) && cow)
+ ptep_set_wrprotect(src_ptb);
+ }
+ if ((address += PAGE_SIZE) >= end)
+ goto out_unlock;
+ src_ptb++;
+ } while ((ulong) src_ptb & PTE_TABLE_MASK);
+ spin_unlock(&src->page_table_lock);
+
+ goto cont_pmd_range;
+no_share:
+ dst_ptb = __pte_alloc(dst, dst_pmd, address, 0);
+ if (!dst_ptb)
goto nomem;

- spin_lock(&src->page_table_lock);
+ spin_lock(&src->page_table_lock);
do {
- pte_t pte = *src_pte;
+ pte_t pte = *src_ptb;
struct page *ptepage;
-
- /* copy_one_pte */

if (pte_none(pte))
goto cont_copy_pte_range_noset;
@@ -240,14 +282,14 @@
goto cont_copy_pte_range;
}
ptepage = pte_page(pte);
- if ((!VALID_PAGE(ptepage)) ||
+ if ((!VALID_PAGE(ptepage)) ||
PageReserved(ptepage))
goto cont_copy_pte_range;

/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow) {
- ptep_set_wrprotect(src_pte);
- pte = *src_pte;
+ ptep_set_wrprotect(src_ptb);
+ pte = *src_ptb;
}

/* If it's a shared mapping, mark it clean in the child */
@@ -257,16 +299,16 @@
get_page(ptepage);
dst->rss++;

-cont_copy_pte_range: set_pte(dst_pte, pte);
+cont_copy_pte_range: set_pte(dst_ptb, pte);
cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out_unlock;
- src_pte++;
- dst_pte++;
- } while ((unsigned long)src_pte & PTE_TABLE_MASK);
+ src_ptb++;
+ dst_ptb++;
+ } while ((ulong) src_ptb & PTE_TABLE_MASK);
spin_unlock(&src->page_table_lock);
-
-cont_copy_pmd_range: src_pmd++;
+
+cont_pmd_range: src_pmd++;
dst_pmd++;
} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
}
@@ -302,7 +344,18 @@
pmd_clear(pmd);
return 0;
}
+
ptep = pte_offset(pmd, address);
+
+ spin_lock(&page_table_share_lock);
+ if (page_count(virt_to_page(ptep)) > 1) {
+ ptab(printk(">>> zap table!!! %p (%i)\n", ptep, page_count(virt_to_page(ptep))));
+ tlb_remove_page(tlb, (pte_t *) pmd, pmd_val(*pmd));
+ spin_unlock(&page_table_share_lock);
+ return 0;
+ }
+ spin_unlock(&page_table_share_lock);
+
offset = address & ~PMD_MASK;
if (offset + size > PMD_SIZE)
size = PMD_SIZE - offset;
+static int unshare_one(struct mm_struct *mm, pgd_t *dir, unsigned long address)
+{
+ if ((address & PMD_MASK)) {
+ address &= PMD_MASK;
+ if (!pgd_none(*dir)) {
+ pmd_t *pmd = pmd_offset(dir, address);
+ if (!pmd_none(*pmd)) {
+ pte_t *ptb = pte_offset(pmd, address);
+ if (page_count(virt_to_page(ptb)) > 1) {
+ __pte_alloc(mm, pmd, address, 1);
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
/*
* remove user pages in a given range.
*/
@@ -361,6 +432,7 @@
pgd_t * dir;
unsigned long start = address, end = address + size;
int freed = 0;
+ ptab_off(printk(">>> zap_page_range %lx+%lx\n", address, size));

dir = pgd_offset(mm, address);

@@ -374,6 +446,14 @@
if (address >= end)
BUG();
spin_lock(&mm->page_table_lock);
+
+ /*
+ * Ensure first and last partial page tables are unshared
+ */
+ do {
+ unshare_one(mm, dir, address);
+ } while (unshare_one(mm, pgd_offset(mm, end & PMD_MASK), end));
+
flush_cache_range(mm, address, end);
tlb = tlb_gather_mmu(mm);

@@ -1348,7 +1428,8 @@
pmd = pmd_alloc(mm, pgd, address);

if (pmd) {
- pte_t * pte = pte_alloc(mm, pmd, address);
+ pte_t *pte = __pte_alloc(mm, pmd, address,
+ write_access /*&& !(vma->vm_flags & VM_SHARED)*/);
if (pte)
return handle_pte_fault(mm, vma, address, write_access, pte);
}
@@ -1398,33 +1479,73 @@
* We've already handled the fast-path in-line, and we own the
* page table lock.
*/
-pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t *__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int unshare)
{
- if (pmd_none(*pmd)) {
- pte_t *new;
+ struct page *ptb_page;
+ pte_t *new, *src_ptb, *dst_ptb;

- /* "fast" allocation can happen without dropping the lock.. */
- new = pte_alloc_one_fast(mm, address);
- if (!new) {
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
+ if (pmd_none(*pmd)) {
+ spin_unlock(&mm->page_table_lock);
+ new = pte_alloc_one(mm, address);
+ spin_lock(&mm->page_table_lock);
+ if (!new) return NULL;

- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (!pmd_none(*pmd)) {
- pte_free(new);
- goto out;
- }
- }
+ /* Populated while unlocked? */
+ if (!pmd_none(*pmd))
+ goto unshared_free;
pmd_populate(mm, pmd, new);
+unshared: return pte_offset(pmd, address);
}
-out:
- return pte_offset(pmd, address);
+ if (!unshare || page_count(virt_to_page(pmd_page(*pmd))) == 1)
+ goto unshared;
+ spin_unlock(&mm->page_table_lock);
+ new = pte_alloc_one(mm, address);
+ spin_lock(&mm->page_table_lock);
+ if (!new) return NULL;
+
+ assert(page_count(virt_to_page(new)) == 1);
+ spin_lock(&page_table_share_lock);
+ ptb_page = virt_to_page(pmd_page(*pmd));
+ if (page_count(ptb_page) == 1)
+ goto unshared_unlock;
+
+ ptab(printk(">>> make page table %p @ %p %s\n", new, address,
+ unshare == 2? "write fault":
+ unshare == 1? "unshared":
+ "bogus"));
+
+ src_ptb = pte_offset(pmd, 0);
+ dst_ptb = new;
+ ptab(printk(">>> unshare %p (%i--)\n", *pmd, page_count(ptb_page)));
+ // assert(ptb_page->flags & PG_shared_debug);
+ do {
+ pte_t pte = *src_ptb;
+ if (!pte_none(pte)) {
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page)) {
+ get_page(page);
+ pte = pte_mkold(pte_mkclean(pte));
+ mm->rss++;
+ }
+ } else
+ swap_duplicate(pte_to_swp_entry(pte));
+ set_pte(dst_ptb, pte);
+ }
+ src_ptb++;
+ dst_ptb++;
+ } while ((ulong) dst_ptb & PTE_TABLE_MASK);
+ pmd_populate(mm, pmd, new);
+ put_page(ptb_page);
+ spin_unlock(&page_table_share_lock);
+ goto unshared;
+
+unshared_unlock:
+ get_page(ptb_page);
+ spin_unlock(&page_table_share_lock);
+unshared_free:
+ pte_free_slow(new);
+ goto unshared;
}

int make_pages_present(unsigned long addr, unsigned long end)
--- ../2.4.17.clean/mm/mremap.c Thu Sep 20 23:31:26 2001
+++ ./mm/mremap.c Mon Feb 18 09:36:20 2002
@@ -92,6 +92,7 @@
{
unsigned long offset = len;

+ ptab(printk(">>> mremap\n"));
flush_cache_range(mm, old_addr, old_addr + len);

/*

2002-02-18 19:07:22

by Hugh Dickins

Subject: Re: [RFC] Page table sharing

On Mon, 18 Feb 2002, Daniel Phillips wrote:
> On February 18, 2002 09:09 am, Hugh Dickins wrote:
>
> > So how is the page_table_lock taken by swap_out effective when it's
> > dealing with a page table shared by another mm than the one it is
> > locking? And when handling a read-fault, again no such code (but
> > when handling a write-fault, __pte_alloc has unshared in advance).
> >
> > Since copy_page_range would not copy shared page tables, I'm wrong to
> > point there. But __pte_alloc does copy shared page tables (to unshare
> > them), and needs them to be stable while it does so: so locking against
> > swap_out really is required. It also needs locking against read faults,
> > and they against each other: but there I imagine it's just a matter of
> > dropping the write arg to __pte_alloc, going back to pte_alloc again.
>
> You're right about the read faults, wrong about swap_out. In general you've
> been more right than wrong, so thanks. I'll post a new patch pretty soon and
> I'd appreciate your comments.

On the read faults: I see no change there in the patch you then posted,
handle_mm_fault still calls __pte_alloc with write_access argument, so
concurrent read faults on the same pte can still slot the page into the
shared page table at the same time, doubly counting it - no problem if
it's the Reserved empty_zero_page, and I think no problem at present
if it's a SwapCache page, since that is PageLocked in the current tree
(but not in -aa, and in due course we should go Andrea's way there);
but if it's a file page the double count will leave it unfreeable.

On swap_out versus __pte_alloc: I was misreading it and you're almost
right there: but you do need to change that "pte_t pte = *src_ptb;"
to something atomic - hmm, do we have any primitive for doing that?
neither set_pte nor ptep_get_and_clear is right. Otherwise, on PAE
HIGHMEM64G systems the two halves of "pte" could be assigned before
and after try_to_swap_out's ptep_get_and_clear. But once you've got
"pte", yes, you're basing all your decisions on your one local copy,
that gives all the stability you need.
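(For illustration only, here is roughly what such a primitive might look
like on PAE, using a made-up get_pte_atomic() name: it simply re-reads the
pte until it sees the same 64-bit value twice, so it cannot return a value
torn across a concurrent ptep_get_and_clear. A sketch of the idea only,
not a proposal for the final interface.)

static inline pte_t get_pte_atomic(pte_t *ptep)
{
        pte_t a, b;

        do {
                a = *ptep;      /* may be torn on PAE */
                rmb();          /* keep the two reads distinct */
                b = *ptep;
        } while (pte_val(a) != pte_val(b));
        return a;
}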

Hugh

(By the way, the patch you appended to your next mail did not apply
- I think you'd hand-edited an incidental irrelevant cleanup out of
the patch to memory.c, without adjusting its line counts; and also
had to edit "public_html/" out of the URL you gave.)

2002-02-18 23:33:46

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 18, 2002 08:04 pm, Hugh Dickins wrote:
> (By the way, the patch you appended to your next mail did not apply
> - I think you'd hand-edited an incidental irrelevant cleanup out of
> the patch to memory.c, without adjusting its line counts; and also
> had to edit "public_html/" out of the URL you gave.)

Thanks, here it is again. This time I left the gratuitous whitespace
cleanup in as the route of least resistance ;-)

nl.linux.org/~phillips/patches/ptab-2.4.17-2

--
Daniel

--- ../2.4.17.clean/fs/exec.c Fri Dec 21 12:41:55 2001
+++ ./fs/exec.c Mon Feb 18 22:57:10 2002
@@ -860,6 +860,7 @@
int retval;
int i;

+ ptab(printk(">>> execve %s\n", filename));
file = open_exec(filename);

retval = PTR_ERR(file);
--- ../2.4.17.clean/include/linux/mm.h Fri Dec 21 12:42:03 2001
+++ ./include/linux/mm.h Mon Feb 18 22:57:10 2002
@@ -411,7 +411,7 @@

extern int vmtruncate(struct inode * inode, loff_t offset);
extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int write));
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
@@ -424,6 +424,16 @@

int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+
+static inline pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+ return __pte_alloc(mm, pmd, address, 1);
+}
+
+#define nil do { } while (0)
+#define ptab_off(cmd) nil
+#define ptab_on(cmd) do { if (current->flags & PF_SHARE_TABLES) cmd; } while (0)
+#define ptab ptab_off

/*
* On a two-level page table, this ends up being trivial. Thus the
--- ../2.4.17.clean/include/linux/sched.h Fri Dec 21 12:42:03 2001
+++ ./include/linux/sched.h Mon Feb 18 22:57:10 2002
@@ -427,7 +427,7 @@
#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */
#define PF_FREE_PAGES 0x00002000 /* per process page freeing */
#define PF_NOIO 0x00004000 /* avoid generating further I/O */
-
+#define PF_SHARE_TABLES 0x00008000 /* share page tables (testing) */
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */

/*
--- ../2.4.17.clean/kernel/fork.c Wed Nov 21 13:18:42 2001
+++ ./kernel/fork.c Mon Feb 18 22:57:10 2002
@@ -140,6 +140,7 @@
mm->swap_address = 0;
pprev = &mm->mmap;

+ ptab(printk(">>> dup_mmap\n"));
/*
* Add it to the mmlist after the parent.
* Doing it this way means that we can order the list,
@@ -566,9 +567,10 @@
struct task_struct *p;
struct completion vfork;

+ ptab(printk(">>> fork, stack=%lx\n", stack_start));
retval = -EPERM;

- /*
+ /*
* CLONE_PID is only allowed for the initial SMP swapper
* calls
*/
--- ../2.4.17.clean/kernel/sys.c Tue Sep 18 17:10:43 2001
+++ ./kernel/sys.c Mon Feb 18 22:57:10 2002
@@ -514,6 +514,11 @@
current->uid = new_ruid;
current->user = new_user;
free_uid(old_user);
+
+ if (current->uid == 9999)
+ current->flags |= PF_SHARE_TABLES;
+ printk(">>> user: uid=%i pid=%i pf=%x\n", current->uid, current->pid, current->flags);
+
return 0;
}

--- ../2.4.17.clean/mm/memory.c Fri Dec 21 12:42:05 2001
+++ ./mm/memory.c Mon Feb 18 23:14:44 2002
@@ -34,6 +34,9 @@
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* ([email protected])
+ *
+ * Feb 2002 - Shared page tables added by Daniel Phillips
+ * ([email protected])
*/

#include <linux/mm.h>
@@ -49,6 +52,8 @@
#include <asm/uaccess.h>
#include <asm/tlb.h>

+#define assert(cond) do { if (!(cond)) BUG(); } while (0)
+
unsigned long max_mapnr;
unsigned long num_physpages;
void * high_memory;
@@ -100,8 +105,11 @@
return;
}
pte = pte_offset(dir, 0);
- pmd_clear(dir);
- pte_free(pte);
+ ptab(printk(">>> free page table %p (%i--)\n", pte, page_count(virt_to_page(pte))));
+ if (put_page_testzero(virt_to_page(pte))) {
+ pmd_clear(dir);
+ pte_free_slow(pte);
+ }
}

static inline void free_one_pgd(pgd_t * dir)
@@ -143,8 +151,8 @@
*/
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
- pgd_t * page_dir = mm->pgd;
-
+ pgd_t *page_dir = mm->pgd;
+ ptab(printk(">>> clear_page_tables\n"));
spin_lock(&mm->page_table_lock);
page_dir += first;
do {
@@ -171,13 +179,23 @@
* dst->page_table_lock is held on entry and exit,
* but may be dropped within pmd_alloc() and pte_alloc().
*/
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+spinlock_t page_table_share_lock = SPIN_LOCK_UNLOCKED;
+
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma)
{
- pgd_t * src_pgd, * dst_pgd;
+ pgd_t *src_pgd, *dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+ int share_page_tables = !!(current->flags & PF_SHARE_TABLES);
+
+#if 0
+ static int teststart = 2, testcount = 99, tests = 0;
+ if (share_page_tables && (tests++ < teststart || tests > teststart + testcount))
+ share_page_tables = 0;
+ if (share_page_tables)
+ printk(">>> copy_page_range test %i\n", tests - 1);
+#endif

src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -186,15 +204,15 @@
pmd_t * src_pmd, * dst_pmd;

src_pgd++; dst_pgd++;
-
+
/* copy_pmd_range */
-
+
if (pgd_none(*src_pgd))
- goto skip_copy_pmd_range;
+ goto skip_pmd_range;
if (pgd_bad(*src_pgd)) {
pgd_ERROR(*src_pgd);
pgd_clear(src_pgd);
-skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
+skip_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
if (!address || (address >= end))
goto out;
continue;
@@ -204,34 +222,58 @@
dst_pmd = pmd_alloc(dst, dst_pgd, address);
if (!dst_pmd)
goto nomem;
-
do {
- pte_t * src_pte, * dst_pte;
-
- /* copy_pte_range */
-
+ pte_t *src_ptb, *dst_ptb;
+
if (pmd_none(*src_pmd))
- goto skip_copy_pte_range;
+ goto skip_ptb_range;
if (pmd_bad(*src_pmd)) {
pmd_ERROR(*src_pmd);
pmd_clear(src_pmd);
-skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
+skip_ptb_range: address = (address + PMD_SIZE) & PMD_MASK;
if (address >= end)
goto out;
- goto cont_copy_pmd_range;
+ goto cont_pmd_range;
}

- src_pte = pte_offset(src_pmd, address);
- dst_pte = pte_alloc(dst, dst_pmd, address);
- if (!dst_pte)
+ src_ptb = pte_offset(src_pmd, address);
+
+ if (!share_page_tables) goto no_share;
+
+ if (pmd_none(*dst_pmd)) {
+ spin_lock(&src->page_table_lock);
+ get_page(virt_to_page(src_ptb));
+ pmd_populate(dst, dst_pmd, ((ulong) src_ptb & PAGE_MASK));
+ ptab(printk(">>> share %p @ %p (++%i)\n", src_ptb, address, page_count(virt_to_page(src_ptb))));
+ goto share;
+ }
+ if ((pmd_val(*src_pmd) & PAGE_MASK) != ((pmd_val(*dst_pmd) & PAGE_MASK)))
+ goto no_share;
+ spin_lock(&src->page_table_lock);
+share:
+ do {
+ pte_t pte = *src_ptb;
+ if (!pte_none(pte) && pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page) && cow)
+ ptep_set_wrprotect(src_ptb);
+ }
+ if ((address += PAGE_SIZE) >= end)
+ goto out_unlock;
+ src_ptb++;
+ } while ((ulong) src_ptb & PTE_TABLE_MASK);
+ spin_unlock(&src->page_table_lock);
+
+ goto cont_pmd_range;
+no_share:
+ dst_ptb = __pte_alloc(dst, dst_pmd, address, 0);
+ if (!dst_ptb)
goto nomem;

- spin_lock(&src->page_table_lock);
+ spin_lock(&src->page_table_lock);
do {
- pte_t pte = *src_pte;
+ pte_t pte = *src_ptb;
struct page *ptepage;
-
- /* copy_one_pte */

if (pte_none(pte))
goto cont_copy_pte_range_noset;
@@ -240,14 +282,14 @@
goto cont_copy_pte_range;
}
ptepage = pte_page(pte);
- if ((!VALID_PAGE(ptepage)) ||
+ if ((!VALID_PAGE(ptepage)) ||
PageReserved(ptepage))
goto cont_copy_pte_range;

/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow) {
- ptep_set_wrprotect(src_pte);
- pte = *src_pte;
+ ptep_set_wrprotect(src_ptb);
+ pte = *src_ptb;
}

/* If it's a shared mapping, mark it clean in the child */
@@ -257,16 +299,16 @@
get_page(ptepage);
dst->rss++;

-cont_copy_pte_range: set_pte(dst_pte, pte);
+cont_copy_pte_range: set_pte(dst_ptb, pte);
cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out_unlock;
- src_pte++;
- dst_pte++;
- } while ((unsigned long)src_pte & PTE_TABLE_MASK);
+ src_ptb++;
+ dst_ptb++;
+ } while ((ulong) src_ptb & PTE_TABLE_MASK);
spin_unlock(&src->page_table_lock);
-
-cont_copy_pmd_range: src_pmd++;
+
+cont_pmd_range: src_pmd++;
dst_pmd++;
} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
}
@@ -288,7 +330,6 @@
BUG();
}
}
-
static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
{
unsigned long offset;
@@ -299,10 +340,21 @@
return 0;
if (pmd_bad(*pmd)) {
pmd_ERROR(*pmd);
- pmd_clear(pmd);
+ pmd_clear(pmd);
return 0;
}
+
ptep = pte_offset(pmd, address);
+
+ spin_lock(&page_table_share_lock);
+ if (page_count(virt_to_page(ptep)) > 1) {
+ ptab(printk(">>> zap table!!! %p (%i)\n", ptep, page_count(virt_to_page(ptep))));
+ tlb_remove_page(tlb, (pte_t *) pmd, pmd_val(*pmd));
+ spin_unlock(&page_table_share_lock);
+ return 0;
+ }
+ spin_unlock(&page_table_share_lock);
+
offset = address & ~PMD_MASK;
if (offset + size > PMD_SIZE)
size = PMD_SIZE - offset;
@@ -352,6 +404,24 @@
return freed;
}

+static int unshare_one(struct mm_struct *mm, pgd_t *dir, unsigned long address)
+{
+ if ((address & PMD_MASK)) {
+ address &= PMD_MASK;
+ if (!pgd_none(*dir)) {
+ pmd_t *pmd = pmd_offset(dir, address);
+ if (!pmd_none(*pmd)) {
+ pte_t *ptb = pte_offset(pmd, address);
+ if (page_count(virt_to_page(ptb)) > 1) {
+ __pte_alloc(mm, pmd, address, 1);
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
/*
* remove user pages in a given range.
*/
@@ -374,6 +444,14 @@
if (address >= end)
BUG();
spin_lock(&mm->page_table_lock);
+
+ /*
+ * Ensure first and last partial page tables are unshared
+ */
+ do {
+ unshare_one(mm, dir, address);
+ } while (unshare_one(mm, pgd_offset(mm, end & PMD_MASK), end));
+
flush_cache_range(mm, address, end);
tlb = tlb_gather_mmu(mm);

@@ -517,7 +595,7 @@

mm = current->mm;
dprintk ("map_user_kiobuf: begin\n");
-
+
pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE;
/* mapping 0 bytes is not permitted */
if (!pgcount) BUG();
@@ -1398,33 +1476,73 @@
* We've already handled the fast-path in-line, and we own the
* page table lock.
*/
-pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t *__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int unshare)
{
- if (pmd_none(*pmd)) {
- pte_t *new;
+ struct page *ptb_page;
+ pte_t *new, *src_ptb, *dst_ptb;

- /* "fast" allocation can happen without dropping the lock.. */
- new = pte_alloc_one_fast(mm, address);
- if (!new) {
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
+ if (pmd_none(*pmd)) {
+ spin_unlock(&mm->page_table_lock);
+ new = pte_alloc_one(mm, address);
+ spin_lock(&mm->page_table_lock);
+ if (!new) return NULL;

- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (!pmd_none(*pmd)) {
- pte_free(new);
- goto out;
- }
- }
+ /* Populated while unlocked? */
+ if (!pmd_none(*pmd))
+ goto unshared_free;
pmd_populate(mm, pmd, new);
+unshared: return pte_offset(pmd, address);
}
-out:
- return pte_offset(pmd, address);
+ if (!unshare || page_count(virt_to_page(pmd_page(*pmd))) == 1)
+ goto unshared;
+ spin_unlock(&mm->page_table_lock);
+ new = pte_alloc_one(mm, address);
+ spin_lock(&mm->page_table_lock);
+ if (!new) return NULL;
+
+ assert(page_count(virt_to_page(new)) == 1);
+ spin_lock(&page_table_share_lock);
+ ptb_page = virt_to_page(pmd_page(*pmd));
+ if (page_count(ptb_page) == 1)
+ goto unshared_unlock;
+
+ ptab(printk(">>> make page table %p @ %p %s\n", new, address,
+ unshare == 2? "write fault":
+ unshare == 1? "unshared":
+ "bogus"));
+
+ src_ptb = pte_offset(pmd, 0);
+ dst_ptb = new;
+ ptab(printk(">>> unshare %p (%i--)\n", *pmd, page_count(ptb_page)));
+ // assert(ptb_page->flags & PG_shared_debug);
+ do {
+ pte_t pte = *src_ptb;
+ if (!pte_none(pte)) {
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page)) {
+ get_page(page);
+ pte = pte_mkold(pte_mkclean(pte));
+ mm->rss++;
+ }
+ } else
+ swap_duplicate(pte_to_swp_entry(pte));
+ set_pte(dst_ptb, pte);
+ }
+ src_ptb++;
+ dst_ptb++;
+ } while ((ulong) dst_ptb & PTE_TABLE_MASK);
+ pmd_populate(mm, pmd, new);
+ put_page(ptb_page);
+ spin_unlock(&page_table_share_lock);
+ goto unshared;
+
+unshared_unlock:
+ get_page(ptb_page);
+ spin_unlock(&page_table_share_lock);
+unshared_free:
+ pte_free_slow(new);
+ goto unshared;
}

int make_pages_present(unsigned long addr, unsigned long end)
--- ../2.4.17.clean/mm/mremap.c Thu Sep 20 23:31:26 2001
+++ ./mm/mremap.c Mon Feb 18 22:57:54 2002
@@ -92,6 +92,7 @@
{
unsigned long offset = len;

+ ptab(printk(">>> mremap\n"));
flush_cache_range(mm, old_addr, old_addr + len);

/*

2002-02-18 23:44:17

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 18, 2002 08:04 pm, Hugh Dickins wrote:
> On Mon, 18 Feb 2002, Daniel Phillips wrote:
> > On February 18, 2002 09:09 am, Hugh Dickins wrote:
> > > So how is the page_table_lock taken by swap_out effective when it's
> > > dealing with a page table shared by another mm than the one it is
> > > locking? And when handling a read-fault, again no such code (but
> > > when handling a write-fault, __pte_alloc has unshared in advance).
> > >
> > > Since copy_page_range would not copy shared page tables, I'm wrong to
> > > point there. But __pte_alloc does copy shared page tables (to unshare
> > > them), and needs them to be stable while it does so: so locking against
> > > swap_out really is required. It also needs locking against read faults,
> > > and they against each other: but there I imagine it's just a matter of
> > > dropping the write arg to __pte_alloc, going back to pte_alloc again.
> >
> > You're right about the read faults, wrong about swap_out. In general you've
> > been more right than wrong, so thanks. I'll post a new patch pretty soon and
> > I'd appreciate your comments.
>
> On the read faults: I see no change there in the patch you then posted,
> handle_mm_fault still calls __pte_alloc with write_access argument, so
> concurrent read faults on the same pte can still slot the page into the
> shared page table at the same time, doubly counting it.

Right. Oops. Let me contemplate for a moment.

> - no problem if
> it's the Reserved empty_zero_page, and I think no problem at present
> if it's a SwapCache page, since that is PageLocked in the current tree
> (but not in -aa, and in due course we should go Andrea's way there);
> but if it's a file page the double count will leave it unfreeable.
>
> On swap_out versus __pte_alloc: I was misreading it and you're almost
> right there: but you do need to change that "pte_t pte = *src_ptb;"
> to something atomic - hmm, do we have any primitive for doing that?

I guess we need one now.

> neither set_pte nor ptep_get_and_clear is right. Otherwise, on PAE
> HIGHMEM64G systems the two halves of "pte" could be assigned before
> and after try_to_swap_out's ptep_get_and_clear. But once you've got
> "pte", yes, you're basing all your decisions on your one local copy,
> that gives all the stability you need.

Thanks a lot, let me digest this and I'll close that hole shortly, or
feel free to suggest a fix. You just explained the memory leak I'm
seeing.

--
Daniel

2002-02-18 23:56:53

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

Correction, the latest patch is here:

nl.linux.org/~phillips/patches/ptab-2.4.17-2

--
Daniel

2002-02-18 23:54:54

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 18, 2002 08:04 pm, Hugh Dickins wrote:
> On Mon, 18 Feb 2002, Daniel Phillips wrote:
> > On February 18, 2002 09:09 am, Hugh Dickins wrote:
> > > Since copy_page_range would not copy shared page tables, I'm wrong to
> > > point there. But __pte_alloc does copy shared page tables (to unshare
> > > them), and needs them to be stable while it does so: so locking against
> > > swap_out really is required. It also needs locking against read faults,
> > > and they against each other: but there I imagine it's just a matter of
> > > dropping the write arg to __pte_alloc, going back to pte_alloc again.

I'm not sure what you mean here, you're not suggesting we should unshare the
page table on read fault are you?

--
Daniel

2002-02-19 00:02:13

by Hugh Dickins

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On Tue, 19 Feb 2002, Daniel Phillips wrote:
> On February 18, 2002 08:04 pm, Hugh Dickins wrote:
> > On Mon, 18 Feb 2002, Daniel Phillips wrote:
> > > On February 18, 2002 09:09 am, Hugh Dickins wrote:
> > > > Since copy_page_range would not copy shared page tables, I'm wrong to
> > > > point there. But __pte_alloc does copy shared page tables (to unshare
> > > > them), and needs them to be stable while it does so: so locking against
> > > > swap_out really is required. It also needs locking against read faults,
> > > > and they against each other: but there I imagine it's just a matter of
> > > > dropping the write arg to __pte_alloc, going back to pte_alloc again.
>
> I'm not sure what you mean here, you're not suggesting we should unshare the
> page table on read fault are you?

I am. But I can understand that you'd prefer not to do it that way.
Hugh

2002-02-19 00:23:35

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 01:03 am, Hugh Dickins wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> > On February 18, 2002 08:04 pm, Hugh Dickins wrote:
> > > On Mon, 18 Feb 2002, Daniel Phillips wrote:
> > > > On February 18, 2002 09:09 am, Hugh Dickins wrote:
> > > > > Since copy_page_range would not copy shared page tables, I'm wrong to
> > > > > point there. But __pte_alloc does copy shared page tables (to unshare
> > > > > them), and needs them to be stable while it does so: so locking against
> > > > > swap_out really is required. It also needs locking against read faults,
> > > > > and they against each other: but there I imagine it's just a matter of
> > > > > dropping the write arg to __pte_alloc, going back to pte_alloc again.
> >
> > I'm not sure what you mean here, you're not suggesting we should unshare the
> > page table on read fault are you?
>
> I am. But I can understand that you'd prefer not to do it that way.
> Hugh

No, that's not nearly studly enough ;-)

Since we have gone to all the trouble of sharing the page table, we should
swap in/out for all sharers at the same time. That is, keep it shared, saving
memory and cpu.

Now I finally see what you were driving at: before, we could count on the
mm->page_table_lock for exclusion on read fault, now we can't, at least not
when ptb->count is greater than one[1]. So let's come up with something nice as
a substitute, any suggestions?

[1] I think that's a big, broad hint.

--
Daniel

2002-02-19 00:58:44

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] Page table sharing



On Tue, 19 Feb 2002, Daniel Phillips wrote:
>
> Thanks, here it is again. This time I left the gratuitous whitespace
> cleanup in as the route of least resistance ;-)

Daniel, there's something wrong in the locking.

I can see _no_ reason to have "page_table_share_lock". What is the point
of that one?

Whenever we end up touching the pmd counts, we already hold the local
mm->page_table_lock. That means that we are always guaranteed to have a
count of at least one when we start out on it.

We have two cases:
(a) sharing a new pmd
(b) unsharing

So let's go through the cases.

(a) Sharing
- test old count. It is either 1 or not.
- if old count > 1, then we know it was already marked read-only,
and we have nothing to do but to increment the count.
- if the old count == 1, then we are the only users and nobody
else can even try to add sharing due to mm->page_table_lock.
So just mark the thing read-only, invalidate the current TLB,
and increment the count.

Do you see any reason to lock anything?

(b) Unsharing
- test old count. It is either 1 or not.
- if old-count > 1, then we know somebody else might unshare in
parallel, but that doesn't matter. Everybody does:

- allocate new pmd
- copy over into new pmd and mark writable, increment page
counts.
- install new pmd
- unuse_pmd() on old pmd, ie:
if (put_page_testzero(pmd_page)) {
for-each-page()
free the pages
__free_pages_ok(pmd_page);
}

- if old-count == 1, we know we're exclusive, and nobody else can
ever get at it, we just mark everything writable and do not play
any games at all with the count.

Do you see any reason to lock anything?
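
Putting (a) and (b) together as code, very roughly (hypothetical helper
names, purely illustrative, not a patch):

/* Caller holds its own mm->page_table_lock.  make_ptes_writable(),
 * copy_ptes_writable() and free_shared_ptes() are made-up names for
 * the steps described above. */
static pte_t *unshare_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
        struct page *old = virt_to_page(pmd_page(*pmd));
        pte_t *new;

        if (page_count(old) == 1) {
                /* exclusive: nobody else can reach it */
                make_ptes_writable(pmd);
                return pte_offset(pmd, address);
        }
        /* real code must drop and retake the spinlock around this,
         * and re-check the pmd afterwards, as __pte_alloc does */
        new = pte_alloc_one(mm, address);
        if (!new)
                return NULL;
        copy_ptes_writable(new, pte_offset(pmd, 0));    /* bumps page counts */
        pmd_populate(mm, pmd, new);
        if (put_page_testzero(old))
                free_shared_ptes(old);  /* ours was the last reference */
        return pte_offset(pmd, address);
}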

In short, I do not believe that that lock is needed. And if it isn't
needed, it is _incorrect_. Locking that doesn't buy you anything is not
just a performance overhead, it's bad thinking.

Look at all the potential races:

- share+share:
no races. Either the page started out exclusive (in which case
only one user can see it at 1), or it started out shared (in which
case all users see it > 1).

- share+unshare:
This implies that the count _must_ have been at least
"1+(nr-of-unsharers)", as one user is obviously adding sharing
(the "1"), and everybody else must be have it shared. (And we will
end up with a count of at least 2 after the share, because of the
sharer)

Regardless of _what_ the order is, the unsharers must have seen at
least a count of 2 to start with (their own + the one who is
about to share or has just shared). So all unsharers will clearly
allocate a new page, and will end up with

if (put_page_testzero(pmd_page)) {
...
}

where the "testzero" case will never happen.

- unshare+unshare:
This is the only halfway interesting case. You might have both
unsharers seeing the count > 1, in which case _both_ will do the
copy, and both will do the "put_page_testzero()".

However, only one will actually end up triggering the
"put_page_testzero()", and by the time that happens, the pmd is
totally exclusive, and the only thing it should do (and does) is
to just go through the pmd and decrement the counts for the pte
entries.

Note that no way can both unsharers see a count of 1 - the
unsharers both see at least their own counts, and the other
unsharer's count will only be decremented after it has already
copied the page away into its private storage.

The other possibility is that one sees a count > 1, and gets to
copy and decrement it before the other racer even sees the count,
so the other one sees the 1, and won't bother with the copy.
That's likely to be the common case (serialized by other things),
and is the case we want.

Does anybody see any reason why this doesn't work totally without the
lock?

Linus

2002-02-19 01:06:44

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 02:01 am, Mark Hahn wrote:
> > [benchmarks]
> >
> > (Look at the last one, the nonshared fork forces the system into swap. I ran
> > it twice to verify, the second time from a clean reboot. This is another
> > reason why shared page tables are good.)
>
> that's really nice. http://hahn.mcmaster.ca/~hahn/shpg.png
> if you like graphs. but did you swap the last two sets,
> or does a reboot actually make it take longer?

I wouldn't read too much into the variance, swapping is fairly brain-damaged at
the moment. Please stand by for an update after we kick around the question of
how to repair the swap locking and we'll get better numbers. They are going to
be in the same ballpark, hopefully with less variance. Thanks a lot for the
graph.

(I took the liberty of replying to the kernel list.)

--
Daniel

2002-02-19 01:19:16

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 01:56 am, Linus Torvalds wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> >
> > Thanks, here it is again. This time I left the gratuitous whitespace
> > cleanup in as the route of least resistance ;-)
>
> Daniel, there's something wrong in the locking.
>
> I can see _no_ reason to have "page_table_share_lock". What is the point
> of that one?

Before I put it in I was getting a weird error trying to run UML on a
native kernel with page table sharing. After, it was solid. That's empirical,
but...

> Whenever we end up touching the pmd counts, we already hold the local
> mm->page_table_lock. That means that we are always guaranteed to have a
> count of at least one when we start out on it.

Yes, good observation, I was looking at it inversely: when we have a
count of one then we must have exclusion from the mm->page_table_lock.

> [...]
>
> In short, I do not believe that that lock is needed. And if it isn't
> needed, it is _incorrect_. Locking that doesn't buy you anything is not
> just a performance overhead, it's bad thinking.

It would be very nice if the lock isn't needed. OK, it's going to take some
time to ponder over your post properly. In the mean time, there is exclusion
that's clearly missing elsewhere and needs to go in, i.e., in the read fault
path.

--
Daniel

2002-02-19 01:23:16

by Rik van Riel

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On Mon, 18 Feb 2002, Linus Torvalds wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> >
> > Thanks, here it is again.
>
> Daniel, there's something wrong in the locking.

> Does anybody see any reason why this doesn't work totally without the
> lock?

We'll need protection from the swapout code. It would be
embarassing if the page fault handler would run for one
mm while kswapd was holding the page_table_lock for another
mm.

I'm not sure how the page_table_share_lock is supposed
to fix that one, though.

regards,

Rik
--
"Linux holds advantages over the single-vendor commercial OS"
-- Microsoft's "Competing with Linux" document

http://www.surriel.com/ http://distro.conectiva.com/


2002-02-19 01:25:16

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 02:22 am, Rik van Riel wrote:
> On Mon, 18 Feb 2002, Linus Torvalds wrote:
> > On Tue, 19 Feb 2002, Daniel Phillips wrote:
> > >
> > > Thanks, here it is again.
> >
> > Daniel, there's something wrong in the locking.
>
> > Does anybody see any reason why this doesn't work totally without the
> > lock?
>
> We'll need protection from the swapout code. It would be
> embarrassing if the page fault handler would run for one
> mm while kswapd was holding the page_table_lock for another
> mm.
>
> I'm not sure how the page_table_share_lock is supposed
> to fix that one, though.

It doesn't, at present. This needs to be addressed.

--
Daniel

2002-02-19 01:46:35

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 01:56 am, Linus Torvalds wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> >
> > Thanks, here it is again. This time I left the gratuitous whitespace
> > cleanup in as the route of least resistance ;-)
>
> Daniel, there's something wrong in the locking.
>
> I can see _no_ reason to have "page_table_share_lock". What is the point
> of that one?
>
> Whenever we end up touching the pmd counts, we already hold the local
> mm->page_table_lock. That means that we are always guaranteed to have a
> count of at least one when we start out on it.
>
> We have two cases:
> (a) sharing a new pmd
> (b) unsharing
>
> So let's go through the cases.
>
> (a) Sharing
> - test old count. It is either 1 or not.
> - if old count > 1, then we know it was already marked read-only,
> and we have nothing to do but to increment the count.
> - if the old count == 1, then we are the only users and nobody
> else can even try to add sharing due to mm->page_table_lock.
> So just mark the thing read-only, invalidate the current TLB,
> and increment the count.
>
> Do you see any reason to lock anything?

No, that's correct. Never mind that I'm currently setting the CoW at
that point, regardless of the share count. That's extra work that doesn't
need doing.

> (b) Unsharing
> - test old count. It is either 1 or not.
> - if old-count > 1, then we know somebody else might unshare in
> parallel, but that doesn't matter. Everybody does:
>
> - allocate new pmd
> - copy over into new pmd and mark writable, increment page
> counts.
> - install new pmd
> - unuse_pmd() on old pmd, ie:
> if (put_page_testzero(pmd_page)) {
> for-each-page()
> free the pages
> __free_pages_ok(pmd_page);
> }

Yes. I'm still working with the existing design that deletes page tables
only in one place. Because of that I wanted to be sure that two would not
unshare at the same time, requiring the original to be freed after two
copies are made. Your suggestions for freeing page tables aggressively on
the fly are the way to go, I agree, but it's a more aggressive change and
I'd like to get the current, more boring design stable as a step in that
direction. I find it easier to start with a functioning system and remove
locking until it breaks than start with a broken system and add locking
until it works. Of course it would be even better to have a perfect
understanding of the situation from the very beginning, but things seem
to seldom work out that way.

I feel the same way about the RO on the page table, it's where we want to
go but I'd like to go there like an inchworm as opposed to a grasshopper.

> - if old-count == 1, we know we're exclusive, and nobody else can
> ever get at it, we just mark everything writable and do not play
> any games at all with the count.
>
> Do you see any reason to lock anything?

Somebody might read fault, changing an entry when we're in the middle of
copying it and might do a duplicated read fault.

> In short, I do not believe that that lock is needed. And if it isn't
> needed, it is _incorrect_. Locking that doesn't buy you anything is not
> just a performance overhead, it's bad thinking.
>
> Look at all the potential races:
>
> - share+share:
> no races. Either the page started out exclusive (in which case
> only one user can see it at 1), or it started out shared (in which
> case all users see it > 1).
>
> - share+unshare:
> This implies that the count _must_ have been at least
> "1+(nr-of-unsharers)", as one user is obviously adding sharing
> (the "1"), and everybody else must be have it shared. (And we will
> end up with a count of at least 2 after the share, because of the
> sharer)
>
> Regardless of _what_ the order is, the unsharers must have seen at
> least a count of 2 to start with (their own + the one who is
> about to share or has just shared). So all unsharers will clearly
> allocate a new page, and will end up with
>
> if (put_page_testzero(pmd_page)) {
> ...
> }
>
> where the "testzero" case will never happen.
>
> - unshare+unshare:
> This is the only halfway interesting case. You might have both
> unsharers seeing the count > 1, in which case _both_ will do the
> copy, and both will do the "put_page_testzero()".
>
> However, only one will actually end up triggering the
> "put_page_testzero()", and by the time that happens, the pmd is
> totally exclusive, and the only thing it should do (and does) is
> to just go through the pmd and decrement the counts for the pte
> entries.
>
> Note that no way can both unsharers see a count of 1 - the
> unsharers both see at least their own counts, and the other
> unsharer's count will only be decremented after it has already
> copied the page away into its private storage.
>
> The other possibility is that one sees a count > 1, and gets to
> copy and decrement it before the other racer even sees the count,
> so the other one sees the 1, and won't bother with the copy.
> That's likely to be the common case (serialized by other things),
> and is the case we want.
>
> Does anybody see any reason why this doesn't work totally without the
> lock?

With the exception of the missing read fault exclusion this looks good.

OK, still working on it. There is a massive memory leak to hunt down
also, so even though I may have more locking than needed, something's
still missing.

--
Daniel

2002-02-19 01:50:15

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] Page table sharing



On Mon, 18 Feb 2002, Rik van Riel wrote:
>
> We'll need protection from the swapout code.

Absolutely NOT.

If the swapout code unshares or shares the PMD, that's a major bug.

The swapout code doesn't need to know one way or the other, because the
swapout code never actually touches the pmd itself, it just follows the
pointers - it doesn't ever need to worry about the pmd counts at all.

Linus

2002-02-19 01:53:45

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 02:48 am, Linus Torvalds wrote:
> On Mon, 18 Feb 2002, Rik van Riel wrote:
> >
> > We'll need protection from the swapout code.
>
> Absolutely NOT.
>
> If the swapout code unshares or shares the PMD, that's a major bug.

What it will do is change entries on the page table. We have to be sure
two processes don't read in or evict the same page at the same time.

> The swapout code doesn't need to know one way or the other, because the
> swapout code never actually touches the pmd itself, it just follows the
> pointers - it doesn't ever need to worry about the pmd counts at all.

That was my original, incomplete view of the situation as well.

--
Daniel

2002-02-19 01:54:25

by Rik van Riel

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On Mon, 18 Feb 2002, Linus Torvalds wrote:
> On Mon, 18 Feb 2002, Rik van Riel wrote:
> >
> > We'll need protection from the swapout code.
>
> Absolutely NOT.
>
> If the swapout code unshares or shares the PMD, that's a major bug.
>
> The swapout code doesn't need to know one way or the other, because the
> swapout code never actually touches the pmd itself, it just follows the
> pointers - it doesn't ever need to worry about the pmd counts at all.

The swapout code can remove a page from the page table
while another process is in the process of unsharing
the page table.

We really want to make sure that the copied-over page
table doesn't point to a page which just got swapped
out.

regards,

Rik
--
"Linux holds advantages over the single-vendor commercial OS"
-- Microsoft's "Competing with Linux" document

http://www.surriel.com/ http://distro.conectiva.com/

2002-02-19 01:55:45

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] Page table sharing



On Tue, 19 Feb 2002, Daniel Phillips wrote:
>
> Somebody might read fault, changing an entry when we're in the middle of
> copying it and might do a duplicated read fault.

You're confusing the mm->mmap_sem with the page_table_lock.

The mm semaphore is really a read-write semaphore, and yes, there can be
multiple faulters active at the same time reading in pages.

But the page_table_lock is 100% exclusive, and while you hold the
page_table_lock there is absolutely _not_ going to be any concurrent page
faulting.

(NOTE! Sure, there might be another mm that has the same pmd shared, but
that one is going to do an unshare before it actually touches anything in
the pmd, so it's NOT going to change the values in the original pmd).

So I'm personally convinced that the locking shouldn't be needed at all,
if you just make sure that you do things in the right order (that, of
course, might need some memory barriers, which had better be implied by
the atomic dec-and-test anyway).

Linus

2002-02-19 02:07:37

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] Page table sharing



On Mon, 18 Feb 2002, Rik van Riel wrote:
>
> The swapout code can remove a page from the page table
> while another process is in the process of unsharing
> the page table.

Ok, I'll buy that. However, looking at that, the locking is not the real
issue at all:

When the swapper does a "ptep_get_and_clear()" on a shared pmd, it will
end up having to not just synchronize with anybody doing unsharing, it
will have to flush all the TLB's on all the mm's that might be implicated.

Which implies that the swapper needs to look up all mm's some way anyway,
so the locking gets solved that way.

(Now, that's really pushing the problem somewhere else, and that
"somewhere else" is actually a whole lot nastier than the original locking
problem ever was, so I'm not claiming this actually solves anything. I'm
just saying that the locking isn't the issue, and we actually have some
quite fundamental problems here..)

Linus

2002-02-19 02:08:36

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 02:53 am, Linus Torvalds wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> >
> > Somebody might read fault, changing an entry when we're in the middle of
> > copying it and might do a duplicated read fault.
>
> You're confusing the mm->mmap_sem with the page_table_lock.
>
> The mm semaphore is really a read-write semaphore, and yes, there can be
> multiple faulters active at the same time reading in pages.
>
> But the page_table_lock is 100% exclusive, and while you hold the
> page_table_lock there is absolutely _not_ going to be any concurrent page
> faulting.

Sure there can be, because we only hold the mm->page_table_lock for this;
somebody could be faulting through another mm sharing the page table. For
this reason I believe I have to look at the page table count, and unless
it's one, I have to do some extra exclusion.

> (NOTE! Sure, there might be another mm that has the same pmd shared, but
> that one is going to do an unshare before it actually touches anything in
> the pmd, so it's NOT going to change the values in the original pmd).

Actually, I was planning to keep the tables shared, even through swapin/
swapout. The data remains valid for all mm's, whether it's in ram or in
swap.

> So I'm personally convinced that the locking shouldn't be needed at all,
> if you just make sure that you do things in the right order (that, of
> course, might need some memory barriers, which had better be implied by
> the atomic dec-and-test anyway).

You've convinced me that it can be considerably streamlined, which is
great, but it can't all go, and even now there's some missing.

--
Daniel

2002-02-19 02:18:26

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 03:05 am, Linus Torvalds wrote:
> On Mon, 18 Feb 2002, Rik van Riel wrote:
> >
> > The swapout code can remove a page from the page table
> > while another process is in the process of unsharing
> > the page table.
>
> Ok, I'll buy that. However, looking at that, the locking is not the real
> issue at all:
>
> When the swapper does a "ptep_get_and_clear()" on a shared pmd, it will
> end up having to not just synchronize with anybody doing unsharing, it
> will have to flush all the TLB's on all the mm's that might be implicated.
>
> Which implies that the swapper needs to look up all mm's some way anyway,

Ick. With rmap this is straightforward, but without, what? flush_tlb_all?
Maybe page tables should be unshared on swapin/out after all, only on arches
that need special tlb treatment, or until we have rmap.

> so the locking gets solved that way.

--
Daniel

2002-02-19 02:37:29

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] Page table sharing



On Tue, 19 Feb 2002, Daniel Phillips wrote:
> >
> > Which implies that the swapper needs to look up all mm's some way anyway,
>
> Ick. With rmap this is straightforward, but without, what?

It is not at ALL straightforward with rmap either.

Remember: one of the big original _points_ of the pmd sharing was to avoid
having to do the rmap overhead for shared page tables. The fact that it
works without rmap too was just a nice bonus, and makes apples-to-apples
comparisons possible.

So if you do the rmap overhead even when sharing, you're toast. No more
shared pmd's.

> Maybe page tables should be unshared on swapin/out after all, only on arches
> that need special tlb treatment, or until we have rmap.

There is no "or until we have rmap". It doesn't help. All the same issues
hold - if you have to invalidate multiple mm's, you have to find them all.
That's the same whether you have rmap or not, and is a fundamental issue
with sharing pmd's.

Dang, I should have noticed before this.

Note that "swapin" is certainly not the problem - we don't need to swap
the thing into all mm's at the same time, so if an unshare happens just
before/after the swapin and the unshared process doesn't get the thing,
we're still perfectly fine.

In fact, swapin is not even a special case. It's just the same as any
other page fault - we can continue to share page tables over "read-only"
page faults, and even that is _purely_ an optimization (yeah, it needs
some trivial "cmpxchg()" magic on the pmd to work, but it has no TLB
invalidation issues or anything really complex like that).

The only problem is swapout. And "swapout()" is always a problem, in fact.
It's always been special, because it is quite fundamentally the only VM
operation that ever is "nonlocal". We've had tons of races with swapout
over time, it's always been the nastiest VM operation by _far_ when it
comes to page table coherency.

We can, of course, introduce a "pmd-rmap" thing, with a pointer to a
circular list of all mm's using that pmd inside the "struct page *" of the
pmd. Right now the rmap patches just make the pointer point directly to
the one exclusive mm that holds the pmd, right?

(This could be a good "gradual introduction to some of the rmap data
structures" thing too).

Linus

2002-02-19 02:51:32

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 03:35 am, Linus Torvalds wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> > >
> > > Which implies that the swapper needs to look up all mm's some way anyway,
> >
> > Ick. With rmap this is straightforward, but without, what?
>
> It is not at ALL straightforward with rmap either.
>
> Remember: one of the big original _points_ of the pmd sharing was to avoid
> having to do the rmap overhead for shared page tables. The fact that it
> works without rmap too was just a nice bonus, and makes apples-to-apples
> comparisons possible.
>
> So if you do the rmap overhead even when sharing, you're toast. No more
> shared pmd's.
>
> > Maybe page tables should be unshared on swapin/out after all, only on arches
> > that need special tlb treatment, or until we have rmap.
>
> There is no "or until we have rmap". It doesn't help. All the same issues
> hold - if you have to invalidate multiple mm's, you have to find them all.
> That's the same whether you have rmap or not, and is a fundamental issue
> with sharing pmd's.
>
> Dang, I should have noticed before this.
>
> Note that "swapin" is certainly not the problem - we don't need to swap
> the thing into all mm's at the same time, so if an unshare happens just
> before/after the swapin and the unshared process doesn't get the thing,
> we're still perfectly fine.
>
> In fact, swapin is not even a special case. It's just the same as any
> other page fault - we can continue to share page tables over "read-only"
> page faults, and even that is _purely_ an optimization (yeah, it needs
> some trivial "cmpxchg()" magic on the pmd to work, but it has no TLB
> invalidation issues or anything really complex like that).
>
> The only problem is swapout. And "swapout()" is always a problem, in fact.
> It's always been special, because it is quite fundamentally the only VM
> operation that ever is "nonlocal". We've had tons of races with swapout
> over time, it's always been the nastiest VM operation by _far_ when it
> comes to page table coherency.
>
> We can, of course, introduce a "pmd-rmap" thing, with a pointer to a
> circular list of all mm's using that pmd inside the "struct page *" of the
> pmd.

Yes, exactly my thought.

> Right now the rmap patches just make the pointer point directly to
> the one exclusive mm that holds the pmd, right?

Correct.

> (This could be a good "gradual introduction to some of the rmap data
> structures" thing too).

Yup.

--
Daniel

2002-02-19 03:07:05

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 03:35 am, Linus Torvalds wrote:
> Right now the rmap patches just make the pointer point directly to
> the one exclusive mm that holds the pmd, right?

To be precise, the pte_chain ends up at an entry on the page table and
we get to the mm through the page table's struct page.

--
Daniel

2002-02-19 03:25:29

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] Page table sharing



On Mon, 18 Feb 2002, Linus Torvalds wrote:
>
> We can, of course, introduce a "pmd-rmap" thing, with a pointer to a
> circular list of all mm's using that pmd inside the "struct page *" of the
> pmd. Right now the rmap patches just make the pointer point directly to
> the one exclusive mm that holds the pmd, right?

There's another approach:
- get rid of "page_table_lock"
- replace it with a "per-pmd lock"
- notice that we already _have_ such a lock

The lock we have is the lock that we've always had in "struct page".

There are some interesting advantages from this:
- we allow even more parallelism from threads across different CPU's.
- we already have the cacheline for the pmd "struct page" because we
needed it for the pmd count.

That still leaves the TLB invalidation issue, but we could handle that
with an alternate approach: use the same "free_pte_ctx" kind of gathering
that the zap_page_range() code uses for similar reasons (ie gather up the
pte entries that you're going to free first, and then do a global
invalidate later).

Note that this is likely to speed things up anyway (whether the pages are
gathered by rmap or by the current linear walk), by virtue of being able
to do just _one_ TLB invalidate (potentially cross-CPU) rather than having
to do it once for each page we free.

At that point you might as well make the TLB shootdown global (ie you keep
track of a mask of CPU's whose TLB's you want to kill, and any pmd that
has count > 1 just makes that mask be "all CPU's").

I'm a bit worried about the "lock each mm on the pmd-rmap list" approach,
because I think we need to lock them _all_ to be safe (as opposed to
locking them one at a time), which always implies all the nasty potential
deadlocks you get for doing multiple locking.

The "page-lock + potentially one global TLB flush" approach looks a lot
safer in this respect.

Linus

2002-02-19 03:41:41

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 04:22 am, Linus Torvalds wrote:
> On Mon, 18 Feb 2002, Linus Torvalds wrote:
> >
> > We can, of course, introduce a "pmd-rmap" thing, with a pointer to a
> > circular list of all mm's using that pmd inside the "struct page *" of the
> > pmd. Right now the rmap patches just make the pointer point directly to
> > the one exclusive mm that holds the pmd, right?
>
> There's another approach:
> - get rid of "page_table_lock"
> - replace it with a "per-pmd lock"
> - notice that we already _have_ such a lock
>
> The lock we have is the lock that we've always had in "struct page".

Yes, I even have an earlier version of the patch that implements a spinlock
on that bit. It doesn't use the normal lock_page of course.
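
Roughly, a non-sleeping lock on that bit might look like this (sketch
only, hypothetical ptb_lock/ptb_unlock names):

static inline void ptb_lock(struct page *page)
{
        while (test_and_set_bit(PG_locked, &page->flags))
                while (test_bit(PG_locked, &page->flags))
                        cpu_relax();
}

static inline void ptb_unlock(struct page *page)
{
        smp_mb__before_clear_bit();
        clear_bit(PG_locked, &page->flags);
}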

> There are some interesting advantages from this:
> - we allow even more parallelism from threads across different CPU's.
> - we already have the cacheline for the pmd "struct page" because we
> needed it for the pmd count.
>
> That still leaves the TLB invalidation issue, but we could handle that
> with an alternate approach: use the same "free_pte_ctx" kind of gathering
> that the zap_page_range() code uses for similar reasons (ie gather up the
> pte entries that you're going to free first, and then do a global
> invalidate later).
>
> Note that this is likely to speed things up anyway (whether the pages are
> gathered by rmap or by the current linear walk), by virtue of being able
> to do just _one_ TLB invalidate (potentially cross-CPU) rather than having
> to do it once for each page we free.
>
> At that point you might as well make the TLB shootdown global (ie you keep
> track of a mask of CPU's whose TLB's you want to kill, and any pmd that
> has count > 1 just makes that mask be "all CPU's").
>
> I'm a bit worried about the "lock each mm on the pmd-rmap list" approach,
> because I think we need to lock them _all_ to be safe (as opposed to
> locking them one at a time), which always implies all the nasty potential
> deadlocks you get for doing multiple locking.
>
> The "page-lock + potentially one global TLB flush" approach looks a lot
> safer in this respect.

How do we know when to do the global tlb flush?

--
Daniel

2002-02-19 04:32:14

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

Daniel Phillips <[email protected]> writes:

> On February 19, 2002 01:03 am, Hugh Dickins wrote:
> > On Tue, 19 Feb 2002, Daniel Phillips wrote:
> > > On February 18, 2002 08:04 pm, Hugh Dickins wrote:
> > > > On Mon, 18 Feb 2002, Daniel Phillips wrote:
> > > > > On February 18, 2002 09:09 am, Hugh Dickins wrote:
> > > > > > Since copy_page_range would not copy shared page tables, I'm wrong to
> > > > > > point there. But __pte_alloc does copy shared page tables (to unshare
> > > > > > them), and needs them to be stable while it does so: so locking against
> > > > > > swap_out really is required. It also needs locking against read faults,
> > > > > > and they against each other: but there I imagine it's just a matter of
> > > > > > dropping the write arg to __pte_alloc, going back to pte_alloc again.
> > >
> > > I'm not sure what you mean here, you're not suggesting we should unshare the
> > > page table on read fault are you?
> >
> > I am. But I can understand that you'd prefer not to do it that way.
> > Hugh
>
> No, that's not nearly studly enough ;-)
>
> Since we have gone to all the trouble of sharing the page table, we should
> swap in/out for all sharers at the same time. That is, keep it shared, saving
> memory and cpu.
>
> Now I finally see what you were driving at: before, we could count on the
> mm->page_table_lock for exclusion on read fault, now we can't, at least not
> when ptb->count is greater than one[1]. So let's come up with something nice as
> a substitute, any suggestions?
>
> [1] I think that's a big, broad hint.

Something like:
struct mm_share {
spinlock_t page_table_lock;
struct list_head mm_list;
};

struct mm {
struct list_head mm_list;
struct mm_share *mm_share;
.....
};

So we have an overarching structure for all of the shared mm's.

Eric

2002-02-19 10:03:33

by Roman Zippel

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

Hi,

On Mon, 18 Feb 2002, Linus Torvalds wrote:

> We can, of course, introduce a "pmd-rmap" thing, with a pointer to a
> circular list of all mm's using that pmd inside the "struct page *" of the
> pmd.

Isn't that information basically already available via
vma->vm_(pprev|next)_share?

bye, Roman

2002-02-19 11:35:45

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 04:22 am, Linus Torvalds wrote:
> That still leaves the TLB invalidation issue, but we could handle that
> with an alternate approach: use the same "free_pte_ctx" kind of gathering
> that the zap_page_range() code uses for similar reasons (ie gather up the
> pte entries that you're going to free first, and then do a global
> invalidate later).

I think I'll fall back to unsharing the page table on swapout as Hugh
suggested, until we sort this out.

It's not that bad.

--
Daniel

2002-02-19 12:21:00

by Hugh Dickins

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On Tue, 19 Feb 2002, Daniel Phillips wrote:
> On February 19, 2002 04:22 am, Linus Torvalds wrote:
> > That still leaves the TLB invalidation issue, but we could handle that
> > with an alternate approach: use the same "free_pte_ctx" kind of gathering
> > that the zap_page_range() code uses for similar reasons (ie gather up the
> > pte entries that you're going to free first, and then do a global
> > invalidate later).
>
> I think I'll fall back to unsharing the page table on swapout as Hugh
> suggested, until we sort this out.

My proposal was to unshare the page table on read fault, to avoid race.
I suppose you could, just for your current testing, use that technique
in swapout, to avoid the much more serious TLB issue that Linus has now
raised. But don't do so without realizing that it is a very deadlocky
idea for swapout (making pages freeable) to need to allocate pages.
And it's not much use for swapout to skip them either, since the shared
page tables become valuable on the very large address spaces which we'd
want swapout to be hitting.

Hugh

2002-02-19 12:39:42

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 01:22 pm, Hugh Dickins wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> > On February 19, 2002 04:22 am, Linus Torvalds wrote:
> > > That still leaves the TLB invalidation issue, but we could handle that
> > > with an alternate approach: use the same "free_pte_ctx" kind of gathering
> > > that the zap_page_range() code uses for similar reasons (ie gather up the
> > > pte entries that you're going to free first, and then do a global
> > > invalidate later).
> >
> > I think I'll fall back to unsharing the page table on swapout as Hugh
> > suggested, until we sort this out.
>
> My proposal was to unshare the page table on read fault, to avoid race.
> I suppose you could, just for your current testing, use that technique
> in swapout, to avoid the much more serious TLB issue that Linus has now
> raised. But don't do so without realizing that it is a very deadlocky
> idea for swapout (making pages freeable) to need to allocate pages.

I didn't fail to notice that. It's no worse than any other page reservation
issue, of which we have plenty. One day we're going to have to solve them all.

> And it's not much use for swapout to skip them either, since the shared
> page tables become valuable on the very large address spaces which we'd
> want swapout to be hitting.

Unsharing is the route of least resistance at the moment. If necessary I can
keep a page around for that purpose, then reestablish that reserve after using
it.

--
Daniel

2002-02-19 17:32:24

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] Page table sharing



On Tue, 19 Feb 2002, Daniel Phillips wrote:
> >
> > At that point you might as well make the TLB shootdown global (ie you keep
> > track of a mask of CPU's whose TLB's you want to kill, and any pmd that
> > has count > 1 just makes that mask be "all CPU's").
>
> How do we know when to do the global tlb flush?

See above.

Basically, the algorithm is:

invalidate_cpu_mask = 0;

.. for each page swapped out ..

pte = ptep_get_and_clear(ptep);
save_pte_and_mm(pte_page(pte));
mask = mm->cpu_vm_mask;
if (page_count(pmd_page) > 1)
mask = ~0UL;
invalidate_cpu_mask |= mask;

and then at the end you just do

flush_tlb_cpus(invalidate_cpu_mask);
for_each_page_saved() {
free_page(page);
}

(yeah, yeah, add cache coherency etc).

Linus


2002-02-19 17:32:34

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] Page table sharing



On 18 Feb 2002, Eric W. Biederman wrote:
> > [1] I think that's a big, broad hint.
>
> Something like:
> struct mm_share {
> spinlock_t page_table_lock;
> struct list_head mm_list;
> };
>
> struct mm {
> struct list_head mm_list;
> struct mm_share *mm_share;
> .....
> };
>
> So we have an overarching structure for all of the shared mm's.

No, but the mm's aren't shared, only the pmd's are.

So one mm can share one pmd with mm2, and another with mm3.

Sure, you could have a list of "all mm's that _could_ share", and that
might work out well enough. An execve() removes a process from the list,
so usually the list is quite small.

Linus

2002-02-19 18:10:00

by Hugh Dickins

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On Tue, 19 Feb 2002, Linus Torvalds wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> > >
> > > At that point you might as well make the TLB shootdown global (ie you keep
> > > track of a mask of CPU's whose TLB's you want to kill, and any pmd that
> > > has count > 1 just makes that mask be "all CPU's").
> >
> > How do we know when to do the global tlb flush?
>
> See above.
>
> Basically, the algorithm is:
>
> 	invalidate_cpu_mask = 0;
>
> 	.. for each page swapped out ..
>
> 		pte = ptep_get_and_clear(ptep);
> 		save_pte_and_mm(pte_page(pte));
> 		mask = mm->cpu_vm_mask;
> 		if (page_count(pmd_page) > 1)
> 			mask = ~0UL;
> 		invalidate_cpu_mask |= mask;
>
> and then at the end you just do
>
> 	flush_tlb_cpus(invalidate_cpu_mask);
> 	for_each_page_saved() {
> 		free_page(page);
> 	}
>
> (yeah, yeah, add cache coherency etc).

It's a little worse than this, I think. Propagating pte_dirty(pte) to
set_page_dirty(page) cannot be done until after the flush_tlb_cpus, if
the ptes are writable: and copy_page_range is not setting "cow", so not
write protecting, when it's a shared writable mapping. Easy answer is
to scrap "cow" there and always do the write protection; but I doubt
that's the correct answer. swap_out could keep an array of pointers to
ptes, to propagate dirty after flushing TLB and before freeing pages,
but that's not very pretty.

Hugh
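
(For illustration, a minimal sketch of the array-of-saved-ptes workaround Hugh
mentions, recording the cleared pte values alongside their pages so dirty can
be propagated after the flush; he later concludes this isn't needed, see his
follow-up below. Names are made up for the example.)

	/* Illustrative only; 2.4-era interfaces, headers approximate. */
	#include <linux/mm.h>

	struct deferred_dirty {
		pte_t pte;		/* value from ptep_get_and_clear() */
		struct page *page;
	};

	/* Call only after the global TLB flush: nothing can re-dirty now. */
	static void propagate_dirty_and_free(struct deferred_dirty *d, int nr)
	{
		int i;

		for (i = 0; i < nr; i++) {
			if (pte_dirty(d[i].pte))
				set_page_dirty(d[i].page);
			__free_page(d[i].page);
		}
	}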

2002-02-20 14:06:46

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 06:29 pm, Linus Torvalds wrote:
> On Tue, 19 Feb 2002, Daniel Phillips wrote:
> > Linus Torvalds wrote:
> > > At that point you might as well make the TLB shootdown global (ie you
> > > keep track of a mask of CPU's whose TLB's you want to kill, and any pmd
> > > that has count > 1 just makes that mask be "all CPU's").
> >
> > How do we know when to do the global tlb flush?
>
> See above.
>
> Basically, the algorithm is:
>
> 	invalidate_cpu_mask = 0;
>
> 	.. for each page swapped out ..
>
> 		pte = ptep_get_and_clear(ptep);
> 		save_pte_and_mm(pte_page(pte));
> 		mask = mm->cpu_vm_mask;
> 		if (page_count(pmd_page) > 1)
> 			mask = ~0UL;
> 		invalidate_cpu_mask |= mask;
>
> and then at the end you just do
>
> 	flush_tlb_cpus(invalidate_cpu_mask);
> 	for_each_page_saved() {
> 		free_page(page);
> 	}

Silence is the sound of me researching tlb shootdowns, ipi's and the like, to
prepare for doing this work. We don't have a flush_tlb_cpus at the moment, but
it doesn't look hard to write. We don't have save_pte_and_mm either, and it
seems to be well-defined only when the page table use count is one; otherwise
we'd need a page table reverse mapping scheme, a practical and worthwhile
optimization, but not essential to get something working.

<rant>
This topic is very poorly covered in terms of background material. What
information there is seems to be scattered through various Intel manuals or
old lkml posts, or tucked away in professional seminars and higher level
computer engineering courses. Once again we have a situation where hackers
are divided into two groups: those who know the material and haven't got the
time or inclination to document it, and those who don't know it and aren't
willing to admit that for fear of seeming ignorant.
</rant>

Davem has done a nice job of documenting the existing tlb operations; what's
missing is the information required to construct new ones. Can anybody out
there point me to a primer, or write one?

Looking at the current try_to_swap_out code I see only a local invalidate,
flush_tlb_page(vma, address). Why is that? How do we know that this mm could
not be in context on another cpu?

--
Daniel
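
(For what it's worth, one guess at how a flush_tlb_cpus() could be sketched on
i386 SMP. This is not from any posted patch; a real version would reuse the
IPI machinery already in arch/i386/kernel/smp.c rather than
smp_call_function(), which can only broadcast to all other CPUs.)

	/* Hypothetical sketch only; 2.4-era interfaces, headers approximate. */
	#include <linux/smp.h>
	#include <asm/pgtable.h>

	static void ipi_flush_tlb(void *unused)
	{
		__flush_tlb();			/* flush this CPU's entire TLB */
	}

	void flush_tlb_cpus(unsigned long cpu_mask)
	{
		if (!cpu_mask)
			return;
		__flush_tlb();			/* local flush */
		cpu_mask &= ~(1UL << smp_processor_id());
		if (cpu_mask)
			/* coarser than the mask: hits every other CPU */
			smp_call_function(ipi_flush_tlb, NULL, 1, 1);
	}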

2002-02-20 14:14:15

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 07:11 pm, Hugh Dickins wrote:
> On Tue, 19 Feb 2002, Linus Torvalds wrote:
> > On Tue, 19 Feb 2002, Daniel Phillips wrote:
> > > >
> > > > At that point you might as well make the TLB shootdown global (ie you keep
> > > > track of a mask of CPU's whose TLB's you want to kill, and any pmd that
> > > > has count > 1 just makes that mask be "all CPU's").
> > >
> > > How do we know when to do the global tlb flush?
> >
> > See above.
> >
> > Basically, the algorithm is:
> >
> > 	invalidate_cpu_mask = 0;
> >
> > 	.. for each page swapped out ..
> >
> > 		pte = ptep_get_and_clear(ptep);
> > 		save_pte_and_mm(pte_page(pte));
> > 		mask = mm->cpu_vm_mask;
> > 		if (page_count(pmd_page) > 1)
> > 			mask = ~0UL;
> > 		invalidate_cpu_mask |= mask;
> >
> > and then at the end you just do
> >
> > 	flush_tlb_cpus(invalidate_cpu_mask);
> > 	for_each_page_saved() {
> > 		free_page(page);
> > 	}
> >
> > (yeah, yeah, add cache coherency etc).
>
> It's a little worse than this, I think. Propagating pte_dirty(pte) to
> set_page_dirty(page) cannot be done until after the flush_tlb_cpus,

You mean, because somebody might re-dirty an already cleaned page? Or are
you driving at something more subtle?

> if the ptes are writable: and copy_page_range is not setting "cow", so not
> write protecting, when it's a shared writable mapping. Easy answer is
> to scrap "cow" there and always do the write protection; but I doubt
> that's the correct answer.

Nope. For shared mmaps you'd get tons of unnecessary faults.

> swap_out could keep an array of pointers to
> ptes, to propagate dirty after flushing TLB and before freeing pages,
> but that's not very pretty.

It's not horrible, not worse than the already-existing tlb_remove_page
code anyway. I think we're not stopped here, just slowed down for some
head scratching.

--
Daniel

2002-02-20 14:38:31

by Hugh Dickins

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On Wed, 20 Feb 2002, Daniel Phillips wrote:
>
> Looking at the current try_to_swap_out code I see only a local invalidate,
> flush_tlb_page(vma, address), why is that? How do we know that this mm could
> not be in context on another cpu?

I made the same mistake a few months ago: not noticing #ifndef CONFIG_SMP
in the header. arch/i386/kernel/smp.c has the real i386 flush_tlb_page().

Hugh
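
(Roughly, paraphrasing the 2.4 i386 arrangement from memory rather than
quoting it: the inline seen in the generic header is only the uniprocessor
version, and on SMP the real function does the cross-CPU work.)

	/* Rough paraphrase, not verbatim 2.4 source. */
	#ifndef CONFIG_SMP
	/* UP: invalidating the local TLB entry is all there is to do */
	static inline void flush_tlb_page(struct vm_area_struct *vma,
					  unsigned long addr)
	{
		if (vma->vm_mm == current->active_mm)
			__flush_tlb_one(addr);
	}
	#else
	/* SMP: the real flush_tlb_page() lives in arch/i386/kernel/smp.c and
	 * sends invalidate IPIs to the other CPUs in mm->cpu_vm_mask */
	extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr);
	#endif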

2002-02-20 14:53:33

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 20, 2002 03:38 pm, Hugh Dickins wrote:
> On Wed, 20 Feb 2002, Daniel Phillips wrote:
> >
> > Looking at the current try_to_swap_out code I see only a local invalidate,
> > flush_tlb_page(vma, address), why is that? How do we know that this mm could
> > not be in context on another cpu?
>
> I made the same mistake a few months ago: not noticing #ifndef CONFIG_SMP
> in the header. arch/i386/kernel/smp.c has the real i386 flush_tlb_page().

OK, well if I'm making the same mistakes then I'm likely on the right track ;)

So it seems that what we need for tlb invalidate of shared page tables is
not worse than what we already have, though there's some extra bookkeeping
to handle.

Why would we run into your page dirty propagation problem with shared page
tables and not with the current code?

--
Daniel

2002-02-20 15:28:37

by Hugh Dickins

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On Wed, 20 Feb 2002, Daniel Phillips wrote:
> On February 19, 2002 07:11 pm, Hugh Dickins wrote:
> >
> > It's a little worse than this, I think. Propagating pte_dirty(pte) to
> > set_page_dirty(page) cannot be done until after the flush_tlb_cpus,
>
> You mean, because somebody might re-dirty an already cleaned page? Or are
> you driving at something more subtle?

You are right to press me on this. Now I reflect upon it, I think I
was scare-mongering, and I'm sure you don't need that! I apologize.

If the i386 pte was already marked dirty, there's no issue at all.
If the i386 pte was not already marked dirty, but another processor
has that entry in its TLB (not marked dirty), and goes to dirty it
at the wrong moment, either it does so successfully just before the
ptep_get_and_clear (and we see the dirty bit), or it tries to do so
just after the (atomic part of the) ptep_get_and_clear, finds pte
not present and faults (page not yet dirtied). No problem.

This (one cpu invalidating pte while another is halfway through ucode
updating dirty or referenced bit) is errata territory on Pentium Pro.
But if we were going to worry about that, we should have done so
before, you're not introducing any new problem in that respect.

I'm unfamiliar with the other architectures, but I have no reason
to suppose they behave critically differently here. Ben will have
checked this all out when he brought in tlb_remove_page(): and though
his propagation of dirty from pte to page occurs after the flush TLB,
it's irrelevant: that pte comes from ptep_get_and_clear before.

Sorry again,
Hugh

2002-02-23 00:31:54

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

On February 19, 2002 03:05 am, Linus Torvalds wrote:
> On Mon, 18 Feb 2002, Rik van Riel wrote:
> >
> > The swapout code can remove a page from the page table
> > while another process is in the process of unsharing
> > the page table.
>
> Ok, I'll buy that. However, looking at that, the locking is not the real
> issue at all:
>
> When the swapper does a "ptep_get_and_clear()" on a shared pmd, it will
> end up having to not just synchronize with anybody doing unsharing, it
> will have to flush all the TLB's on all the mm's that might be implicated.
>
> Which implies that the swapper needs to look up all mm's some way anyway,
> so the locking gets solved that way.
>
> (Now, that's really pushing the problem somewhere else, and that
> "somewhere else" is actually a whole lot nastier than the original locking
> problem ever was, so I'm not claiming this actually solves anything. I'm
> just saying that the locking isn't the issue, and we actually have some
> quite fundamental problems here..)

Even though I'm quite certain you are right and my locking is way more
heavyweight than necessary, I'd like to see the heavyweight version running
reliably before embarking on a more extensive rearrangement of the code. So
for today I've just added exclusion on the swap out/in paths, and for tlb
invalidation I'm just doing a global invalidate. On my two-way machine it
makes no detectable difference for the relatively lightweight tests I'm
doing now, but granted, somebody is sure to come up with a demonstration that
does show a difference.

This patch isn't stable, though it seems very close. Since I have very
limited time for hacking on it, I'm posting it now in the hope that more
eyeballs will help speed things up. I'm guessing there's a stupid mistake
and not some fundamental oversight, and I haven't noticed it because I
haven't gone over it with a fine-toothed comb.

Here's the style of protection I've used against swap in/out:

	if (page_count(virt_to_page(pte)) == 1) {
		entry = pte_mkyoung(entry);
		establish_pte(vma, address, pte, entry);
	} else {
		spin_lock(&page_table_share_lock);
		entry = pte_mkyoung(entry);
		establish_pte(vma, address, pte, entry);
		spin_unlock(&page_table_share_lock);
	}

Totally crude. The tlb flushing is equally crude:

	if (page_count(virt_to_page(page_table)) == 1) {
		flush_cache_page(vma, address);
		pte = ptep_get_and_clear(page_table);
		flush_tlb_page(vma, address);
	} else {
		spin_lock(&page_table_share_lock);
		flush_cache_all();
		pte = ptep_get_and_clear(page_table);
		flush_tlb_all();
		spin_unlock(&page_table_share_lock);
	}

I know, yuck, but I want to kill the remaining bugs, not make it faster just
now (it's still running more than 2X faster than the good old fork, for forks
from a parent with large vm).

Here is code to demonstrate the problem:

---------------------
useradd -u 9999 new

cat >testnew << END
echo \$1 forks from parent with \$2 pages of mapped memory...
echo -n "New "; sudo -u new ./forktime -n \$1 -m \$2
END

cat >watchnew << END
watch "sh tesnew 10 100000; cat /proc/meminfo"
END

sh watchnew
---------------------

9999 is the magic uid for which page table sharing is enabled. Run this
and watch free get eaten away. Things do seem to stabilize eventually,
but it's very definitely not the same behavior as with the following:

---------------------
useradd -u 8888 old

cat >testold << END
echo \$1 forks from parent with \$2 pages of mapped memory...
echo -n "Old "; sudo -u old ./forktime -n \$1 -m \$2
END

cat >watchold << END
watch "sh testold 10 100000; cat /proc/meminfo"
END

sh watchold
---------------------

Which does the same thing with a non-magic user.

Here is 'forktime.c':

---------------------
/*
* Time fork() system call.
*
* Original by Dave McCracken
* Reformat by Daniel Phillips
*/

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/wait.h>

int main(int argc, char *argv[])
{
	int n = 1, m = 1, i, pagesize = getpagesize();
	struct timeval stime, etime, rtime;
	char *mem;

args:	switch (getopt(argc, argv, "n:m:")) {
	case 'n':
		n = atoi(optarg);
		goto args;
	case 'm':
		m = atoi(optarg);
		goto args;
	case -1:
		break;
	default:
		fprintf(stderr, "Usage: %s [-n iterations] [-m mappedpages]\n", argv[0]);
		exit(1);
	}

	/* map and touch the requested number of anonymous pages */
	mem = mmap(NULL, m * pagesize, PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED)
		exit(3);

	for (i = 0; i < m; i++)
		mem[i * pagesize] = 0;

	gettimeofday(&stime, NULL);

	for (i = 0; i < n; i++)
		switch (fork()) {
		case 0: exit(0);
		case -1: exit(2);
		}

	wait(NULL);
	gettimeofday(&etime, NULL);
	timersub(&etime, &stime, &rtime);
	printf("total fork time: %ld.%06ld seconds in %d iterations ", rtime.tv_sec, rtime.tv_usec, i);
	printf("(%ld usec/fork)\n", ((rtime.tv_sec * 1000000) + rtime.tv_usec) / i);
	exit(0);
}
---------------------
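
(The test program builds and runs standalone, for example:

	gcc -o forktime forktime.c
	sudo -u new ./forktime -n 10 -m 100000

which is how the scripts above drive it.)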

The following patch (-p0 against 2.4.17) is also available at:

nl.linux.org/~phillips/ptab-2.4.17-3

--
Daniel

--- ../2.4.17.clean/fs/exec.c Fri Dec 21 12:41:55 2001
+++ ./fs/exec.c Mon Feb 18 22:57:10 2002
@@ -860,6 +860,7 @@
int retval;
int i;

+ ptab(printk(">>> execve %s\n", filename));
file = open_exec(filename);

retval = PTR_ERR(file);
--- ../2.4.17.clean/include/linux/mm.h Fri Dec 21 12:42:03 2001
+++ ./include/linux/mm.h Fri Feb 22 20:23:17 2002
@@ -411,7 +411,7 @@

extern int vmtruncate(struct inode * inode, loff_t offset);
extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int write));
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
@@ -424,6 +424,19 @@

int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+
+static inline pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+ return __pte_alloc(mm, pmd, address, 1);
+}
+
+#define nil do { } while (0)
+#define ptab_off(cmd) nil
+#define ptab_on(cmd) do { if (current->flags & PF_SHARE_TABLES) cmd; } while (0)
+#define ptab ptab_off
+#define always_share_page_tables 1
+
+extern spinlock_t page_table_share_lock;

/*
* On a two-level page table, this ends up being trivial. Thus the
--- ../2.4.17.clean/include/linux/sched.h Fri Dec 21 12:42:03 2001
+++ ./include/linux/sched.h Mon Feb 18 22:57:10 2002
@@ -427,7 +427,7 @@
#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */
#define PF_FREE_PAGES 0x00002000 /* per process page freeing */
#define PF_NOIO 0x00004000 /* avoid generating further I/O */
-
+#define PF_SHARE_TABLES 0x00008000 /* share page tables (testing) */
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */

/*
--- ../2.4.17.clean/kernel/fork.c Wed Nov 21 13:18:42 2001
+++ ./kernel/fork.c Mon Feb 18 22:57:10 2002
@@ -140,6 +140,7 @@
mm->swap_address = 0;
pprev = &mm->mmap;

+ ptab(printk(">>> dup_mmap\n"));
/*
* Add it to the mmlist after the parent.
* Doing it this way means that we can order the list,
@@ -566,9 +567,10 @@
struct task_struct *p;
struct completion vfork;

+ ptab(printk(">>> fork, stack=%lx\n", stack_start));
retval = -EPERM;

- /*
+ /*
* CLONE_PID is only allowed for the initial SMP swapper
* calls
*/
--- ../2.4.17.clean/kernel/sys.c Tue Sep 18 17:10:43 2001
+++ ./kernel/sys.c Mon Feb 18 22:57:10 2002
@@ -514,6 +514,11 @@
current->uid = new_ruid;
current->user = new_user;
free_uid(old_user);
+
+ if (current->uid == 9999)
+ current->flags |= PF_SHARE_TABLES;
+ printk(">>> user: uid=%i pid=%i pf=%x\n", current->uid, current->pid, current->flags);
+
return 0;
}

--- ../2.4.17.clean/mm/memory.c Fri Dec 21 12:42:05 2001
+++ ./mm/memory.c Fri Feb 22 22:19:43 2002
@@ -34,6 +34,9 @@
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* ([email protected])
+ *
+ * Feb 2002 - Shared page tables added by Daniel Phillips
+ * ([email protected])
*/

#include <linux/mm.h>
@@ -49,6 +52,8 @@
#include <asm/uaccess.h>
#include <asm/tlb.h>

+#define assert(cond) do { if (!(cond)) BUG(); } while (0)
+
unsigned long max_mapnr;
unsigned long num_physpages;
void * high_memory;
@@ -100,10 +105,18 @@
return;
}
pte = pte_offset(dir, 0);
- pmd_clear(dir);
- pte_free(pte);
+ ptab(printk(">>> free page table %p (%i--)\n", pte, page_count(virt_to_page(pte))));
+
+ spin_lock(&page_table_share_lock);
+ if (page_count(virt_to_page(pte)) == 1) {
+ put_page(virt_to_page(pte));
+ pmd_clear(dir);
+ pte_free_slow(pte);
+ }
+ spin_unlock(&page_table_share_lock);
}

+
static inline void free_one_pgd(pgd_t * dir)
{
int j;
@@ -143,8 +156,8 @@
*/
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
- pgd_t * page_dir = mm->pgd;
-
+ pgd_t *page_dir = mm->pgd;
+ ptab(printk(">>> clear_page_tables\n"));
spin_lock(&mm->page_table_lock);
page_dir += first;
do {
@@ -171,13 +184,23 @@
* dst->page_table_lock is held on entry and exit,
* but may be dropped within pmd_alloc() and pte_alloc().
*/
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+spinlock_t page_table_share_lock = SPIN_LOCK_UNLOCKED;
+
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma)
{
- pgd_t * src_pgd, * dst_pgd;
+ pgd_t *src_pgd, *dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+ int share_page_tables = always_share_page_tables || !!(current->flags & PF_SHARE_TABLES);
+
+#if 0
+ static int teststart = 2, testcount = 99, tests = 0;
+ if (share_page_tables && (tests++ < teststart || tests > teststart + testcount))
+ share_page_tables = 0;
+ if (share_page_tables)
+ printk(">>> copy_page_range test %i\n", tests - 1);
+#endif

src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -186,15 +209,15 @@
pmd_t * src_pmd, * dst_pmd;

src_pgd++; dst_pgd++;
-
+
/* copy_pmd_range */
-
+
if (pgd_none(*src_pgd))
- goto skip_copy_pmd_range;
+ goto skip_pmd_range;
if (pgd_bad(*src_pgd)) {
pgd_ERROR(*src_pgd);
pgd_clear(src_pgd);
-skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
+skip_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
if (!address || (address >= end))
goto out;
continue;
@@ -204,34 +227,58 @@
dst_pmd = pmd_alloc(dst, dst_pgd, address);
if (!dst_pmd)
goto nomem;
-
do {
- pte_t * src_pte, * dst_pte;
-
- /* copy_pte_range */
-
+ pte_t *src_ptb, *dst_ptb;
+
if (pmd_none(*src_pmd))
- goto skip_copy_pte_range;
+ goto skip_ptb_range;
if (pmd_bad(*src_pmd)) {
pmd_ERROR(*src_pmd);
pmd_clear(src_pmd);
-skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
+skip_ptb_range: address = (address + PMD_SIZE) & PMD_MASK;
if (address >= end)
goto out;
- goto cont_copy_pmd_range;
+ goto cont_pmd_range;
}

- src_pte = pte_offset(src_pmd, address);
- dst_pte = pte_alloc(dst, dst_pmd, address);
- if (!dst_pte)
+ src_ptb = pte_offset(src_pmd, address);
+
+ if (!share_page_tables) goto no_share;
+
+ if (pmd_none(*dst_pmd)) {
+ spin_lock(&src->page_table_lock);
+ get_page(virt_to_page(src_ptb));
+ pmd_populate(dst, dst_pmd, ((ulong) src_ptb & PAGE_MASK));
+ ptab(printk(">>> share %p @ %p (++%i)\n", src_ptb, address, page_count(virt_to_page(src_ptb))));
+ goto share;
+ }
+ if ((pmd_val(*src_pmd) & PAGE_MASK) != ((pmd_val(*dst_pmd) & PAGE_MASK)))
+ goto no_share;
+ spin_lock(&src->page_table_lock);
+share:
+ do {
+ pte_t pte = *src_ptb;
+ if (!pte_none(pte) && pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page) && cow)
+ ptep_set_wrprotect(src_ptb);
+ }
+ if ((address += PAGE_SIZE) >= end)
+ goto out_unlock;
+ src_ptb++;
+ } while ((ulong) src_ptb & PTE_TABLE_MASK);
+ spin_unlock(&src->page_table_lock);
+
+ goto cont_pmd_range;
+no_share:
+ dst_ptb = __pte_alloc(dst, dst_pmd, address, 0);
+ if (!dst_ptb)
goto nomem;

- spin_lock(&src->page_table_lock);
+ spin_lock(&src->page_table_lock);
do {
- pte_t pte = *src_pte;
+ pte_t pte = *src_ptb;
struct page *ptepage;
-
- /* copy_one_pte */

if (pte_none(pte))
goto cont_copy_pte_range_noset;
@@ -240,14 +287,14 @@
goto cont_copy_pte_range;
}
ptepage = pte_page(pte);
- if ((!VALID_PAGE(ptepage)) ||
+ if ((!VALID_PAGE(ptepage)) ||
PageReserved(ptepage))
goto cont_copy_pte_range;

/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow) {
- ptep_set_wrprotect(src_pte);
- pte = *src_pte;
+ ptep_set_wrprotect(src_ptb);
+ pte = *src_ptb;
}

/* If it's a shared mapping, mark it clean in the child */
@@ -257,16 +304,16 @@
get_page(ptepage);
dst->rss++;

-cont_copy_pte_range: set_pte(dst_pte, pte);
+cont_copy_pte_range: set_pte(dst_ptb, pte);
cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out_unlock;
- src_pte++;
- dst_pte++;
- } while ((unsigned long)src_pte & PTE_TABLE_MASK);
+ src_ptb++;
+ dst_ptb++;
+ } while ((ulong) src_ptb & PTE_TABLE_MASK);
spin_unlock(&src->page_table_lock);
-
-cont_copy_pmd_range: src_pmd++;
+
+cont_pmd_range: src_pmd++;
dst_pmd++;
} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
}
@@ -288,7 +335,6 @@
BUG();
}
}
-
static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
{
unsigned long offset;
@@ -299,10 +345,21 @@
return 0;
if (pmd_bad(*pmd)) {
pmd_ERROR(*pmd);
- pmd_clear(pmd);
+ pmd_clear(pmd);
return 0;
}
+
ptep = pte_offset(pmd, address);
+
+ spin_lock(&page_table_share_lock);
+ if (page_count(virt_to_page(ptep)) > 1) {
+ ptab(printk(">>> zap table!!! %p (%i)\n", ptep, page_count(virt_to_page(ptep))));
+ tlb_remove_page(tlb, (pte_t *) pmd, pmd_val(*pmd));
+ spin_unlock(&page_table_share_lock);
+ return 0;
+ }
+ spin_unlock(&page_table_share_lock);
+
offset = address & ~PMD_MASK;
if (offset + size > PMD_SIZE)
size = PMD_SIZE - offset;
@@ -352,6 +409,24 @@
return freed;
}

+static int unshare_one(struct mm_struct *mm, pgd_t *dir, unsigned long address)
+{
+ if ((address & PMD_MASK)) {
+ address &= PMD_MASK;
+ if (!pgd_none(*dir)) {
+ pmd_t *pmd = pmd_offset(dir, address);
+ if (!pmd_none(*pmd)) {
+ pte_t *ptb = pte_offset(pmd, address);
+ if (page_count(virt_to_page(ptb)) > 1) {
+ __pte_alloc(mm, pmd, address, 1);
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
/*
* remove user pages in a given range.
*/
@@ -374,6 +449,14 @@
if (address >= end)
BUG();
spin_lock(&mm->page_table_lock);
+
+ /*
+ * Ensure first and last partial page tables are unshared
+ */
+ do {
+ unshare_one(mm, dir, address);
+ } while (unshare_one(mm, pgd_offset(mm, end & PMD_MASK), end));
+
flush_cache_range(mm, address, end);
tlb = tlb_gather_mmu(mm);

@@ -517,7 +600,7 @@

mm = current->mm;
dprintk ("map_user_kiobuf: begin\n");
-
+
pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE;
/* mapping 0 bytes is not permitted */
if (!pgcount) BUG();
@@ -1317,13 +1400,28 @@
}

if (write_access) {
- if (!pte_write(entry))
- return do_wp_page(mm, vma, address, pte, entry);
-
+ if (!pte_write(entry)) {
+ if (page_count(virt_to_page(pte)) == 1) {
+ return do_wp_page(mm, vma, address, pte, entry);
+ } else {
+ int err;
+ spin_lock(&page_table_share_lock);
+ err = do_wp_page(mm, vma, address, pte, entry);
+ spin_unlock(&page_table_share_lock);
+ return err;
+ }
+ }
entry = pte_mkdirty(entry);
}
- entry = pte_mkyoung(entry);
- establish_pte(vma, address, pte, entry);
+ if (page_count(virt_to_page(pte)) == 1) {
+ entry = pte_mkyoung(entry);
+ establish_pte(vma, address, pte, entry);
+ } else {
+ spin_lock(&page_table_share_lock);
+ entry = pte_mkyoung(entry);
+ establish_pte(vma, address, pte, entry);
+ spin_unlock(&page_table_share_lock);
+ }
spin_unlock(&mm->page_table_lock);
return 1;
}
@@ -1398,33 +1496,73 @@
* We've already handled the fast-path in-line, and we own the
* page table lock.
*/
-pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t *__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int unshare)
{
- if (pmd_none(*pmd)) {
- pte_t *new;
+ struct page *ptb_page;
+ pte_t *new, *src_ptb, *dst_ptb;

- /* "fast" allocation can happen without dropping the lock.. */
- new = pte_alloc_one_fast(mm, address);
- if (!new) {
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
+ if (pmd_none(*pmd)) {
+ spin_unlock(&mm->page_table_lock);
+ new = pte_alloc_one(mm, address);
+ spin_lock(&mm->page_table_lock);
+ if (!new) return NULL;

- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (!pmd_none(*pmd)) {
- pte_free(new);
- goto out;
- }
- }
+ /* Populated while unlocked? */
+ if (!pmd_none(*pmd))
+ goto unshared_free;
pmd_populate(mm, pmd, new);
+unshared: return pte_offset(pmd, address);
}
-out:
- return pte_offset(pmd, address);
+ if (!unshare || page_count(virt_to_page(pmd_page(*pmd))) == 1)
+ goto unshared;
+ spin_unlock(&mm->page_table_lock);
+ new = pte_alloc_one(mm, address);
+ spin_lock(&mm->page_table_lock);
+ if (!new) return NULL;
+
+ assert(page_count(virt_to_page(new)) == 1);
+ spin_lock(&page_table_share_lock);
+ ptb_page = virt_to_page(pmd_page(*pmd));
+ if (page_count(ptb_page) == 1)
+ goto unshared_unlock;
+
+ ptab(printk(">>> make page table %p @ %p %s\n", new, address,
+ unshare == 2? "write fault":
+ unshare == 1? "unshared":
+ "bogus"));
+
+ src_ptb = pte_offset(pmd, 0);
+ dst_ptb = new;
+ ptab(printk(">>> unshare %p (%i--)\n", *pmd, page_count(ptb_page)));
+ // assert(ptb_page->flags & PG_shared_debug);
+ do {
+ pte_t pte = *src_ptb;
+ if (!pte_none(pte)) {
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page)) {
+ get_page(page);
+ pte = pte_mkold(pte_mkclean(pte));
+ mm->rss++;
+ }
+ } else
+ swap_duplicate(pte_to_swp_entry(pte));
+ set_pte(dst_ptb, pte);
+ }
+ src_ptb++;
+ dst_ptb++;
+ } while ((ulong) dst_ptb & PTE_TABLE_MASK);
+ pmd_populate(mm, pmd, new);
+ put_page(ptb_page);
+ spin_unlock(&page_table_share_lock);
+ goto unshared;
+
+unshared_unlock:
+ get_page(ptb_page);
+ spin_unlock(&page_table_share_lock);
+unshared_free:
+ pte_free_slow(new);
+ goto unshared;
}

int make_pages_present(unsigned long addr, unsigned long end)
--- ../2.4.17.clean/mm/mremap.c Thu Sep 20 23:31:26 2001
+++ ./mm/mremap.c Mon Feb 18 22:57:54 2002
@@ -92,6 +92,7 @@
{
unsigned long offset = len;

+ ptab(printk(">>> mremap\n"));
flush_cache_range(mm, old_addr, old_addr + len);

/*
--- ../2.4.17.clean/mm/vmscan.c Fri Dec 21 12:42:05 2001
+++ ./mm/vmscan.c Fri Feb 22 16:05:29 2002
@@ -69,9 +69,17 @@
* is needed on CPUs which update the accessed and dirty
* bits in hardware.
*/
- flush_cache_page(vma, address);
- pte = ptep_get_and_clear(page_table);
- flush_tlb_page(vma, address);
+ if (page_count(virt_to_page(page_table)) == 1) {
+ flush_cache_page(vma, address);
+ pte = ptep_get_and_clear(page_table);
+ flush_tlb_page(vma, address);
+ } else {
+ spin_lock(&page_table_share_lock);
+ flush_cache_all();
+ pte = ptep_get_and_clear(page_table);
+ flush_tlb_all();
+ spin_unlock(&page_table_share_lock);
+ }

if (pte_dirty(pte))
set_page_dirty(page);

2002-02-23 01:33:50

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing

The following gross mistake was noticed promptly by Rik van Riel:

	spin_lock(&page_table_share_lock);
-	if (page_count(virt_to_page(pte)) == 1) {
+	if (put_page_testzero(virt_to_page(pte))) {
		pmd_clear(dir);
		pte_free_slow(pte);
	}
	spin_unlock(&page_table_share_lock);

However, oddly enough, that's not the memory leak.

--
Daniel

2002-02-23 04:22:02

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] Page table sharing, leak gone

I found the leak. As predicted, it was stupid, a get_page left over from a
previous incarnation. Now it's behaving pretty nicely. I haven't seen an
oops for a while and leaks seem to be down to a dull roar if not entirely
gone. I think swapoff isn't entirely comfortable with the new situation and
tends to hang; I'll look into that in due course.

Since I don't actually have a user base for this, I just overwrote the
previously posted leaky version; it's still at:

nl.linux.org/~phillips/ptab-2.4.17-3

This is getting to the point of needing heavier testing.

--
Daniel