This patch eliminates about 35% of the raw rmap setup/teardown overhead by
adopting a new locking interface that allows the add_rmaps to be batched in
copy_page_range. This is work-in-progress. I expect to show a further 35%
overhead reduction shortly, by batching the remove_rmaps as well. Further
gains will come more slowly, but I hope that an immediate 70% reduction in
overhead gets us into the doesn't-suck-too-much range, and we can move on
to other benchmarks.
This patch is against 2.4.19-pre7+rmap13b. I'll forward-port to 2.5 in due
course; in the meantime, this should allow you to verify my results.
Here is the script I used, essentially the same as the one you originally
posted, but all in one piece:
---------------------------
#!/bin/sh
doit()
{
( cat $1 | wc -l )
}
doitlots()
{
count=0
while (( count<500 ))
do
doit foo >/dev/null
count=$(expr $count + 1)
done
echo done
}
echo hello >foobar
rm -f foocount
echo >foocount
count=0
while (( count<10 ))
do
doitlots foobar >>foocount &
let count++
done
count=0
while (( count<10 ))
do
count=$(cat foocount | wc -l)
done
---------------------------
--- 2.4.19-pre7.clean/include/linux/mm.h Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/include/linux/mm.h Fri Aug 2 17:45:04 2002
@@ -131,8 +131,10 @@
struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
};
-/* forward declaration; pte_chain is meant to be internal to rmap.c */
-struct pte_chain;
+struct pte_chain {
+ struct pte_chain * next;
+ pte_t * ptep;
+};
/*
* Each physical page in the system has a struct page associated with
@@ -324,29 +326,40 @@
#define PageLaunder(page) test_bit(PG_launder, &(page)->flags)
#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags)
-/*
- * inlines for acquisition and release of PG_chainlock
- */
-static inline void pte_chain_lock(struct page *page)
+#define num_rmap_locks (1 << 8)
+
+extern spinlock_t rmap_locks[num_rmap_locks];
+
+void init_rmap_locks(void);
+
+static inline unsigned long rmap_locknum(unsigned long index)
{
- /*
- * Assuming the lock is uncontended, this never enters
- * the body of the outer loop. If it is contended, then
- * within the inner loop a non-atomic test is used to
- * busywait with less bus contention for a good time to
- * attempt to acquire the lock bit.
- */
- while (test_and_set_bit(PG_chainlock, &page->flags)) {
- while (test_bit(PG_chainlock, &page->flags))
- cpu_relax();
- }
+ return (index >> 4) & (num_rmap_locks - 1);
}
-static inline void pte_chain_unlock(struct page *page)
+static inline spinlock_t *lock_rmap(struct page *page)
{
- clear_bit(PG_chainlock, &page->flags);
+ unsigned long index = page->index;
+ while (1) {
+ spinlock_t *lock = rmap_locks + rmap_locknum(index);
+ spin_lock(lock);
+ if (index == page->index)
+ return lock;
+ spin_unlock(lock);
+ }
}
+static inline void set_page_index(struct page *page, unsigned long index)
+{
+ spinlock_t *lock = lock_rmap(page);
+ page->index = index;
+ spin_unlock(lock);
+}
+
+struct pte_chain *pte_chain_alloc(zone_t *zone);
+void pte_chain_push(zone_t *zone, struct pte_chain *pte_chain);
+void add_rmap_nolock(struct page* page, pte_t *ptep, struct pte_chain *pte_chain);
+
/*
* The zone field is never updated after free_area_init_core()
* sets it, so none of the operations on it need to be atomic.
@@ -519,7 +532,7 @@
extern int shmem_zero_setup(struct vm_area_struct *);
extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size);
-extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
+extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma, unsigned *locknum);
extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot);
--- 2.4.19-pre7.clean/kernel/fork.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/kernel/fork.c Fri Aug 2 16:25:22 2002
@@ -132,6 +132,7 @@
{
struct vm_area_struct * mpnt, *tmp, **pprev;
int retval;
+ unsigned rmap_locknum = -1;
flush_cache_mm(current->mm);
mm->locked_vm = 0;
@@ -191,7 +192,7 @@
*pprev = tmp;
pprev = &tmp->vm_next;
mm->map_count++;
- retval = copy_page_range(mm, current->mm, tmp);
+ retval = copy_page_range(mm, current->mm, tmp, &rmap_locknum);
spin_unlock(&mm->page_table_lock);
if (tmp->vm_ops && tmp->vm_ops->open)
--- 2.4.19-pre7.clean/mm/bootmem.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/bootmem.c Fri Aug 2 16:25:22 2002
@@ -61,6 +61,8 @@
*/
memset(bdata->node_bootmem_map, 0xff, mapsize);
+ init_rmap_locks(); // is there a better place for this?
+
return mapsize;
}
--- 2.4.19-pre7.clean/mm/filemap.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/filemap.c Fri Aug 2 16:25:22 2002
@@ -635,7 +635,7 @@
if (!PageLocked(page))
BUG();
- page->index = index;
+ set_page_index(page, index);
page_cache_get(page);
spin_lock(&pagecache_lock);
add_page_to_inode_queue(mapping, page);
@@ -658,7 +658,7 @@
flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
page->flags = flags | (1 << PG_locked);
page_cache_get(page);
- page->index = offset;
+ set_page_index(page, offset);
add_page_to_inode_queue(mapping, page);
add_page_to_hash_queue(page, hash);
}
--- 2.4.19-pre7.clean/mm/memory.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/memory.c Fri Aug 2 17:48:29 2002
@@ -176,13 +176,17 @@
* dst->page_table_lock is held on entry and exit,
* but may be dropped within pmd_alloc() and pte_alloc().
*/
+struct pte_chain *pte_chain_alloc(zone_t *zone);
+
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, unsigned *unused_locknum)
{
pgd_t * src_pgd, * dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ zone_t *pte_chain_zone = zone_table[ZONE_NORMAL];
+ struct pte_chain *local_pte_chain = NULL, *pte_chain;
src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -212,6 +216,8 @@
do {
pte_t * src_pte, * dst_pte;
+ unsigned last_locknum = -1;
+ spinlock_t *rmap_lock = NULL;
/* copy_pte_range */
@@ -247,6 +253,28 @@
goto cont_copy_pte_range_noset;
}
ptepage = pte_page(pte);
+
+ if (!local_pte_chain) {
+ unsigned more = 16;
+ if (last_locknum != -1) {
+ spin_unlock(rmap_lock);
+ last_locknum = -1;
+ }
+ while (more--) {
+ struct pte_chain *new = pte_chain_alloc(pte_chain_zone);
+ new->next = local_pte_chain;
+ local_pte_chain = new;
+ }
+ }
+
+ if (last_locknum != rmap_locknum(ptepage->index)) {
+ if (last_locknum != -1) {
+
+ spin_unlock(rmap_lock);
+ }
+ rmap_lock = lock_rmap(ptepage);
+ last_locknum = rmap_locknum(ptepage->index);
+ }
if ((!VALID_PAGE(ptepage)) ||
PageReserved(ptepage))
goto cont_copy_pte_range;
@@ -265,15 +293,24 @@
dst->rss++;
cont_copy_pte_range: set_pte(dst_pte, pte);
- page_add_rmap(ptepage, dst_pte);
+ pte_chain = local_pte_chain;
+ local_pte_chain = local_pte_chain->next;
+ add_rmap_nolock(ptepage, dst_pte, pte_chain);
+
cont_copy_pte_range_noset: address += PAGE_SIZE;
- if (address >= end)
+ if (address >= end) {
+ if (last_locknum != -1)
+ spin_unlock(rmap_lock);
goto out_unlock;
+ }
src_pte++;
dst_pte++;
} while ((unsigned long)src_pte & PTE_TABLE_MASK);
spin_unlock(&src->page_table_lock);
-
+
+ if (last_locknum != -1)
+ spin_unlock(rmap_lock);
+
cont_copy_pmd_range: src_pmd++;
dst_pmd++;
} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
@@ -281,6 +318,13 @@
out_unlock:
spin_unlock(&src->page_table_lock);
out:
+ spin_lock(&pte_chain_zone->pte_chain_freelist_lock);
+ while (local_pte_chain) {
+ struct pte_chain *next = local_pte_chain->next;
+ pte_chain_push(pte_chain_zone, local_pte_chain);
+ local_pte_chain = next;
+ }
+ spin_unlock(&pte_chain_zone->pte_chain_freelist_lock);
return 0;
nomem:
return -ENOMEM;
@@ -1518,3 +1562,4 @@
}
return page;
}
+
--- 2.4.19-pre7.clean/mm/page_alloc.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/page_alloc.c Fri Aug 2 17:49:36 2002
@@ -213,6 +213,7 @@
if (curr != head) {
unsigned int index;
+ static unsigned foo_page_allocs;
page = memlist_entry(curr, struct page, list);
if (BAD_RANGE(zone,page))
@@ -227,6 +228,7 @@
spin_unlock_irqrestore(&zone->lock, flags);
set_page_count(page, 1);
+ page->index = foo_page_allocs++ >> PAGE_CACHE_SHIFT;
if (BAD_RANGE(zone,page))
BUG();
DEBUG_LRU_PAGE(page);
--- 2.4.19-pre7.clean/mm/rmap.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/rmap.c Fri Aug 2 17:33:51 2002
@@ -43,16 +43,20 @@
* in systems with long-lived applications the relative overhead of
* exit() will be lower since the applications are long-lived.
*/
-struct pte_chain {
- struct pte_chain * next;
- pte_t * ptep;
-};
-static inline struct pte_chain * pte_chain_alloc(zone_t *);
+spinlock_t rmap_locks[num_rmap_locks];
+
static inline void pte_chain_free(struct pte_chain *, struct pte_chain *,
struct page *, zone_t *);
static void alloc_new_pte_chains(zone_t *);
+void init_rmap_locks(void)
+{
+ int i = 0;
+ while (i < num_rmap_locks)
+ spin_lock_init(rmap_locks + i++);
+}
+
/**
* page_referenced - test if the page was referenced
* @page: the page to test
@@ -86,9 +90,10 @@
* Add a new pte reverse mapping to a page.
* The caller needs to hold the mm->page_table_lock.
*/
-void page_add_rmap(struct page * page, pte_t * ptep)
+void page_add_rmap(struct page *page, pte_t *ptep)
{
- struct pte_chain * pte_chain;
+ struct pte_chain *pte_chain;
+ spinlock_t *rmap_lock;
#ifdef DEBUG_RMAP
if (!page || !ptep)
@@ -103,7 +108,7 @@
return;
#ifdef DEBUG_RMAP
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
{
struct pte_chain * pc;
for (pc = page->pte_chain; pc; pc = pc->next) {
@@ -111,19 +116,48 @@
BUG();
}
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
#endif
pte_chain = pte_chain_alloc(page_zone(page));
-
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
/* Hook up the pte_chain to the page. */
pte_chain->ptep = ptep;
pte_chain->next = page->pte_chain;
page->pte_chain = pte_chain;
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
+}
+
+void add_rmap_nolock(struct page* page, pte_t *ptep, struct pte_chain *pte_chain)
+{
+#ifdef DEBUG_RMAP
+ if (!page || !ptep)
+ BUG();
+ if (!pte_present(*ptep))
+ BUG();
+ if (!ptep_to_mm(ptep))
+ BUG();
+#endif
+
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return;
+
+#ifdef DEBUG_RMAP
+ {
+ struct pte_chain * pc;
+ for (pc = page->pte_chain; pc; pc = pc->next) {
+ if (pc->ptep == ptep)
+ BUG();
+ }
+ }
+#endif
+
+ /* Hook up the pte_chain to the page. */
+ pte_chain->ptep = ptep;
+ pte_chain->next = page->pte_chain;
+ page->pte_chain = pte_chain;
}
/**
@@ -140,6 +174,7 @@
{
struct pte_chain * pc, * prev_pc = NULL;
zone_t *zone;
+ spinlock_t *rmap_lock;
if (!page || !ptep)
BUG();
@@ -148,7 +183,7 @@
zone = page_zone(page);
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
if (pc->ptep == ptep) {
pte_chain_free(pc, prev_pc, page, zone);
@@ -166,7 +201,7 @@
#endif
out:
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
return;
}
@@ -335,8 +370,7 @@
** functions.
**/
-static inline void pte_chain_push(zone_t * zone,
- struct pte_chain * pte_chain)
+void pte_chain_push(zone_t *zone, struct pte_chain *pte_chain)
{
pte_chain->ptep = NULL;
pte_chain->next = zone->pte_chain_freelist;
@@ -388,7 +422,7 @@
* pte_chain structures as required.
* Caller needs to hold the page's pte_chain_lock.
*/
-static inline struct pte_chain * pte_chain_alloc(zone_t * zone)
+struct pte_chain *pte_chain_alloc(zone_t *zone)
{
struct pte_chain * pte_chain;
--- 2.4.19-pre7.clean/mm/swap.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/swap.c Fri Aug 2 17:35:47 2002
@@ -78,6 +78,8 @@
*/
void drop_page(struct page * page)
{
+ spinlock_t *rmap_lock;
+
if (!TryLockPage(page)) {
if (page->mapping && page->buffers) {
page_cache_get(page);
@@ -90,7 +92,7 @@
}
/* Make sure the page really is reclaimable. */
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (!page->mapping || PageDirty(page) || page->pte_chain ||
page->buffers || page_count(page) > 1)
deactivate_page_nolock(page);
@@ -106,7 +108,7 @@
add_page_to_inactive_clean_list(page);
}
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
}
/*
--- 2.4.19-pre7.clean/mm/vmscan.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/vmscan.c Fri Aug 2 17:33:46 2002
@@ -81,6 +81,7 @@
struct list_head * page_lru;
swp_entry_t entry = {0};
int maxscan;
+ spinlock_t *rmap_lock;
/*
* We need to hold the pagecache_lock around all tests to make sure
@@ -109,13 +110,13 @@
}
/* Page cannot be reclaimed ? Move to inactive_dirty list. */
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (unlikely(page->pte_chain || page->buffers ||
PageReferenced(page) || PageDirty(page) ||
page_count(page) > 1 || TryLockPage(page))) {
del_page_from_inactive_clean_list(page);
add_page_to_inactive_dirty_list(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
continue;
}
@@ -140,7 +141,7 @@
printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
list_del(page_lru);
zone->inactive_clean_pages--;
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
}
spin_unlock(&pagecache_lock);
@@ -150,7 +151,7 @@
found_page:
__lru_cache_del(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
spin_unlock(&pagecache_lock);
spin_unlock(&pagemap_lru_lock);
if (entry.val)
@@ -213,6 +214,7 @@
{
int maxscan, cleaned_pages, target;
struct list_head * entry;
+ spinlock_t *rmap_lock;
target = free_plenty(zone);
cleaned_pages = 0;
@@ -268,17 +270,17 @@
* The page is in active use or really unfreeable. Move to
* the active list and adjust the page age if needed.
*/
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (page_referenced(page) && page_mapping_inuse(page) &&
!page_over_rsslimit(page)) {
del_page_from_inactive_dirty_list(page);
add_page_to_active_list(page);
page->age = max((int)page->age, PAGE_AGE_START);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
continue;
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
/*
* Anonymous process memory without backing store. Try to
@@ -286,10 +288,10 @@
*
* XXX: implement swap clustering ?
*/
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (page->pte_chain && !page->mapping && !page->buffers) {
page_cache_get(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
spin_unlock(&pagemap_lru_lock);
if (!add_to_swap(page)) {
activate_page(page);
@@ -300,7 +302,7 @@
}
page_cache_release(page);
spin_lock(&pagemap_lru_lock);
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
}
/*
@@ -313,14 +315,14 @@
case SWAP_FAIL:
goto page_active;
case SWAP_AGAIN:
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
continue;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
if (PageDirty(page) && page->mapping) {
/*
@@ -407,12 +409,12 @@
* This test is not safe from races, but only the one
* in reclaim_page() needs to be.
*/
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (page->mapping && !PageDirty(page) && !page->pte_chain &&
page_count(page) == 1) {
del_page_from_inactive_dirty_list(page);
add_page_to_inactive_clean_list(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
cleaned_pages++;
} else {
@@ -424,7 +426,7 @@
page_active:
del_page_from_inactive_dirty_list(page);
add_page_to_active_list(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
}
}
@@ -476,6 +478,7 @@
struct list_head * page_lru;
int nr_deactivated = 0;
struct page * page;
+ spinlock_t *rmap_lock;
/* Take the lock while messing with the list... */
spin_lock(&pagemap_lru_lock);
@@ -505,9 +508,9 @@
* From here until the end of the current iteration
* both PG_locked and the pte_chain_lock are held.
*/
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (!page_mapping_inuse(page)) {
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
drop_page(page);
continue;
@@ -533,12 +536,12 @@
} else {
deactivate_page_nolock(page);
if (++nr_deactivated > target) {
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
goto done;
}
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
/* Low latency reschedule point */
Daniel Phillips wrote:
>
> This patch eliminates about 35% of the raw rmap setup/teardown overhead by
> adopting a new locking interface that allows the add_rmaps to be batched in
> copy_page_range.
Well that's fairly straightforward, thanks. Butt-ugly though ;)
Don't bother doing teardown yet. I have patches which batch
all the zap_page_range activity into 16-page chunks, so we
eventually end up in a single function with 16 virtually-contiguous
pages to release. Adding the batched locking to that will
be simple.
Sigh. I have a test which sends the 2.5.30 VM into a five-minute
coma and which immediately panics latest -ac with pte_chain oom.
Remind me again why all this is worth it?
I'll port your stuff to 2.5 over the weekend, let you know...
On Fri, Aug 02, 2002 at 01:20:37PM -0700, Andrew Morton wrote:
> Sigh. I have a test which sends the 2.5.30 VM into a five-minute
> coma and which immediately panics latest -ac with pte_chain oom.
> Remind me again why all this is worth it?
> I'll port your stuff to 2.5 over the weekend, let you know...
I wrote the test (or is this the one I wrote?); I'll fix it. I've
already arranged to sleep in the right places.
Cheers,
Bill
On Fri, 2 Aug 2002, Andrew Morton wrote:
> Daniel Phillips wrote:
> >
> > This patch eliminates about 35% of the raw rmap setup/teardown overhead by
> > adopting a new locking interface that allows the add_rmaps to be batched in
> > copy_page_range.
>
> Well that's fairly straightforward, thanks. Butt-ugly though ;)
It'd be nice if the code would be a bit more beautiful and the
reverse mapping scheme more modular.
Remember that we're planning to go to an object-based scheme
later on, turning the code into a big monolithic mesh really
makes long-term maintenance a pain...
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
Rik van Riel wrote:
>
> On Fri, 2 Aug 2002, Andrew Morton wrote:
> > Daniel Phillips wrote:
> > >
> > > This patch eliminates about 35% of the raw rmap setup/teardown overhead by
> > > adopting a new locking interface that allows the add_rmaps to be batched in
> > > copy_page_range.
> >
> > Well that's fairly straightforward, thanks. Butt-ugly though ;)
>
> It'd be nice if the code would be a bit more beautiful and the
> reverse mapping scheme more modular.
I changed it to, essentially:
foo()
{
spinlock_t *rmap_lock = NULL;
unsigned rmap_lockno = -1;
...
for (stuff) {
cached_rmap_lock(page, &rmap_lock, &rmap_lockno);
__page_add_rmap(page, ptep);
..
}
drop_rmap_lock(&rmap_lock, &rmap_lockno);
}
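Roughly, the two helpers could look like this (a sketch of the intent
only; the real versions are in the patch linked below):

/*
 * Sketch: remember the last rmap bucket lock we took and only
 * unlock/relock when the next page hashes to a different bucket.
 * Assumes Daniel's rmap_locks[]/rmap_locknum()/lock_rmap().
 */
static inline void cached_rmap_lock(struct page *page,
				    spinlock_t **lockp, unsigned *lockno)
{
	unsigned want = rmap_locknum(page->index);

	if (*lockp == NULL || *lockno != want) {
		if (*lockp)
			spin_unlock(*lockp);
		*lockp = lock_rmap(page);
		*lockno = rmap_locknum(page->index);
	}
}

static inline void drop_rmap_lock(spinlock_t **lockp, unsigned *lockno)
{
	if (*lockp) {
		spin_unlock(*lockp);
		*lockp = NULL;
		*lockno = -1;
	}
}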
See http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/daniel-rmap-speedup.patch
Fixing zap_pte_range pretty much requires the pagemap_lru_lock
rework; otherwise we couldn't hold the rmap lock across
tlb_remove_page().
> Remember that we're planning to go to an object-based scheme
> later on, turning the code into a big monolithic mesh really
> makes long-term maintenance a pain...
We have short-term rmap problems:
1) Unexplained pte chain state with ntpd
2) 10-20% increased CPU load in fork/exec/exit loads
3) system lock under heavy mmap load
4) ZONE_NORMAL pte_chain consumption
Daniel and I are on 2), Bill is on 4) (I think).
On Fri, 2 Aug 2002, Andrew Morton wrote:
> > > Well that's fairly straightforward, thanks. Butt-ugly though ;)
> I changed it to, essentially:
> See http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/daniel-rmap-speedup.patch
This patch looks good. Good enough for long-term maintainability,
even... ;)
I like it.
> We have short-term rmap problems:
>
> 1) Unexplained pte chain state with ntpd
I'll do a detailed trace of xntpd to see what's happening...
> 2) 10-20% increased CPU load in fork/exec/exit loads
> 3) system lock under heavy mmap load
> 4) ZONE_NORMAL pte_chain consumption
>
> Daniel and I are on 2), Bill is on 4) (I think).
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
Rik van Riel wrote:
>> Remember that we're planning to go to an object-based scheme
>> later on, turning the code into a big monolithic mesh really
>> makes long-term maintenance a pain...
On Fri, Aug 02, 2002 at 05:31:45PM -0700, Andrew Morton wrote:
> We have short-term rmap problems:
> 1) Unexplained pte chain state with ntpd
> 2) 10-20% increased CPU load in fork/exec/exit loads
> 3) system lock under heavy mmap load
> 4) ZONE_NORMAL pte_chain consumption
On Fri, Aug 02, 2002 at 05:31:45PM -0700, Andrew Morton wrote:
> Daniel and I are on 2), Bill is on 4) (I think).
I am indeed on (4), though I'd describe what I'm doing as "OOM handling".
Cheers,
Bill
On Friday 02 August 2002 22:20, Andrew Morton wrote:
> Daniel Phillips wrote:
> >
> > This patch eliminates about 35% of the raw rmap setup/teardown overhead by
> > adopting a new locking interface that allows the add_rmaps to be batched
> > in copy_page_range.
>
> Well that's fairly straightforward, thanks. Butt-ugly though ;)
I could try "fast is beautiful" or "beauty is in the eye of the beholder",
but I think I'll stick with "beauty isn't the point just now".
> Don't bother doing teardown yet. I have patches which batch
> all the zap_page_range activity into 16-page chunks, so we
> eventually end up in a single function with 16 virtually-contiguous
> pages to release. Adding the batched locking to that will
> be simple.
Great. Well, both the locking locality of anonymous pages and the dispersion
of mmapped pages could be improved considerably, so maybe I'll play around with
those a little. Taking a wild guess, it might be good for another 5-10%
overhead reduction, and won't impact the basic structure.
> Sigh. I have a test which sends the 2.5.30 VM into a five-minute
> coma
That doesn't sound like a rmap problem per se. Is the test posted?
> and which immediately panics latest -ac with pte_chain oom.
> Remind me again why all this is worth it?
It will be worth it when we finally have a system that swaps well and doesn't
die if you throw a lot of disk IO at it (like BSD). It will be doubly worth
it when active defragmentation happens.
What we will end up with at the end of this cycle will have all the solidity
and flexibility of the BSD VM with little of the complexity. According to me
anyway ;-)
--
Daniel
No joy, I'm afraid.
2.5.26:
./daniel.sh 39.78s user 71.72s system 368% cpu 30.260 total
quad:/home/akpm> time ./daniel.sh
./daniel.sh 38.45s user 70.00s system 365% cpu 29.642 total
c0132b0c 328 1.03288 free_page_and_swap_cache
c013074c 334 1.05177 lru_cache_add
c0112a64 428 1.34778 do_page_fault
c0144e90 434 1.36667 link_path_walk
c01388b4 458 1.44225 do_page_cache_readahead
c01263e0 468 1.47374 clear_page_tables
c01319b0 479 1.50838 __free_pages_ok
c01e0700 514 1.61859 radix_tree_lookup
c012a8a8 605 1.90515 find_get_page
c01079f8 640 2.01537 page_fault
c01127d4 649 2.04371 pte_alloc_one
c0131ca0 811 2.55385 rmqueue
c0127cc8 1152 3.62766 do_anonymous_page
c013230c 1421 4.47474 page_cache_release
c0126880 1544 4.86207 zap_pte_range
c012662c 1775 5.58949 copy_page_range
c0127e70 1789 5.63358 do_no_page
c012750c 6860 21.6022 do_wp_page
Stock 2.5.30:
./daniel.sh 36.60s user 88.23s system 366% cpu 34.029 total
quad:/home/akpm> time ./daniel.sh
./daniel.sh 37.22s user 87.88s system 354% cpu 35.288 total
c014fdc4 191 0.872943 __d_lookup
c01310c0 203 0.927788 kmem_cache_alloc_batch
c0114154 227 1.03748 do_page_fault
c0146ea8 243 1.1106 link_path_walk
c0132fd0 257 1.17459 __free_pages_ok
c0134284 279 1.27514 free_page_and_swap_cache
c0131538 309 1.41225 kmem_cache_free
c0107c90 320 1.46252 page_fault
c012ca48 326 1.48995 find_get_page
c012a220 349 1.59506 handle_mm_fault
c0128520 360 1.64534 clear_page_tables
c0113ed0 367 1.67733 pte_alloc_one
c013129c 399 1.82358 kmem_cache_alloc
c01332bc 453 2.07038 rmqueue
c0129df4 557 2.5457 do_anonymous_page
c013392c 689 3.14899 page_cache_release
c0128a60 832 3.80256 zap_pte_range
c0129fa0 893 4.08135 do_no_page
c0128828 1081 4.94059 copy_page_range
c013aa74 1276 5.83181 page_add_rmap
c013ab3c 3094 14.1408 page_remove_rmap
c01296a8 3466 15.841 do_wp_page
2.5.30+pagemap_lru_lock stuff
quad:/home/akpm> time ./daniel.sh
./daniel.sh 41.01s user 97.15s system 373% cpu 36.996 total
quad:/home/akpm> time ./daniel.sh
./daniel.sh 36.67s user 87.04s system 368% cpu 33.575 total
c0131d60 230 1.08979 kmem_cache_alloc_batch
c0148728 231 1.09453 link_path_walk
c01321d8 238 1.12769 kmem_cache_free
c01142b4 240 1.13717 do_page_fault
c0135624 291 1.37882 free_page_and_swap_cache
c012a8cc 323 1.53044 handle_mm_fault
c0128790 326 1.54466 clear_page_tables
c0107c90 338 1.60152 page_fault
c0131f3c 350 1.65837 kmem_cache_alloc
c0113f20 373 1.76735 pte_alloc_one
c012d2a8 397 1.88107 find_get_page
c013466c 415 1.96636 rmqueue
c0132f74 449 2.12746 __pagevec_release
c012a3bc 532 2.52073 do_anonymous_page
c012a5b0 772 3.6579 do_no_page
c0128da0 854 4.04643 zap_pte_range
c0128b48 1031 4.8851 copy_page_range
c013c054 1244 5.89434 page_add_rmap
c013c11c 3088 14.6316 page_remove_rmap
c0129b58 3206 15.1907 do_wp_page
2.5.30+pagemap_lru_lock+this patch:
quad:/home/akpm> time ./daniel.sh
./daniel.sh 38.78s user 91.56s system 366% cpu 35.534 total
quad:/home/akpm> time ./daniel.sh
./daniel.sh 38.07s user 88.64s system 363% cpu 34.883 total
c0135a90 332 1.30853 free_page_and_swap_cache
c013c57c 332 1.30853 page_add_rmap
c012ad4d 337 1.32824 .text.lock.memory
c0132448 353 1.3913 kmem_cache_free
c0128790 372 1.46618 clear_page_tables
c0107c90 377 1.48589 page_fault
c01142b4 423 1.66719 do_page_fault
c0113f20 432 1.70266 pte_alloc_one
c012d518 438 1.72631 find_get_page
c013c91c 438 1.72631 .text.lock.rmap
c01321ac 443 1.74602 kmem_cache_alloc
c012aafc 453 1.78543 handle_mm_fault
c01349fc 463 1.82485 rmqueue
c012a5ec 655 2.58159 do_anonymous_page
c01331e4 748 2.94813 __pagevec_release
c012a7e0 992 3.90982 do_no_page
c0128e90 1426 5.62037 zap_pte_range
c0128b48 1586 6.25099 copy_page_range
c013c5c8 2324 9.1597 __page_remove_rmap
c0129d88 4028 15.8758 do_wp_page
- page_add_rmap has vanished
- page_remove_rmap has halved (80% of the remaining is the
list walk)
- we've moved the cost into the new locking site, zap_pte_range
and copy_page_range.
So rmap locking is still a 15% slowdown on my soggy quad, which generally
seems relatively immune to locking costs. PPC will like the change
because spinlocks are better than bitops. ia32 should have liked it
for the same reason but, as I say, this machine doesn't seem to have
the bandwidth*latency to be affected much by these things.
On more modern machines and other architectures this remains
a significant problem for rmap, I expect.
Guess we should instrument it up and make sure that the hashing
and index thing is getting the right locality. I saw UML-for-2.5.30
whizz past, if you have time ;)
Broken out patches are at
http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/
Rolled-up patch is at
http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/everything.gz
On Saturday 03 August 2002 07:24, Andrew Morton wrote:
> - page_add_rmap has vanished
> - page_remove_rmap has halved (80% of the remaining is the
> list walk)
> - we've moved the cost into the new locking site, zap_pte_range
> and copy_page_range.
> So rmap locking is still a 15% slowdown on my soggy quad, which generally
> seems relatively immune to locking costs.
What is it about your quad? I'm getting the expected results here on my
two-way. I just checked that the lock hashing is doing what it's supposed to.
It is: if I drop all the locks into a single bucket, the speedup drops by
half.
It seems odd that you're seeing effectively no change at all. Is it possible
we lost something in translation? What happens if you just run with the
copy_page_range side, and no changes to zap_page_range?
> PPC will like the change
> because spinlocks are better than bitops. ia32 should have liked it
> for the same reason but, as I say, this machine doesn't seem to have
> the bandwidth*latency to be affected much by these things.
>
> On more modern machines and other architectures this remains
> a significant problem for rmap, I expect.
My 2X 1GHz PIII definitely likes it.
> Guess we should instrument it up and make sure that the hashing
> and index thing is getting the right locality. I saw UML-for-2.5.30
> whizz past, if you have time ;)
I've got instrumentation for that all ready to go. I'll break it out and send
it along. The bucket distribution can definitely be improved, by xoring some
higher bits of the lock number with a value specific to each mapping. The
anon page locality is poor with the simple increment-a-counter approach; we
can do much better.
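For illustration, the sort of tweak I have in mind might look like this
(hypothetical sketch; the two-argument form and the shift amounts are made
up, nothing like this is in the posted patch):

/*
 * Fold a value derived from the mapping into the bucket selection so
 * that low page indexes of different files (and of anon regions) don't
 * all crowd into the same few rmap locks.
 */
static inline unsigned long rmap_locknum(unsigned long index,
					 struct address_space *mapping)
{
	unsigned long salt = (unsigned long)mapping >> 5;

	return ((index >> 4) ^ salt) & (num_rmap_locks - 1);
}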
But before we start on the micro-optimization we need to know why your quad
is so unaffected by the big change. Are you sure the slab cache batching of
pte chain allocation performs as well as my simpleminded inline batching?
(I batched the pte chain allocation lock quite nicely.) What about the bit
test/set for the direct rmap pointer: how is performance affected by dropping
the direct lookup optimization? Note that you are holding the rmap lock
considerably longer than I was, by holding it across __page_add_rmap instead
of just across the few instructions where pointers are actually updated. I'm
also wondering if gcc is optimizing your cached_rmap_lock inline as well as
you think it is.
I really need to be running on 2.5 so I can crosscheck your results. I'll
return to the matter of getting the dac960 running now.
Miscellaneous question: we are apparently adding rmaps to reserved pages, why
is that?
--
Daniel
On Fri, 2 Aug 2002, Andrew Morton wrote:
> No joy, I'm afraid.
> Guess we should instrument it up and make sure that the hashing
> and index thing is getting the right locality.
Could it be that your quad needs to go to RAM to grab a cacheline
that's exclusive on the other CPU while Daniel's machine can just
shove cachelines from CPU to CPU ?
What I'm referring to is the fact that the pte_chain_locks in
Daniel's patch are all packed into a few cachelines, instead of
having each lock on its own cache line...
This could explain the fact that the locking overhead plummeted
on Daniel's box while it didn't change at all on your machine.
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
On Saturday 03 August 2002 07:24, Andrew Morton wrote:
> No joy, I'm afraid.
We need to eliminate some variables. People, can we please have some SMP
results for 2-way or whatever-way for the exact kernel I used:
http://www.kernel.org/pub/linux/kernel/v2.4/linux-2.4.18.tar.bz2
http://www.kernel.org/pub/linux/kernel/v2.4/testing/old/patch-2.4.19-pre7.bz2
http://surriel.com/patches/2.4/2.4.19p7-rmap-13b
With and without this patch:
http://people.nl.linux.org/~phillips/patches/rmap.speedup-2.4.19-pre7
Using this script:
http://people.nl.linux.org/~phillips/patches/lots_of_forks.sh
time sh lots_of_forks.sh
Thanks.
--
Daniel
Daniel Phillips wrote:
>
> On Saturday 03 August 2002 07:24, Andrew Morton wrote:
> > - page_add_rmap has vanished
> > - page_remove_rmap has halved (80% of the remaining is the
> > list walk)
> > - we've moved the cost into the new locking site, zap_pte_range
> > and copy_page_range.
>
> > So rmap locking is still a 15% slowdown on my soggy quad, which generally
> > seems relatively immune to locking costs.
>
> What is it about your quad?
Dunno. I was discussing related things with David M-T yesterday. A
`lock;incl' on a P4 takes ~120 cycles, everything in-cache. On my PIII
it's 33 cycles. 12-16 on ia64. Lots of variation there.
Instrumentation during your 35 second test shows:
- Performed an rmap lock 6.5M times
- Got a hit on the cached lock 9.2M times
- Got a miss on the cached lock 2.6M times.
So the remaining (6.5M - 2.6M) locks were presumably in the
pagefault handlers.
lockmeter results are at http://www.zip.com.au/~akpm/linux/rmap-lockstat.txt
- We took 3,000,000 rwlocks (mainly pagecache)
- We took 20,000,000 spinlocks
- the locking is spread around copy_mm, copy-page_range,
do_no_page, handle_mm_fault, page_add_rmap, zap_pte_range
- total amount of CPU time lost spinning on locks is 1%, mainly
in page_add_rmap and zap_pte_range.
That's not much spintime. The total system time with this test went
from 71 seconds (2.5.26) to 88 seconds (2.5.30). (4.5 seconds per CPU)
So all the time is presumably spent waiting on cachelines to come from
other CPUs, or from local L2.
lockmeter results for 2.5.26 are at
http://www.zip.com.au/~akpm/linux/2.5.26-lockstat.txt
- 2.5.26 took 17,000,000 spinlocks
- but 3,000,000 of those were kmap_lock and pagemap_lru_lock, which
have been slaughtered in my tree. rmap really added 6,000,000
locks to 2.5.30.
Running the same test on 2.4:
2.4.19-pre7:
./daniel.sh 35.12s user 65.96s system 363% cpu 27.814 total
./daniel.sh 35.95s user 64.77s system 362% cpu 27.763 total
./daniel.sh 34.99s user 66.46s system 364% cpu 27.861 total
2.4.19-pre7+rmap:
./daniel.sh 36.20s user 106.80s system 363% cpu 39.316 total
./daniel.sh 38.76s user 118.69s system 399% cpu 39.405 total
./daniel.sh 35.47s user 106.90s system 364% cpu 39.062 total
2.4.19-pre7+rmap-13b+your patch:
./daniel.sh 33.72s user 97.20s system 364% cpu 35.904 total
./daniel.sh 35.18s user 94.48s system 363% cpu 35.690 total
./daniel.sh 34.83s user 95.66s system 363% cpu 35.921 total
The system time is pretty gross, isn't it?
And it's disproportional to the increased number of lockings.
> ...
>
> But before we start on the micro-optimization we need to know why your quad
> is so unaffected by the big change.
We need a little test proggy to measure different platforms' cache
load latency, locked operations, etc. I have various bits-n-pieces,
will put something together.
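As a rough strawman of the locked-op part (assumes an ia32-style TSC and
gcc inline asm; only the relative numbers across machines mean anything):

#include <stdio.h>

static inline unsigned long long rdtsc(void)
{
	unsigned int lo, hi;

	__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
	return ((unsigned long long)hi << 32) | lo;
}

int main(void)
{
	enum { LOOPS = 1000000 };
	volatile int counter = 0;
	unsigned long long t0, t1;
	int i;

	t0 = rdtsc();
	for (i = 0; i < LOOPS; i++)
		__asm__ __volatile__("lock; incl %0" : "+m" (counter));
	t1 = rdtsc();

	printf("lock;incl: ~%llu cycles each\n", (t1 - t0) / LOOPS);
	return 0;
}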
> Are you sure the slab cache batching of
> pte chain allocation performs as well as my simpleminded inline batching?
Slab is pretty good, I find. And there's no indication of a problem
in the profiles.
> (I batched the pte chain allocation lock quite nicely.) What about the bit
> test/set for the direct rmap pointer, how is performance affected by dropping
> the direct lookup optimization?
It didn't show in the instruction-level oprofiling.
> Note that you are holding the rmap lock
> considerably longer than I was, by holding it across __page_add_rmap instead
> of just across the few instructions where pointers are actually updated. I'm
> also wondering if gcc is optimizing your cached_rmap_lock inline as well as
> you think it is.
>
> I really need to be running on 2.5 so I can crosscheck your results. I'll
> return to the matter of getting the dac960 running now.
Sigh. No IDE?
> Miscellaneous question: we are apparently adding rmaps to reserved pages, why
> is that?
That's one for Rik...
Rik van Riel wrote:
>
> On Fri, 2 Aug 2002, Andrew Morton wrote:
>
> > No joy, I'm afraid.
>
> > Guess we should instrument it up and make sure that the hashing
> > and index thing is getting the right locality.
>
> Could it be that your quad needs to go to RAM to grab a cacheline
> that's exclusive on the other CPU while Daniel's machine can just
> shove cachelines from CPU to CPU ?
Could be, Rik. I don't know. It's a bit worrisome that we might
be dependent on subtleties like that.
> What I'm referring to is the fact that the pte_chain_locks in
> Daniel's patch are all packed into a few cachelines, instead of
> having each lock on its own cache line...
>
> This could explain the fact that the locking overhead plummeted
> on Daniel's box while it didn't change at all on your machine.
Oh it helped a bit. More in 2.4 than 2.5. Possibly I broke
Daniel's patch somehow. But even the improvement in 2.4
from Daniel's patch is disappointing.
On Saturday 03 August 2002 23:05, Rik van Riel wrote:
> On Fri, 2 Aug 2002, Andrew Morton wrote:
>
> > No joy, I'm afraid.
>
> > Guess we should instrument it up and make sure that the hashing
> > and index thing is getting the right locality.
>
> Could it be that your quad needs to go to RAM to grab a cacheline
> that's exclusive on the other CPU while Daniel's machine can just
> shove cachelines from CPU to CPU ?
>
> What I'm referring to is the fact that the pte_chain_locks in
> Daniel's patch are all packed into a few cachelines, instead of
> having each lock on its own cache line...
>
> This could explain the fact that the locking overhead plummeted
> on Daniel's box while it didn't change at all on your machine.
To put each lock in its own cacheline:
static inline unsigned rmap_lockno(pgoff_t index)
{
- return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 1);
+ return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 64);
}
a quick hack, notable mainly for being obscure ;-)
I did try this and noticed scarcely any difference.
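For comparison, the conventional way to give each lock its own cache line
is explicit padding rather than index games; a sketch only, not something
that was benchmarked here:

/*
 * Pad each rmap lock out to a full cache line so neighbouring buckets
 * never share a line.  Costs num_rmap_locks * SMP_CACHE_BYTES of memory
 * instead of num_rmap_locks * sizeof(spinlock_t).
 */
static union {
	spinlock_t lock;
	char pad[SMP_CACHE_BYTES];
} rmap_locks[num_rmap_locks] ____cacheline_aligned;

#define rmap_lock_ptr(n)	(&rmap_locks[(n)].lock)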
--
Daniel
On Saturday 03 August 2002 23:43, Andrew Morton wrote:
> > This could explain the fact that the locking overhead plummeted
> > on Daniel's box while it didn't change at all on your machine.
>
> Oh it helped a bit. More in 2.4 than 2.5. Possibly I broke
> Daniel's patch somehow. But even the improvement in 2.4
> from Daniel's patch is disappointing.
What are the numbers for 2.4? Are they for my out-of-the-box patch?
--
Daniel
On Sat, 3 Aug 2002, Andrew Morton wrote:
> > Miscellaneous question: we are apparently adding rmaps to reserved pages, why
> > is that?
>
> That's one for Rik...
Good question, 2.4-rmap never did that so I wonder when that
got changed...
In fact, page_add_rmap() checks for PageReserved and doesn't
add the rmap if the page is reserved.
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
Wait a second, guys: the problem is with the script. Look at those CPU
numbers:
> ./daniel.sh 39.78s user 71.72s system 368% cpu 30.260 total
> quad:/home/akpm> time ./daniel.sh
> ./daniel.sh 38.45s user 70.00s system 365% cpu 29.642 total
They should be 399%!! With my fancy script, the processes themselves are
getting serialized somehow.
Let's back up and try this again with this pair of scripts, much closer to
the original:
doitlots:
-------------------------------
#!/bin/sh
doit()
{
( cat $1 | wc -l )
}
count=0
while [ $count != 500 ]
do
doit doitlots > /dev/null
count=$(expr $count + 1)
done
echo done
-------------------------------
forklots:
-------------------------------
echo >foocount
./doitlots >>foocount &
./doitlots >>foocount &
./doitlots >>foocount &
./doitlots >>foocount &
./doitlots >>foocount &
./doitlots >>foocount &
./doitlots >>foocount &
./doitlots >>foocount &
./doitlots >>foocount &
count=0
while [ $count != 10 ]
do
count=$(wc foocount | cut -b -8)
done
-------------------------------
/me makes the sign of the beast at bash
--
Daniel
Daniel Phillips wrote:
>
> Wait a second guys, the problem is with the script, look at those CPU
> numbers:
>
> > ./daniel.sh 39.78s user 71.72s system 368% cpu 30.260 total
> > quad:/home/akpm> time ./daniel.sh
> > ./daniel.sh 38.45s user 70.00s system 365% cpu 29.642 total
>
> They should be 399%!! With my fancy script, the processes themselves are
> getting serialized somehow.
>
> Lets back up and try this again with this pair of scripts, much closer to
> the original:
>
Still 360%. I did have a version which achieved 398%, but it
succumbed to the monthly "why is there so much junk in my home
dir" disease.
But it doesn't matter, does it? We're looking at deltas here.
On Sunday 04 August 2002 00:39, Andrew Morton wrote:
> Daniel Phillips wrote:
> >
> > Wait a second guys, the problem is with the script, look at those CPU
> > numbers:
> >
> > > ./daniel.sh 39.78s user 71.72s system 368% cpu 30.260 total
> > > quad:/home/akpm> time ./daniel.sh
> > > ./daniel.sh 38.45s user 70.00s system 365% cpu 29.642 total
> >
> > They should be 399%!! With my fancy script, the processes themselves are
> > getting serialized somehow.
> >
> > Lets back up and try this again with this pair of scripts, much closer to
> > the original:
>
> Still 360%. I did have a version which achieved 398%, but it
> succumbed to the monthly "why is there so much junk in my home
> dir" disease.
>
> But it doesn't matter, does it? We're looking at deltas here.
It matters a whole lot. If we aren't saturating the CPUs then we're not
testing what we think we're testing.
--
Daniel
On Saturday 03 August 2002 23:40, Andrew Morton wrote:
> - total amount of CPU time lost spinning on locks is 1%, mainly
> in page_add_rmap and zap_pte_range.
>
> That's not much spintime. The total system time with this test went
> from 71 seconds (2.5.26) to 88 seconds (2.5.30). (4.5 seconds per CPU)
> So all the time is presumably spent waiting on cachelines to come from
> other CPUs, or from local L2.
Have we tried this one:
static inline unsigned rmap_lockno(pgoff_t index)
{
- return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 1);
+ return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 16);
}
(which puts all the rmap spinlocks in separate cache lines)
--
Daniel
On Saturday 03 August 2002 23:40, Andrew Morton wrote:
> Running the same test on 2.4:
>
> 2.4.19-pre7:
> ./daniel.sh 35.12s user 65.96s system 363% cpu 27.814 total
> ./daniel.sh 35.95s user 64.77s system 362% cpu 27.763 total
> ./daniel.sh 34.99s user 66.46s system 364% cpu 27.861 total
>
> 2.4.19-pre7+rmap:
> ./daniel.sh 36.20s user 106.80s system 363% cpu 39.316 total
> ./daniel.sh 38.76s user 118.69s system 399% cpu 39.405 total
> ./daniel.sh 35.47s user 106.90s system 364% cpu 39.062 total
>
> 2.4.19-pre7+rmap-13b+your patch:
> ./daniel.sh 33.72s user 97.20s system 364% cpu 35.904 total
> ./daniel.sh 35.18s user 94.48s system 363% cpu 35.690 total
> ./daniel.sh 34.83s user 95.66s system 363% cpu 35.921 total
>
> The system time is pretty gross, isn't it?
>
> And it's disproportional to the increased number of lockings.
These numbers show a 30% reduction in rmap overhead with my patch,
close to what I originally reported:
((35.904 + 35.690 + 35.921) - (27.814 + 27.763 + 27.861)) /
((39.316 + 39.405 + 39.062) - (27.814 + 27.763 + 27.861)) ~= .70
But they also show that rmap overhead is around 29% on your box,
even with my patch:
(35.904 + 35.690 + 35.921) / (27.814 + 27.763 + 27.861) ~= 1.29
Granted, it's still way too high, and we are still in search of the
'dark cycles'.
Did we do an apples-to-apples comparison of 2.4 to 2.5? Because if
we did, then going by your numbers, 2.5.26 is already considerably
worse than 2.4.19-pre7:
((30.260 + 29.642)/2) / ((27.814 + 27.763 + 27.861)/3) ~= 1.08
Is it fair to compare your 2.4 vs 2.5 numbers?
--
Daniel
Separate cache lines, yes, but don't you also need to be somewhat
aware of the algorithm used for 2-way or 4-way set-associative caching?
I forget the details, but it seems like the P-II or P-III used an algorithm
that dropped the last several bits when deciding which bucket in the
set it would put a value in. So when we had items at some multiple
of, I think, 64 (could be 64 bytes, 64 Kbytes), only one entry
in the set-associative cache would be used for lots and lots of memory
references. My brain is really fuzzy on this right now, but if anyone
knows what replacement algorithm, for instance, a P-II or P-III uses
in its 2-way or 4-way set-associative cache, that might
help point out where a micro-optimization isn't optimizing as expected...
And sorry for the vagueness of the description; I don't have my Intel
books at hand to dig this up. But that might account for wide variations
in the cost of a single function on two different processor types of the
same family.
gerrit
In message <E17b7iB-0003Lu-00@starship>, Daniel Phillips writes:
> On Saturday 03 August 2002 23:40, Andrew Morton wrote:
> > - total amount of CPU time lost spinning on locks is 1%, mainly
> > in page_add_rmap and zap_pte_range.
> >
> > That's not much spintime. The total system time with this test went
> > from 71 seconds (2.5.26) to 88 seconds (2.5.30). (4.5 seconds per CPU)
> > So all the time is presumably spent waiting on cachelines to come from
> > other CPUs, or from local L2.
>
> Have we tried this one:
>
> static inline unsigned rmap_lockno(pgoff_t index)
> {
> - return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 1);
> + return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 16);
> }
>
> (which puts all the rmap spinlocks in separate cache lines)
>
> --
> Daniel
Daniel Phillips wrote:
>
> On Saturday 03 August 2002 23:40, Andrew Morton wrote:
> > Running the same test on 2.4:
> >
> > 2.4.19-pre7:
> > ./daniel.sh 35.12s user 65.96s system 363% cpu 27.814 total
> > ./daniel.sh 35.95s user 64.77s system 362% cpu 27.763 total
> > ./daniel.sh 34.99s user 66.46s system 364% cpu 27.861 total
> >
> > 2.4.19-pre7+rmap:
> > ./daniel.sh 36.20s user 106.80s system 363% cpu 39.316 total
> > ./daniel.sh 38.76s user 118.69s system 399% cpu 39.405 total
> > ./daniel.sh 35.47s user 106.90s system 364% cpu 39.062 total
> >
> > 2.4.19-pre7+rmap-13b+your patch:
> > ./daniel.sh 33.72s user 97.20s system 364% cpu 35.904 total
> > ./daniel.sh 35.18s user 94.48s system 363% cpu 35.690 total
> > ./daniel.sh 34.83s user 95.66s system 363% cpu 35.921 total
> >
> > The system time is pretty gross, isn't it?
> >
> > And it's disproportional to the increased number of lockings.
>
> These numbers show a 30% reduction in rmap overhead with my patch,
> close to what I originally reported:
>
> ((35.904 + 35.690 + 35.921) - (27.814 + 27.763 + 27.861)) /
> ((39.316 + 39.405 + 39.062) - (27.814 + 27.763 + 27.861)) ~= .70
>
> But they also show that rmap overhead is around 29% on your box,
> even with my patch:
>
> (35.904 + 35.690 + 35.921) / (27.814 + 27.763 + 27.861) ~= 1.29
>
> Granted, it's still way too high, and we are still in search of the
> 'dark cycles'.
I'd say that the rmap overhead remains 50%, actually. That's the
increase in system time.
> Did we do an apples-to-apples comparison of 2.4 to 2.5?
Seems 2.4 is a little faster - see the other email. Just another
hit on page->flags somewhere would be enough to make that difference.
Nothing very obvious stands out in the oprofiles.
2.5.26:
c011c7b0 255 0.820833 exit_notify
c0131d00 255 0.820833 lru_cache_add
c0117d48 257 0.827271 copy_mm
c012d078 271 0.872336 filemap_nopage
c0113dec 312 1.00431 pgd_alloc
c011415c 338 1.08801 do_page_fault
c014eb84 379 1.21998 __d_lookup
c0134050 385 1.2393 free_page_and_swap_cache
c0139a08 405 1.30368 do_page_cache_readahead
c0145e08 417 1.3423 link_path_walk
c0132f30 428 1.37771 __free_pages_ok
c012c3b8 582 1.87343 find_get_page
c01db08c 583 1.87665 radix_tree_lookup
c0128040 594 1.91206 clear_page_tables
c0107b58 650 2.09232 page_fault
c0113ea0 682 2.19533 pte_alloc_one
c01331c0 785 2.52688 rmqueue
c0129868 1146 3.68892 do_anonymous_page
c013383c 1485 4.78015 page_cache_release
c01284f0 1513 4.87028 zap_pte_range
c0129a04 1717 5.52694 do_no_page
c01282b0 1726 5.55591 copy_page_range
c0129124 6653 21.4157 do_wp_page
2.4.19-pre7:
c0140004 144 0.79929 free_page_and_swap_cache
c013bbc4 146 0.810391 kmem_cache_alloc
c011bf44 148 0.821492 copy_mm
c013c290 163 0.904751 kmem_cache_free
c01193fc 164 0.910302 do_schedule
c011c88c 168 0.932504 do_fork
c0155fe4 192 1.06572 link_path_walk
c013d6e0 211 1.17118 lru_cache_add
c01182d8 220 1.22114 do_page_fault
c0122158 226 1.25444 exit_notify
c012e5c4 252 1.39876 clear_page_tables
c013eee0 292 1.62078 __free_pages_ok
c01096cc 404 2.24245 page_fault
c013f298 409 2.2702 rmqueue
c0161438 440 2.44227 d_lookup
c0130c60 443 2.45893 pte_alloc
c0134318 634 3.51909 __find_get_page
c0130404 660 3.66341 do_anonymous_page
c013fa3c 728 4.04085 __free_pages
c012e960 972 5.3952 zap_page_range
c012e6d8 1031 5.72269 copy_page_range
c01306a0 1042 5.78375 do_no_page
c012f8a0 3940 21.8694 do_wp_page
Daniel Phillips wrote:
>
> On Saturday 03 August 2002 23:40, Andrew Morton wrote:
> > - total amount of CPU time lost spinning on locks is 1%, mainly
> > in page_add_rmap and zap_pte_range.
> >
> > That's not much spintime. The total system time with this test went
> > from 71 seconds (2.5.26) to 88 seconds (2.5.30). (4.5 seconds per CPU)
> > So all the time is presumably spent waiting on cachelines to come from
> > other CPUs, or from local L2.
>
> Have we tried this one:
>
> static inline unsigned rmap_lockno(pgoff_t index)
> {
> - return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 1);
> + return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 16);
> }
>
> (which puts all the rmap spinlocks in separate cache lines)
Seems a strange way of doing it? We'll only ever use four locks
this way.
2.4.19-pre7:
./daniel.sh 36.00s user 66.09s system 363% cpu 28.059 total
./daniel.sh 35.49s user 67.70s system 361% cpu 28.516 total
./daniel.sh 34.38s user 68.46s system 363% cpu 28.327 total
2.5.26
./daniel.sh 40.90s user 75.79s system 364% cpu 31.984 total
./daniel.sh 37.65s user 69.23s system 366% cpu 29.177 total
./daniel.sh 37.77s user 69.45s system 364% cpu 29.408 total
2.5.30
./daniel.sh 38.01s user 91.31s system 366% cpu 35.281 total
./daniel.sh 37.19s user 87.69s system 368% cpu 33.884 total
./daniel.sh 37.18s user 87.62s system 358% cpu 34.812 total
2.5.30+akpmpatchpile
./daniel.sh 36.71s user 85.73s system 363% cpu 33.722 total
./daniel.sh 35.60s user 83.86s system 358% cpu 33.303 total
./daniel.sh 36.56s user 86.26s system 368% cpu 33.346 total
2.5.30+akpmpatchpile+rmap-speedup:
./daniel.sh 36.22s user 84.09s system 361% cpu 33.237 total
./daniel.sh 40.46s user 93.11s system 376% cpu 35.461 total
./daniel.sh 39.29s user 91.79s system 359% cpu 36.441 total
2.5.30+akpmpatchpile+rmap-speedup+the above:
./daniel.sh 38.75s user 102.66s system 374% cpu 37.764 total
./daniel.sh 38.72s user 105.08s system 362% cpu 39.672 total
./daniel.sh 40.43s user 108.00s system 373% cpu 39.722 total
Which tends to indicate that I broke your patch somehow. It's at
http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/daniel-rmap-speedup.patch
and needs deep staring at.
On Sunday 04 August 2002 02:47, Andrew Morton wrote:
> Daniel Phillips wrote:
> >
> > On Saturday 03 August 2002 23:40, Andrew Morton wrote:
> > > - total amount of CPU time lost spinning on locks is 1%, mainly
> > > in page_add_rmap and zap_pte_range.
> > >
> > > That's not much spintime. The total system time with this test went
> > > from 71 seconds (2.5.26) to 88 seconds (2.5.30). (4.5 seconds per CPU)
> > > So all the time is presumably spent waiting on cachelines to come from
> > > other CPUs, or from local L2.
> >
> > Have we tried this one:
> >
> > static inline unsigned rmap_lockno(pgoff_t index)
> > {
> > - return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 1);
> > + return (index >> 4) & (ARRAY_SIZE(rmap_locks) - 16);
> > }
> >
> > (which puts all the rmap spinlocks in separate cache lines)
>
> Seems a strange way of doing it?
It is a strange way of doing it. I felt like being enigmatic at the time,
and no, nothing like that should ever go into production code; it would be
better suited to an IOCCC submission.
> We'll only ever use four locks this way.
Look again: 256 - 16 = 250 = 0xf0.
--
Daniel
Hi,
On Sun, 4 Aug 2002, Daniel Phillips wrote:
> Look again: 256 - 16 = 250 = 0xf0.
What's your maths?! 256d - 16d = 0xff - 0xf = 0xf0 = 240d!
Thunder
--
.-../../-./..-/-..- .-./..-/.-.././.../.-.-.-
On Sun, 4 Aug 2002, Thunder from the hill wrote:
> On Sun, 4 Aug 2002, Daniel Phillips wrote:
> > Look again: 256 - 16 = 250 = 0xf0.
>
> What's your maths?! 256d - 16d = 0xff - 0xf = 0xf0 = 240d!
I believe that was a typo, but the final answer was obviously correct. No
need to get into a hissy..
Zwane
--
function.linuxpower.ca
On Sun, 4 Aug 2002, Thunder from the hill wrote:
> Hi,
>
> On Sun, 4 Aug 2002, Daniel Phillips wrote:
> > Look again: 256 - 16 = 250 = 0xf0.
>
> What's your maths?! 256d - 16d = 0xff - 0xf = 0xf0 = 240d!
What's yours?! 256d - 16d = 0x100 - 0x10 = 0xf0 = 240d!
Couldn't resist... ;-)
/Tobias
OK, I warmed the code up a bit and did some more measurement.
Your locking patch has improved things significantly. And
we're now getting hurt by all the cache misses walking the
pte chains.
Here are some uniprocessor numbers:
up, 2.5.30+rmap-lock-speedup:
./daniel.sh 28.32s user 42.59s system 90% cpu 1:18.20 total
./daniel.sh 29.25s user 38.62s system 91% cpu 1:14.34 total
./daniel.sh 29.13s user 38.70s system 91% cpu 1:14.50 total
c01cdc88 149 0.965276 strnlen_user
c01341f4 181 1.17258 __page_add_rmap
c012d364 195 1.26328 rmqueue
c0147680 197 1.27624 __d_lookup
c010bb28 229 1.48354 timer_interrupt
c013f3b0 235 1.52242 link_path_walk
c01122cc 261 1.69085 do_page_fault
c0111fd0 291 1.8852 pte_alloc_one
c0124be4 292 1.89168 do_anonymous_page
c0123478 304 1.96942 clear_page_tables
c01236c8 369 2.39052 copy_page_range
c01078dc 520 3.36875 page_fault
c012b620 552 3.57606 kmem_cache_alloc
c0124d58 637 4.12672 do_no_page
c0123960 648 4.19798 zap_pte_range
c012b80c 686 4.44416 kmem_cache_free
c0134298 2077 13.4556 __page_remove_rmap
c0124540 2661 17.2389 do_wp_page
up, 2.5.26:
./daniel.sh 27.90s user 31.28s system 90% cpu 1:05.25 total
./daniel.sh 31.41s user 35.30s system 100% cpu 1:06.71 total
./daniel.sh 28.54s user 32.01s system 91% cpu 1:06.41 total
c0124f2c 167 1.21155 find_vma
c0131ea8 183 1.32763 do_page_cache_readahead
c012c07c 186 1.34939 rmqueue
c01c7dc8 192 1.39292 strnlen_user
c010ba78 210 1.52351 timer_interrupt
c0144c50 222 1.61056 __d_lookup
c01120b8 250 1.8137 do_page_fault
c013cc40 260 1.88624 link_path_walk
c0122cd0 282 2.04585 clear_page_tables
c0124128 337 2.44486 do_anonymous_page
c0122e7c 347 2.51741 copy_page_range
c0111e50 363 2.63349 pte_alloc_one
c01c94ac 429 3.1123 radix_tree_lookup
c01077cc 571 4.14248 page_fault
c0123070 620 4.49797 zap_pte_range
c0124280 715 5.18717 do_no_page
c0123afc 2957 21.4524 do_wp_page
So the pte_chain stuff seems to be costing 20% system time here.
But note that I made the do_page_cache_readahead and radix_tree_lookup
cost go away in 2.5.29. So it's more like 30%.
And it's all really in __page_remove_rmap, kmem_cache_alloc/free.
If we convert the pte_chain structure to
struct pte_chain {
struct pte_chain *next;
pte_t *ptes[(L1_CACHE_BYTES - sizeof(void *)) / sizeof(pte_t *)];
};
and take care to keep them compacted we shall reduce the overhead
of both __page_remove_rmap and the slab functions by up to 7, 15
or 31-fold, depending on the L1 size. page_referenced() wins as well.
Plus we almost halve the memory consumption of the pte_chains
in the high sharing case. And if we have to kmap these suckers
we reduce the frequency of that by 7x,15x,31x,etc.
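For concreteness, the add path might go something like this (a sketch of
the idea only; slot compaction, locking and the PageDirect case are
omitted, and NRPTE just names the array size from the struct above):

#define NRPTE ((L1_CACHE_BYTES - sizeof(void *)) / sizeof(pte_t *))

void page_add_rmap(struct page *page, pte_t *ptep)
{
	struct pte_chain *pc = page->pte_chain;
	int i;

	if (pc) {
		/* reuse a free slot in the head block if there is one */
		for (i = 0; i < NRPTE; i++) {
			if (!pc->ptes[i]) {
				pc->ptes[i] = ptep;
				return;
			}
		}
	}
	/* head block full (or absent): chain a new one in front */
	pc = pte_chain_alloc(page_zone(page));
	pc->ptes[0] = ptep;
	pc->next = page->pte_chain;
	page->pte_chain = pc;
}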
I'll code it tomorrow.
On Monday 05 August 2002 01:33, Andrew Morton wrote:
> OK, I warmed the code up a bit and did some more measurement.
> Your locking patch has improved things significantly. And
> we're now getting hurt by all the cache misses walking the
> pte chains.
Well that's a relief. I was beginning to believe that your 4-way has some
sort of built-in anti-optimization circuit.
Are you still seeing no improvement on four way smp?
> ...And it's all really in __page_remove_rmap, kmem_cache_alloc/free.
>
> If we convert the pte_chain structure to
>
> struct pte_chain {
> struct pte_chain *next;
> pte_t *ptes[(L1_CACHE_BYTES - sizeof(void *)) / sizeof(pte_t *)];
> };
>
> and take care to keep them compacted we shall reduce the overhead
> of both __page_remove_rmap and the slab functions by up to 7, 15
> or 31-fold, depending on the L1 size. page_referenced() wins as well.
>
> Plus we almost halve the memory consumption of the pte_chains
> in the high sharing case. And if we have to kmap these suckers
> we reduce the frequency of that by 7x,15x,31x,etc.
>
> I'll code it tomorrow.
Sounds good. There is still some tuning to be done on the rmap lock
batching, to distribute the locks better and set anon page->indexes more
intelligently. I expect this to be good for another percent or two, nothing
really exciting.
--
Daniel
Andrew Morton wrote:
>
> I'll code it tomorrow.
Tomorrow came early.
It makes basically no difference at all. Maybe it pulled back 10 of the lost
50%. The kernel is still spending much time in the pte_chain walk in
page_remove_rmap(), despite the fact that the number of pte_chain references
in page_add/remove_rmap now averages just two in that test. So I'm rather
stumped. It seems that adding any misses at all to the
copy_page_range/zap_page_range path hurts.
This patch _has_ to help, even if not on my box. Plus it's gorgeous ;) See
how it minimises internal fragmentation and cache misses at the same time.
Let's look at `make -j6 bzImage':
2.5.30+rmap-locking+rmap-speedup:
c0133444 99 0.597213 __pagevec_lru_add
c0134e10 99 0.597213 __alloc_pages
c0115d60 102 0.61531 do_schedule
c01218fc 116 0.699765 update_one_process
c01331f4 125 0.754057 __pagevec_release
c0121a34 133 0.802316 timer_bh
c012afa4 134 0.808349 sys_brk
c012bfd4 139 0.838511 do_brk
c0128980 155 0.93503 pte_alloc_map
c01350b4 158 0.953128 nr_free_pages
c013c4a0 164 0.989323 __page_add_rmap
c0107100 168 1.01345 system_call
c01346d0 182 1.09791 __free_pages_ok
c013c570 186 1.12204 page_add_rmap
c01df770 186 1.12204 __generic_copy_to_user
c012b88c 188 1.1341 find_vma
c013c5d4 199 1.20046 __page_remove_rmap
c012ae90 212 1.27888 vm_enough_memory
c0151cd4 214 1.29095 __d_lookup
c01351ac 227 1.36937 get_page_state
c012ab0c 230 1.38746 handle_mm_fault
c01126e4 247 1.49002 smp_apic_timer_interrupt
c0148ce8 282 1.70115 link_path_walk
c0128ed0 291 1.75544 zap_pte_range
c0115a48 338 2.03897 scheduler_tick
c012a7f0 349 2.10533 do_no_page
c0107c90 368 2.21994 page_fault
c01142b4 388 2.34059 do_page_fault
c0129d98 389 2.34662 do_wp_page
c010c674 515 3.10671 timer_interrupt
c01349bc 547 3.29975 rmqueue
c01df7bc 711 4.28908 __generic_copy_from_user
c012da60 1045 6.30392 file_read_actor
c012a5fc 3343 20.1665 do_anonymous_page
2.5.26:
c0128160 92 0.594853 pte_alloc_map
c0106fca 96 0.620716 restore_all
c01d98d8 104 0.672443 strnlen_user
c0113118 105 0.678909 set_ioapic_affinity
c012a048 108 0.698306 sys_brk
c01213cc 110 0.711238 update_one_process
c012c3b8 121 0.782361 find_get_page
c012af50 131 0.847019 do_brk
c01338dc 133 0.859951 nr_free_pages
c0131d00 145 0.93754 lru_cache_add
c0139a08 147 0.950472 do_page_cache_readahead
c0129c8c 151 0.976335 handle_mm_fault
c0106f88 164 1.06039 system_call
c01d9730 169 1.09272 __generic_copy_to_user
c01db08c 178 1.15091 radix_tree_lookup
c0132f30 190 1.2285 __free_pages_ok
c012a8ac 194 1.25436 find_vma
c01284f0 226 1.46127 zap_pte_range
c014eb84 234 1.513 __d_lookup
c01339b4 237 1.53239 get_page_state
c011268c 245 1.58412 smp_apic_timer_interrupt
c011415c 246 1.59059 do_page_fault
c0145e08 298 1.92681 link_path_walk
c0107b58 319 2.06259 page_fault
c01157d4 346 2.23717 scheduler_tick
c0129a04 346 2.23717 do_no_page
c0129124 378 2.44407 do_wp_page
c010c7f4 529 3.42041 timer_interrupt
c01331c0 638 4.12518 rmqueue
c01d977c 682 4.40967 __generic_copy_from_user
c012c964 988 6.38821 file_read_actor
c0129868 3179 20.5548 do_anonymous_page
Not much stands out there, except that the increase in HZ hurts
more than rmap.
2.5.26:
make -j6 bzImage 395.35s user 31.21s system 377% cpu 1:52.94 total
2.5.30:
make -j6 bzImage 395.55s user 33.00s system 375% cpu 1:54.19 total
2.5.30+everything
make -j6 bzImage 395.76s user 32.97s system 382% cpu 1:52.00 total
2.4.19
make -j6 bzImage 397.74s user 28.27s system 373% cpu 1:53.91 total
2.5.30+everything+HZ=100
make -j6 bzImage 393.60s user 28.91s system 373% cpu 1:53.06 total
Should we count PageDirect rmaps in /proc/meminfo:ReverseMaps?
I chose not to, so we can compare that number with the slabinfo
for pte_chains and see how much memory is being wasted. Plus the
PageDirect rmaps aren't very interesting, because they don't consume
any resources.
rmap.c | 233 ++++++++++++++++++++++++++++++++++++++++++++---------------------
1 files changed, 161 insertions(+), 72 deletions(-)
--- 2.5.30/mm/rmap.c~rmap-speedup Sun Aug 4 23:23:51 2002
+++ 2.5.30-akpm/mm/rmap.c Sun Aug 4 23:58:27 2002
@@ -41,24 +41,39 @@
* here, the page struct for the page table page contains the process
* it belongs to and the offset within that process.
*
- * A singly linked list should be fine for most, if not all, workloads.
- * On fork-after-exec the mapping we'll be removing will still be near
- * the start of the list, on mixed application systems the short-lived
- * processes will have their mappings near the start of the list and
- * in systems with long-lived applications the relative overhead of
- * exit() will be lower since the applications are long-lived.
+ * We use an array of pte pointers in this structure to minimise cache misses
+ * while traversing reverse maps.
*/
+#define NRPTE (L1_CACHE_BYTES/sizeof(void *) - 1)
+
struct pte_chain {
struct pte_chain * next;
- pte_t * ptep;
+ pte_t *ptes[NRPTE];
};
spinlock_t rmap_locks[NUM_RMAP_LOCKS];
static kmem_cache_t *pte_chain_cache;
static inline struct pte_chain * pte_chain_alloc(void);
-static inline void pte_chain_free(struct pte_chain *, struct pte_chain *,
- struct page *);
+static void pte_chain_free(struct pte_chain *pte_chain);
+
+/*
+ * pte_chain list management policy:
+ *
+ * - If a page has a pte_chain list then it is shared by at least two processes,
+ * because a single sharing uses PageDirect. (Well, this isn't true yet,
+ * coz this code doesn't collapse singletons back to PageDirect on the remove
+ * path).
+ * - A pte_chain list has free space only in the head member - all succeeding
+ * members are 100% full.
+ * - If the head element has free space, it occurs in its leading slots.
+ * - All free space in the pte_chain is at the start of the head member.
+ * - Insertion into the pte_chain puts a pte pointer in the last free slot of
+ * the head member.
+ * - Removal from a pte chain moves the head pte of the head member onto the
+ * victim pte and frees the head member if it became empty.
+ */
+
/**
* page_referenced - test if the page was referenced
@@ -82,8 +97,13 @@ int page_referenced(struct page * page)
} else {
/* Check all the page tables mapping this page. */
for (pc = page->pte.chain; pc; pc = pc->next) {
- if (ptep_test_and_clear_young(pc->ptep))
- referenced++;
+ int i;
+
+ for (i = 0; i < NRPTE; i++) {
+ pte_t *p = pc->ptes[i];
+ if (p && ptep_test_and_clear_young(p))
+ referenced++;
+ }
}
}
return referenced;
@@ -100,6 +120,7 @@ int page_referenced(struct page * page)
void __page_add_rmap(struct page *page, pte_t *ptep)
{
struct pte_chain * pte_chain;
+ int i;
#ifdef DEBUG_RMAP
if (!page || !ptep)
@@ -121,32 +142,58 @@ void __page_add_rmap(struct page *page,
BUG();
} else {
for (pc = page->pte.chain; pc; pc = pc->next) {
- if (pc->ptep == ptep)
- BUG();
+ for (i = 0; i < NRPTE; i++) {
+ pte_t *p = pc->ptes[i];
+
+ if (p && p == ptep)
+ BUG();
+ }
}
}
}
#endif
+ if (page->pte.chain == NULL) {
+ page->pte.direct = ptep;
+ SetPageDirect(page);
+ goto out;
+ }
+
if (PageDirect(page)) {
/* Convert a direct pointer into a pte_chain */
- pte_chain = pte_chain_alloc();
- pte_chain->ptep = page->pte.direct;
- pte_chain->next = NULL;
- page->pte.chain = pte_chain;
ClearPageDirect(page);
- }
- if (page->pte.chain) {
- /* Hook up the pte_chain to the page. */
pte_chain = pte_chain_alloc();
- pte_chain->ptep = ptep;
- pte_chain->next = page->pte.chain;
+ pte_chain->ptes[NRPTE-1] = page->pte.direct;
+ pte_chain->ptes[NRPTE-2] = ptep;
+ mod_page_state(nr_reverse_maps, 2);
page->pte.chain = pte_chain;
- } else {
- page->pte.direct = ptep;
- SetPageDirect(page);
+ goto out;
+ }
+
+ pte_chain = page->pte.chain;
+ if (pte_chain->ptes[0]) { /* It's full */
+ struct pte_chain *new;
+
+ new = pte_chain_alloc();
+ new->next = pte_chain;
+ page->pte.chain = new;
+ new->ptes[NRPTE-1] = ptep;
+ inc_page_state(nr_reverse_maps);
+ goto out;
}
- inc_page_state(nr_reverse_maps);
+
+ BUG_ON(pte_chain->ptes[NRPTE-1] == NULL);
+
+ for (i = NRPTE-2; i >= 0; i--) {
+ if (pte_chain->ptes[i] == NULL) {
+ pte_chain->ptes[i] = ptep;
+ inc_page_state(nr_reverse_maps);
+ goto out;
+ }
+ }
+ BUG();
+
+out:
}
void page_add_rmap(struct page *page, pte_t *ptep)
@@ -172,7 +219,7 @@ void page_add_rmap(struct page *page, pt
*/
void __page_remove_rmap(struct page *page, pte_t *ptep)
{
- struct pte_chain * pc, * prev_pc = NULL;
+ struct pte_chain *pc;
if (!page || !ptep)
BUG();
@@ -186,15 +233,32 @@ void __page_remove_rmap(struct page *pag
goto out;
}
} else {
- for (pc = page->pte.chain; pc; prev_pc = pc, pc = pc->next) {
- if (pc->ptep == ptep) {
- pte_chain_free(pc, prev_pc, page);
- /* Check whether we can convert to direct */
- pc = page->pte.chain;
- if (!pc->next) {
- page->pte.direct = pc->ptep;
- SetPageDirect(page);
- pte_chain_free(pc, NULL, NULL);
+ struct pte_chain *start = page->pte.chain;
+ int victim_i = -1;
+
+ for (pc = start; pc; pc = pc->next) {
+ int i;
+
+ if (pc->next)
+ prefetch(pc->next);
+ for (i = 0; i < NRPTE; i++) {
+ pte_t *p = pc->ptes[i];
+
+ if (!p)
+ continue;
+ if (victim_i == -1)
+ victim_i = i;
+ if (p != ptep)
+ continue;
+ pc->ptes[i] = start->ptes[victim_i];
+ start->ptes[victim_i] = NULL;
+ dec_page_state(nr_reverse_maps);
+ if (victim_i == NRPTE-1) {
+ /* Emptied a pte_chain */
+ page->pte.chain = start->next;
+ pte_chain_free(start);
+ } else {
+ /* Do singleton->PageDirect here */
}
goto out;
}
@@ -213,9 +277,9 @@ void __page_remove_rmap(struct page *pag
printk("\n");
printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n");
#endif
+ return;
out:
- dec_page_state(nr_reverse_maps);
return;
}
@@ -317,8 +381,9 @@ out_unlock:
*/
int try_to_unmap(struct page * page)
{
- struct pte_chain * pc, * next_pc, * prev_pc = NULL;
+ struct pte_chain *pc, *next_pc, *start;
int ret = SWAP_SUCCESS;
+ int victim_i = -1;
/* This page should not be on the pageout lists. */
if (PageReserved(page))
@@ -335,35 +400,57 @@ int try_to_unmap(struct page * page)
page->pte.direct = NULL;
ClearPageDirect(page);
}
- } else {
- for (pc = page->pte.chain; pc; pc = next_pc) {
- next_pc = pc->next;
- switch (try_to_unmap_one(page, pc->ptep)) {
- case SWAP_SUCCESS:
- /* Free the pte_chain struct. */
- pte_chain_free(pc, prev_pc, page);
- break;
- case SWAP_AGAIN:
- /* Skip this pte, remembering status. */
- prev_pc = pc;
- ret = SWAP_AGAIN;
- continue;
- case SWAP_FAIL:
- ret = SWAP_FAIL;
- break;
- case SWAP_ERROR:
- ret = SWAP_ERROR;
- break;
+ goto out;
+ }
+
+ start = page->pte.chain;
+ for (pc = start; pc; pc = next_pc) {
+ int i;
+
+ next_pc = pc->next;
+ if (next_pc)
+ prefetch(next_pc);
+ for (i = 0; i < NRPTE; i++) {
+ pte_t *p = pc->ptes[i];
+
+ if (!p)
+ continue;
+ if (victim_i == -1)
+ victim_i = i;
+
+ switch (try_to_unmap_one(page, p)) {
+ case SWAP_SUCCESS:
+ /*
+ * Release a slot. If we're releasing the
+ * first pte in the first pte_chain then
+ * pc->ptes[i] and start->ptes[victim_i] both
+ * refer to the same thing. It works out.
+ */
+ pc->ptes[i] = start->ptes[victim_i];
+ start->ptes[victim_i] = NULL;
+ dec_page_state(nr_reverse_maps);
+ victim_i++;
+ if (victim_i == NRPTE) {
+ page->pte.chain = start->next;
+ pte_chain_free(start);
+ start = page->pte.chain;
+ victim_i = 0;
+ }
+ break;
+ case SWAP_AGAIN:
+ /* Skip this pte, remembering status. */
+ ret = SWAP_AGAIN;
+ continue;
+ case SWAP_FAIL:
+ ret = SWAP_FAIL;
+ goto out;
+ case SWAP_ERROR:
+ ret = SWAP_ERROR;
+ goto out;
}
}
- /* Check whether we can convert to direct pte pointer */
- pc = page->pte.chain;
- if (pc && !pc->next) {
- page->pte.direct = pc->ptep;
- SetPageDirect(page);
- pte_chain_free(pc, NULL, NULL);
- }
}
+out:
return ret;
}
@@ -384,14 +471,9 @@ int try_to_unmap(struct page * page)
* called for new pte_chain structures which aren't on any list yet.
* Caller needs to hold the rmap_lock if the page is non-NULL.
*/
-static inline void pte_chain_free(struct pte_chain * pte_chain,
- struct pte_chain * prev_pte_chain, struct page * page)
+static void pte_chain_free(struct pte_chain *pte_chain)
{
- if (prev_pte_chain)
- prev_pte_chain->next = pte_chain->next;
- else if (page)
- page->pte.chain = pte_chain->next;
-
+ pte_chain->next = NULL;
kmem_cache_free(pte_chain_cache, pte_chain);
}
@@ -406,6 +488,13 @@ static inline struct pte_chain *pte_chai
return kmem_cache_alloc(pte_chain_cache, GFP_ATOMIC);
}
+static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
+{
+ struct pte_chain *pc = p;
+
+ memset(pc, 0, sizeof(*pc));
+}
+
void __init pte_chain_init(void)
{
int i;
@@ -417,7 +506,7 @@ void __init pte_chain_init(void)
sizeof(struct pte_chain),
0,
0,
- NULL,
+ pte_chain_ctor,
NULL);
if (!pte_chain_cache)
.
On Monday 05 August 2002 09:05, Andrew Morton wrote:
> Andrew Morton wrote:
> It makes basically no difference at all. Maybe pulled back 10 of the lost
> 50%. The kernel is still spending much time in the pte_chain walk in
> page_remove_rmap().
I wouldn't call that 'no difference', just not as much as hoped for.
Well, I'm not quite out of tricks yet. There's one more to go, and it just
might be worth more than all the others. It's a little subtle though, and
I'm still working out some wrinkles, so I'll write it up in detail tomorrow.
For now, consider that our rmaps tend to form a lot of parallel chains. That
is, if you look at the chains for two pages whose index differs by one, the
pte fields of each of the pte_chain nodes differs by 4. This relationship
tends to hold quite consistently over rather large groups of pages, a fact
that I took advantage of in the lock batching. Now, if we put a relative
number in the pte field instead of an absolute pointer, the corresponding
pte_chain nodes for all those parallel chains become identical. How nice it
would be if we could share those pte chain nodes between pages.
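A minimal sketch of that relative encoding (rpte_t, pte_to_rpte and rpte_to_pte
are made-up names, not from any posted patch): store the pte pointer minus the
page index scaled by sizeof(pte_t), so a run of pages mapped by consecutive ptes
all store the same value and could, in principle, point at one shared chain node:
typedef unsigned long rpte_t;	/* hypothetical relative-pte representation */
static inline rpte_t pte_to_rpte(struct page *page, pte_t *ptep)
{
	/* adjacent pages have indexes differing by 1 and ptes differing by
	 * sizeof(pte_t), so the two offsets cancel out across the run */
	return (unsigned long)ptep - page->index * sizeof(pte_t);
}
static inline pte_t *rpte_to_pte(struct page *page, rpte_t rpte)
{
	return (pte_t *)(rpte + page->index * sizeof(pte_t));
}
The CoW unsharing and index-change cases mentioned below are exactly where the
stored values stop matching and a shared node would have to be split.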
I've convinced myself that this is possible. It's a little tricky though: we
have to handle CoW unsharing, and the possibility of the index changing (as
with the lock batching, but in this case there's a little more work to do).
My feeling is that the implementation will not be particularly messy, even
though it sounds scary on first blush.
> Despite the fact that the number of pte_chain references in
> page_add/remove_rmap now just averages two in that test.
It's weird that it only averages two. It's a four way and you're running 10 in
parallel, plus a process to watch for completion, right?
--
Daniel
On Mon, 5 Aug 2002, Daniel Phillips wrote:
> > Despite the fact that the number of pte_chain references in
> > page_add/remove_rmap now just averages two in that test.
>
> It's weird that it only averages two. It's a four way and you're running
> 10 in parallel, plus a process to watch for completion, right?
I explained this one in the comment above the declaration of
struct pte_chain ;)
* A singly linked list should be fine for most, if not all, workloads.
* On fork-after-exec the mapping we'll be removing will still be near
* the start of the list, on mixed application systems the short-lived
* processes will have their mappings near the start of the list and
* in systems with long-lived applications the relative overhead of
* exit() will be lower since the applications are long-lived.
cheers,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
Rik van Riel wrote:
>
> On Mon, 5 Aug 2002, Daniel Phillips wrote:
>
> > > Despite the fact that the number of pte_chain references in
> > > page_add/remove_rmap now just averages two in that test.
> >
> > It's weird that it only averages two. It's a four way and you're running
> > 10 in parallel, plus a process to watch for completion, right?
>
> I explained this one in the comment above the declaration of
> struct pte_chain ;)
>
> * A singly linked list should be fine for most, if not all, workloads.
> * On fork-after-exec the mapping we'll be removing will still be near
> * the start of the list, on mixed application systems the short-lived
> * processes will have their mappings near the start of the list and
> * in systems with long-lived applications the relative overhead of
> * exit() will be lower since the applications are long-lived.
I don't think so - the list walks in there are fairly long.
What seems to be happening is that, as Daniel mentioned,
all the pte_chains for page N happen to have good locality
with the pte_chains for page N+1. Like parallel lines.
That might not hold up for longer-lived processes, slab cache
fragmentation, longer chains, etc...
On Monday 05 August 2002 09:05, Andrew Morton wrote:
> Tomorrow came early.
>
> It makes basically no difference at all. Maybe pulled back 10 of the lost
> 50%. The kernel is still spending much time in the pte_chain walk in
> page_remove_rmap().
>
> Despite the fact that the number of pte_chain references in page_add/remove_rmap
> now just averages two in that test. So I'm rather stumped. It seems that
> adding any misses at all to the copy_page_range/zap_page_range path hurts.
>
> This patch _has_ to help, even if not on my box. Plus it's gorgeous ;) See
> how it minimises internal fragmentation and cache misses at the same time.
It is very nice, and short. It would be a shame if it didn't actually
accomplish anything. Have you got any more data on that?
> Let's look at `make -j6 bzImage':
>
> [...]
>
> Not much stands out there, except that the increase in HZ hurts
> more than rmap.
What stands out for me is that rmap is now apparently at parity with
(virtual scanning) 2.4.19 for a real application, i.e., parallel make.
I'm sure we'll still see the raw setup/teardown overhead if we make a
point of going looking for it, but it would be weird if we didn't.
So can we tentatively declare victory on the execution overhead issue?
This is looking very good:
> 2.5.26:
> make -j6 bzImage 395.35s user 31.21s system 377% cpu 1:52.94 total
> 2.5.30:
> make -j6 bzImage 395.55s user 33.00s system 375% cpu 1:54.19 total
> 2.5.30+everything
> make -j6 bzImage 395.76s user 32.97s system 382% cpu 1:52.00 total
> 2.4.19
> make -j6 bzImage 397.74s user 28.27s system 373% cpu 1:53.91 total
> 2.5.30+everything+HZ=100
> make -j6 bzImage 393.60s user 28.91s system 373% cpu 1:53.06 total
There's still the memory consumption issue, and indeed, we can
construct a case where pte chain overhead is twice that of page tables
themselves, so I have to apologize to Martin Bligh for claiming a few
weeks ago that that can't happen. Vectoring up the pte chain nodes as
you do here doesn't help much because the internal fragmentation
roughly equals the reduction in link fields.
So coalescing is the next big evil problem to tackle. It occurs to me
that defragging pte chain node pages is actually easy and local. A
reasonable algorithm goes something like this:
Take the pte chain lock
For each allocated pte chain node on the page
Pick any referenced pte on it, get the reference page and take
the rmap lock
If the page's rmap list doesn't include this pte chain node
any more, repeat the above steps until it does
Copy the contents of the pte chain node onto a newly allocated
pte chain node, and substitute the new node for the old one
Free the old pte chain node
Release the pte chain lock
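In code, that loop might look roughly like the sketch below; chain_contains()
and substitute_chain_node() are hypothetical helpers (recheck membership,
rewrite the link that points at the old node), and the outer per-slab-page
locking and retry step are elided:
static int relocate_pte_chain_node(struct pte_chain *old)
{
	struct pte_chain *new;
	struct page *page;
	spinlock_t *lock;
	int i;
	/* pick any pte the node still references and find the mapped page */
	for (i = 0; i < NRPTE; i++)
		if (old->ptes[i])
			break;
	if (i == NRPTE)
		return 0;			/* node already empty */
	page = pte_page(*old->ptes[i]);
	lock = lock_rmap(page);			/* serializes add/remove_rmap */
	if (!chain_contains(page, old)) {	/* hypothetical membership check */
		spin_unlock(lock);
		return -EAGAIN;			/* raced; caller picks another pte */
	}
	new = pte_chain_alloc();
	memcpy(new, old, sizeof(*new));		/* copy the slots and next link */
	substitute_chain_node(page, old, new);	/* hypothetical relink */
	spin_unlock(lock);
	pte_chain_free(old);
	return 0;
}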
It's actually more important to be able to coalesce pte chain nodes as
needed than it is to save space, since for the average case we should
be using mostly direct pointers anyway.
Anyway, I think we are within sight of running out of serious issues
and now is the time to start testing the thing that rmap is supposed
to improve, namely swap performance.
Rmap is clearly going to help a lot with swapout on heavily shared
pages by giving us active unmapping, so we don't have to leave large
numbers of pages sitting around half in memory and half in swap. But
rmap doesn't help a bit with that same nasty behaviour on swapin; we
still have to wait patiently for every sharer to get around to
faulting in the page, and in my mind, that makes it very easy to
construct a test case that turns all of swap into a useless appendage,
full of half swapped-in pages. In other words, swapping is going to
continue to suck until we do something about that, and now is a good
time to start kicking ideas around.
Now is also the time to think about making active defragmentation
happen. In my mind, active defragmentation is the only way that
any of the large page patches are going to be halfway useful instead
of just a great source of deadly corner cases.
I'll propose an approach to get the ball rolling. We have two new
lru lists, a GFP_Defrag allocation flag and a new ZONE_LARGE. A page
allocation with GFP_Defrag means the caller promises not to pin the
unit for long periods, and the (n order) unit is allocated
preferentially in ZONE_LARGE. This zone has two lrus: one has
complete LARGE_PAGE_ORDER allocation units on it, the other has
various allocation orders mixed together, but the lru list is still
organized in LARGE_PAGE_ORDER units. So the second lru list will
naturally organize itself with good defragmentation candidates
towards the cold end of the list. When we detect a deficit of
large pages, we start from the cold end of the list and proceed to
unmap and copy in-use pages off somewhere else (glossing over the
exact definition of 'somewhere else' for the moment). The condition
for being able to move a page is exactly the same as being able to
unmap it, and naturally, the unmapping attempt may fail because
somebody has taken a temporary use count on the page. We hope
that that doesn't happen too often, and punting such a unit to the
hot end of the list might well be the right thing to do.
In short, we will always be able to create a good supply of
LARGE_PAGE_ORDER pages in ZONE_LARGE, unless somebody is lying to
us about pinning pages. Um, memlock, we still have to worry about
that, but that's an orthogonal issue.
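To make the shape of that concrete, a very rough sketch of the cold-end scan
(everything here is hypothetical: ZONE_LARGE's mixed-unit lru, struct alloc_unit,
try_to_relocate_page(); none of it exists in any posted patch, and a real version
would bound the scan instead of looping until it succeeds):
static void refill_large_pages(struct zone *zone_large, int want)
{
	struct list_head *lru = &zone_large->mixed_unit_lru;	/* hypothetical */
	while (want > 0 && !list_empty(lru)) {
		/* the lru is kept in LARGE_PAGE_ORDER-sized units; take the coldest */
		struct alloc_unit *unit = list_entry(lru->prev, struct alloc_unit, lru);
		int failed = 0;
		int i;
		for (i = 0; i < (1 << LARGE_PAGE_ORDER); i++) {
			struct page *page = unit->first_page + i;
			if (!page_count(page))
				continue;		/* already free */
			/* same condition as being able to unmap it */
			if (try_to_relocate_page(page))	/* unmap + copy elsewhere */
				failed = 1;
		}
		if (failed)
			list_move(&unit->lru, lru);	/* punt to the hot end */
		else {
			list_del(&unit->lru);
			want--;
		}
	}
}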
> Should we count PageDirect rmaps in /proc/meminfo:ReverseMaps?
> I chose not to, so we can compare that number with the slabinfo
> for pte_chains and see how much memory is being wasted. Plus the
> PageDirect rmaps aren't very interesting, because they don't consume
> any resources.
Agreed. It would be nice to have a stat for total ptes mapped, and we
can see how total pte chain nodes compares to that. These two stats
will allow users to form an accurate impression of just how much memory
is being eaten by pte chains.
As I mentioned earlier, I do have one more major optimization waiting in
the wings if we need it, which allows pte chain nodes to be shared. But
since you pointed out so delicately that there are other projects needing
attention, I'd be just as happy to put that one away until we're sure
it's needed, or until somebody has too much time on their hands.
> [...]
>
> + * - If a page has a pte_chain list then it is shared by at least two processes,
> + * because a single sharing uses PageDirect. (Well, this isn't true yet,
> + * coz this code doesn't collapse singletons back to PageDirect on the remove
> + * path).
That's not necessarily such a bad thing. Consider what happens with bash:
first a bunch of do_no_page events cause the direct links to be created as
bash faults in the various mmaps it uses, then it forks to exec a command,
causing all the direct links to turn into pte chain nodes, then the command
execs. If you collapse all the pte chain nodes back to direct links at
this point they're just going to be created again on the next fork/exec,
in other words, lots of extra churning.
So I'd suggest just forgetting about the collapse-to-direct feature. With
pte chain coalescing we'll eventually get the memory back.
> [...]
>
> + * - Insertion into the pte_chain puts a pte pointer in the last free slot of
> + * the head member.
I don't know why you decided to fill in from top to bottom but I suppose it
scarcely makes a difference. You might want to run this one backwards and
take an early exit on null:
> + for (i = 0; i < NRPTE; i++) {
> + pte_t *p = pc->ptes[i];
> + if (p && ptep_test_and_clear_young(p))
> + referenced++;
> + }
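The backwards variant being suggested would be something like this (a sketch; it
relies on the free-slots-at-the-front policy quoted earlier, and since only the
head node can have empty slots, the early exit is safe for every node):
	for (i = NRPTE-1; i >= 0; i--) {
		pte_t *p = pc->ptes[i];

		if (!p)
			break;	/* all remaining (leading) slots are free too */
		if (ptep_test_and_clear_young(p))
			referenced++;
	}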
>
> [...]
>
> + * - Removal from a pte chain moves the head pte of the head member onto the
> + * victim pte and frees the head member if it became empty.
Thus randomizing the order and losing the pushdown effect, oh well.
--
Daniel
Daniel Phillips wrote:
>
> ...
> > This patch _has_ to help, even if not on my box. Plus it's gorgeous ;) See
> > how it minimises internal fragmentation and cache misses at the same time.
>
> It is very nice, and short. It would be a shame if it didn't actually
> accomplish anything. Have you got any more data on that?
Nope. Just warm and fuzzies.
> What stands out for me is that rmap is now apparently at parity with
> (virtual scanning) 2.4.19 for a real application, i.e., parallel make.
> I'm sure we'll still see the raw setup/teardown overhead if we make a
> point of going looking for it, but it would be weird if we didn't.
>
> So can we tentatively declare victory on the execution overhead issue?
err, no. It's still adding half a millisecond or so to each fork/exec/exit
cycle. And that is arising from, apparently, an extra two cache misses
per page. Doubt if we can take this much further.
> ...
> Vectoring up the pte chain nodes as
> you do here doesn't help much because the internal fragmentation
> roughly equals the reduction in link fields.
Are you sure about that? The vectoring is only a loss for very low
sharing levels, at which the space consumption isn't a problem anyway.
At high levels of sharing it's almost a halving.
> So coalescing is the next big evil problem to tackle. It occurs to me
> that defragging pte chain node pages is actually easy and local.
If we are forced to actively defragment pte_chain pages then
it's all just getting too much, surely.
> ...
>
> Anyway, I think we are within sight of running out of serious issues
> and now is the time to start testing the thing that rmap is supposed
> to improve, namely swap performance.
>
> Rmap is clearly going to help a lot with swapout on heavily shared
> pages by giving us active unmapping, so we don't have to leave large
> numbers of pages sitting around half in memory and half in swap. But
> rmap doesn't help a bit with that same nasty behaviour on swapin; we
> still have to wait patiently for every sharer to get around to
> faulting in the page, and in my mind, that makes it very easy to
> construct a test case that turns all of swap into a useless appendage,
> full of half swapped-in pages.
Is it useful to instantiate the swapped-in page into everyone's
pagetables, save some minor faults?
> > Should we count PageDirect rmaps in /proc/meminfo:ReverseMaps?
> > I chose not to, so we can compare that number with the slabinfo
> > for pte_chains and see how much memory is being wasted. Plus the
> > PageDirect rmaps aren't very interesting, because they don't consume
> > any resources.
>
> Agreed. It would be nice to have a stat for total ptes mapped, and we
> can see how total pte chain nodes compares to that. These two stats
> will allow users to form an accurate impression of just how much memory
> is being eaten by pte chains.
This can be determined from a bit of math on
/proc/meminfo:ReverseMaps and /proc/slabinfo:pte_chains
> ...
> > [...]
> >
> > + * - If a page has a pte_chain list then it is shared by at least two processes,
> > + * because a single sharing uses PageDirect. (Well, this isn't true yet,
> > + * coz this code doesn't collapse singletons back to PageDirect on the remove
> > + * path).
>
> That's not necessarily such a bad thing. Consider what happens with bash:
> first a bunch of do_no_page events cause the direct links to be created as
> bash faults in the various mmaps it uses, then it forks to exec a command,
> causing all the direct links to turn into pte chain nodes, then the command
> execs. If you collapse all the pte chain nodes back to direct links at
> this point they're just going to be created again on the next fork/exec,
> in other words, lots of extra churning.
Yes.
> So I'd suggest just forgetting about the collapse-to-direct feature. With
> pte chain coalescing we'll eventually get the memory back.
I implemented Rik's suggestion. The collapse back to direct representation
happens in page_referenced(). So it happens only in response to page reclaim
activity, when the system needs memory. Nice, yes?
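For reference, a guess at the shape of that collapse at the tail of
page_referenced(), using the names from the patch earlier in the thread (assumes
NRPTE >= 2; this is a sketch, not the actual change):
	if (!PageDirect(page)) {
		struct pte_chain *pc = page->pte.chain;

		/* a singleton: one node whose only pte sits in its last slot */
		if (pc && !pc->next && pc->ptes[NRPTE-1] && !pc->ptes[NRPTE-2]) {
			page->pte.direct = pc->ptes[NRPTE-1];
			SetPageDirect(page);
			pte_chain_free(pc);
			dec_page_state(nr_reverse_maps); /* direct maps aren't counted */
		}
	}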
> > [...]
> >
> > + * - Insertion into the pte_chain puts a pte pointer in the last free slot of
> > + * the head member.
>
> I don't know why you decided to fill in from top to bottom but I suppose it
> scarcely makes a difference. You might want to run this one backwards and
> take an early exit on null:
>
> > + for (i = 0; i < NRPTE; i++) {
> > + pte_t *p = pc->ptes[i];
> > + if (p && ptep_test_and_clear_young(p))
> > + referenced++;
> > + }
Thanks, did that.
> >
> > [...]
> >
> > + * - Removal from a pte chain moves the head pte of the head member onto the
> > + * victim pte and frees the head member if it became empty.
>
> Thus randomizing the order and losing the pushdown effect, oh well.
>
Only a little, I expect. If the access pattern is mostly LIFO then we'll
still get good locality at the head of the chain.
On Wednesday 07 August 2002 21:40, Andrew Morton wrote:
> Daniel Phillips wrote:
> > What stands out for me is that rmap is now apparently at parity with
> > (virtual scanning) 2.4.19 for a real application, i.e., parallel make.
> > I'm sure we'll still see the raw setup/teardown overhead if we make a
> > point of going looking for it, but it would be weird if we didn't.
> >
> > So can we tentatively declare victory on the execution overhead issue?
>
> err, no. It's still adding half a millisecond or so to each fork/exec/exit
> cycle. And that is arising from, apparently, an extra two cache misses
> per page. Doubt if we can take this much further.
But that overhead did not show up in the kernel build timings you posted,
which do a realistic amount of forking. So what is the worry, exactly?
> > ...
> > Vectoring up the pte chain nodes as
> > you do here doesn't help much because the internal fragmentation
> > roughly equals the reduction in link fields.
>
> Are you sure about that? The vectoring is only a loss for very low
> sharing levels, at which the space consumption isn't a problem anyway.
> At high levels of sharing it's almost a halving.
Your vector will only be half full on average.
> > So coalescing is the next big evil problem to tackle. It occurs to me
> > that defragging pte chain node pages is actually easy and local.
>
> If we are forced to actively defragment pte_chain pages then
> it's all just getting too much, surely.
The algorithm I showed is pretty simple. Active defragmentation is just
something we should be doing, where we can. Fragmentation does happen,
and there are too many places where we're just trying to pretend that it
doesn't, or fiddling with allocation policy in the hopes it will magically
go away, which it never does.
> > Rmap is clearly going to help a lot with swapout on heavily shared
> > pages by giving us active unmapping, so we don't have to leave large
> > numbers of pages sitting around half in memory and half in swap. But
> > rmap doesn't help a bit with that same nasty behaviour on swapin; we
> > still have to wait patiently for every sharer to get around to
> > faulting in the page, and in my mind, that makes it very easy to
> > construct a test case that turns all of swap into a useless appendage,
> > full of half swapped-in pages.
>
> Is it useful to instantiate the swapped-in page into everyone's
> pagetables, save some minor faults?
That's what I was thinking, then we just have to figure out how to find
all those swapped-out ptes efficiently.
> > So I'd suggest just forgetting about the collapse-to-direct feature. With
> > pte chain coalescing we'll eventually get the memory back.
>
> I implemented Rik's suggestion. The collapse back to direct representation
> happens in page_referenced(). So it happens only in response to page reclaim
> activity, when the system needs memory. Nice, yes?
Yes, nice.
--
Daniel
On Wednesday 07 August 2002 22:17, Daniel Phillips wrote:
> On Wednesday 07 August 2002 21:40, Andrew Morton wrote:
> > > Vectoring up the pte chain nodes as
> > > you do here doesn't help much because the internal fragmentation
> > > roughly equals the reduction in link fields.
> >
> > Are you sure about that? The vectoring is only a loss for very low
> > sharing levels, at which the space consumption isn't a problem anyway.
> > At high levels of sharing it's almost a halving.
>
> Your vector will only be half full on average.
Ah, the internal fragmentation only exists in the first node, yes I see.
So I'm correct at typical sharing levels, but for massive sharing, yes
it approaches half the space, which is significant. That's also when rmap
should show the most advantage over virtual scanning, so we really truly
do have to benchmark that.
--
Daniel
Daniel Phillips wrote:
>
> On Wednesday 07 August 2002 21:40, Andrew Morton wrote:
> > Daniel Phillips wrote:
> > > What stands out for me is that rmap is now apparently at parity with
> > > (virtual scanning) 2.4.19 for a real application, i.e., parallel make.
> > > I'm sure we'll still see the raw setup/teardown overhead if we make a
> > > point of going looking for it, but it would be weird if we didn't.
> > >
> > > > So can we tentatively declare victory on the execution overhead issue?
> >
> > err, no. It's still adding half a millisecond or so to each fork/exec/exit
> > cycle. And that is arising from, apparently, an extra two cache misses
> > per page. Doubt if we can take this much further.
>
> But that overhead did not show up in the kernel build timings you posted,
> which do a realistic amount of forking. So what is the worry, exactly?
Compilation is compute-intensive, not fork-intensive. Think shell
scripts, arch, forking servers, ...
> > > ...
> > > Vectoring up the pte chain nodes as
> > > you do here doesn't help much because the internal fragmentation
> > > roughly equals the reduction in link fields.
> >
> > Are you sure about that? The vectoring is only a loss for very low
> > sharing levels, at which the space consumption isn't a problem anyway.
> > At high levels of sharing it's almost a halving.
>
> Your vector will only be half full on average.
The vector at the head of the list is half full on average. All the
other vectors in the chain are 100% full. For the single-pte nodes,
Bill reported "the mean pte_chain length for chains of length > 1 is
around 6, and the std. dev. is around 12, and the distribution is *very*
long-tailed". This is a good fit.
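Some back-of-envelope numbers for that, assuming ia32's 32-byte L1 lines and
4-byte pointers (so 7 slots per vectored node) versus the old 8-byte single-pte
nodes; illustrative only:
#include <stdio.h>
int main(void)
{
	const int nrpte = 32 / 4 - 1;		/* L1_CACHE_BYTES/sizeof(void *) - 1 */
	int ptes;
	for (ptes = 2; ptes <= 64; ptes *= 2) {
		int nodes = (ptes + nrpte - 1) / nrpte;	/* vectored nodes needed */
		printf("%2d mapped ptes: old %3d bytes, vectored %3d bytes\n",
		       ptes, 8 * ptes, 32 * nodes);
	}
	return 0;
}
The vectored form loses at 2 sharers, breaks even around 4-8, and tends toward
about 4.6 bytes per pte against 8 at high sharing -- the "almost a halving".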
> ...
> > Is it useful to instantiate the swapped-in page into everyone's
> > pagetables, save some minor faults?
>
> That's what I was thinking, then we just have to figure out how to find
> all those swapped-out ptes efficiently.
page->pte?
It may be a net loss for high sharing levels. Dunno.
On Wednesday 07 August 2002 22:34, Andrew Morton wrote:
> Daniel Phillips wrote:
> >
> > On Wednesday 07 August 2002 21:40, Andrew Morton wrote:
> > > Daniel Phillips wrote:
> > > > What stands out for me is that rmap is now apparently at parity with
> > > > (virtual scanning) 2.4.19 for a real application, i.e., parallel make.
> > > > I'm sure we'll still see the raw setup/teardown overhead if we make a
> > > > point of going looking for it, but it would be weird if we didn't.
> > > >
> > > > So can we tentatively declare victory on the execution overhead issue?
> > >
> > > err, no. It's still adding half a millisecond or so to each fork/exec/exit
> > > cycle. And that is arising from, apparently, an extra two cache misses
> > > per page. Doubt if we can take this much further.
> >
> > But that overhead did not show up in the kernel build timings you posted,
> > which do a realistic amount of forking. So what is the worry, exactly?
>
> Compilation is compute-intensive, not fork-intensive. Think shell
> scripts, arch, forking servers, ...
OK, so what is an example of a real application that does tons of forking?
We need to get numbers for that. We also need to compare such numbers to
the supposed advantage in swapping.
> > > Is it useful to instantiate the swapped-in page into everyone's
> > > pagetables, save some minor faults?
> >
> > That's what I was thinking, then we just have to figure out how to find
> > all those swapped-out ptes efficiently.
>
> page->pte?
Problem: we went and gave away the page when we swapped it out, but yes,
we could save the pte chain somewhere and maybe that's a win. It would
eat memory just when we're trying to get some back, though.
> It may be a net loss for high sharing levels. Dunno.
High sharing levels are exactly where the swap-in problem is going to
rear its ugly head.
--
Daniel
On Wed, 7 Aug 2002, Daniel Phillips wrote:
> > It may be a net loss for high sharing levels. Dunno.
>
> High sharing levels are exactly where the swap-in problem is going to
> rear its ugly head.
If the swap-in problem exists.
I can see it being triggered artificially because we still
unmap too many pages in the pageout path, but if we fix
shrink_cache() to not unmap the whole inactive list when
we're under continuous memory pressure but only the end of
the list where we're actually reclaiming pages, maybe then
many of the minor page faults we're seeing under such
loads will just disappear.
Not to mention the superfluous IO being scheduled today.
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
On Wednesday 07 August 2002 22:54, Rik van Riel wrote:
> On Wed, 7 Aug 2002, Daniel Phillips wrote:
>
> > > It may be a net loss for high sharing levels. Dunno.
> >
> > High sharing levels are exactly where the swap-in problem is going to
> > rear its ugly head.
>
> If the swap-in problem exists.
>
> I can see it being triggered artificially because we still
> unmap too many pages in the pageout path, but if we fix
> shrink_cache() to not unmap the whole inactive list when
> we're under continuous memory pressure but only the end of
> the list where we're actually reclaiming pages, maybe then
> many of the minor page faults we're seeing under such
> loads will just disappear.
Mmap a shared, anonymous region half the size of physical ram, fork a
hundred children, and let them start randomly writing in it. Now
put the children to sleep and start another process that pushes the
children into swap. Even when the active process goes away and the
children wake up, that is the last you'll ever see of your swap
(until the children die) because the chance that all 100 children
will happen to fault in any given page is miniscule.
Contrived? Sure, but you can bet somebody has a real load that acts
just like this. And *any* anonymous sharing is going to degrade the
effective size of your swap, the only variable is how much.
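A hedged sketch of a user-space test along those lines (region size and child
count are arbitrary, not taken from the thread):
#define _DEFAULT_SOURCE		/* MAP_ANONYMOUS with strict compilers */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/wait.h>
int main(int argc, char **argv)
{
	size_t size = (size_t)(argc > 1 ? atol(argv[1]) : 256) << 20; /* MB; ~half of RAM */
	char *region = mmap(NULL, size, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	int i;
	if (region == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	for (i = 0; i < 100; i++) {
		if (fork() == 0) {
			size_t npages = size / 4096, n;
			srand(getpid());
			for (n = 0; n < npages; n++)	/* dirty random pages */
				region[((size_t)rand() % npages) * 4096] = 1;
			sleep(600);	/* go idle; let another process force swapout */
			_exit(0);
		}
	}
	while (wait(NULL) > 0)
		;
	return 0;
}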
> Not to mention the superfluous IO being scheduled today.
Yes, well at this point we're supposed to demonstrate how much better
rmap does on that, are we not?
--
Daniel
Daniel Phillips wrote:
>
> ..
> > Not to mention the superfluous IO being scheduled today.
>
> Yes, well at this point we're supposed to demonstrate how much better
> rmap does on that, are we not?
Well let it be said that the 2.5.30 VM is by no means "Rik's
rmap VM". It's the 2.4.15 VM with pte-chains bolted on the side,
and it's fairly sucky in several regards.
Until the big stuff is settled in (locking rework, per-zone LRU,
slablru, ...) there's not a lot of point trying to finetune
the 2.5.30 VM.
But a convincing demonstration of the advantages of the 2.4 rmap
patch would set a lot of minds at ease ;)