Resending...
---------- Forwarded message ----------
Date: Tue, 24 Apr 2001 23:28:38 -0300 (BRT)
From: Marcelo Tosatti <[email protected]>
To: Linus Torvalds <[email protected]>
Cc: Ingo Molnar <[email protected]>, Alan Cox <[email protected]>,
Linux Kernel List <[email protected]>, [email protected],
Rik van Riel <[email protected]>,
Szabolcs Szakacsits <[email protected]>
Subject: Re: [patch] swap-speedup-2.4.3-B3
On Tue, 24 Apr 2001, Linus Torvalds wrote:
> Basically, I don't want to mix synchronous and asynchronous
> interfaces. Everything should be asynchronous by default, and waiting
> should be explicit.
The following patch changes all swap IO functions to be asynchronous by
default and changes the callers to wait when needed (except
rw_swap_page_base, which will block writers if there are too many in-flight
swap IOs).
Ingo's find_get_swapcache_page() does not wait on locked pages anymore;
waiting is now up to the callers.
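In practice the callers now look roughly like this (a condensed sketch of the
do_swap_page() path as changed below; error handling and locking details
trimmed):
	page = read_swap_cache_async(entry);	/* starts the read, does not wait */
	if (!page)
		return -1;
	wait_on_page(page);	/* explicit wait, only where the data is actually needed */
	flush_page_to_ram(page);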
time make -j32 test on a 4-CPU machine with 128M RAM and 128M swap:
pre5
----
228.04user 28.14system 5:16.52elapsed 80%CPU (0avgtext+0avgdata
0maxresident)k
0inputs+0outputs (525113major+678617minor)pagefaults 0swaps
pre5 + attached patch
--------------------
227.18user 25.49system 3:40.53elapsed 114%CPU (0avgtext+0avgdata
0maxresident)k
0inputs+0outputs (495387major+644924minor)pagefaults 0swaps
Comments?
diff -Nur linux.orig/include/linux/pagemap.h linux/include/linux/pagemap.h
--- linux.orig/include/linux/pagemap.h Wed Apr 25 00:51:36 2001
+++ linux/include/linux/pagemap.h Wed Apr 25 00:53:04 2001
@@ -77,7 +77,12 @@
unsigned long index, struct page **hash);
extern void lock_page(struct page *page);
#define find_lock_page(mapping, index) \
- __find_lock_page(mapping, index, page_hash(mapping, index))
+ __find_lock_page(mapping, index, page_hash(mapping, index))
+
+extern struct page * __find_get_swapcache_page (struct address_space * mapping,
+ unsigned long index, struct page **hash);
+#define find_get_swapcache_page(mapping, index) \
+ __find_get_swapcache_page(mapping, index, page_hash(mapping, index))
extern void __add_page_to_hash_queue(struct page * page, struct page **p);
diff -Nur linux.orig/include/linux/swap.h linux/include/linux/swap.h
--- linux.orig/include/linux/swap.h Wed Apr 25 00:51:36 2001
+++ linux/include/linux/swap.h Wed Apr 25 00:53:04 2001
@@ -111,8 +111,8 @@
extern int try_to_free_pages(unsigned int gfp_mask);
/* linux/mm/page_io.c */
-extern void rw_swap_page(int, struct page *, int);
-extern void rw_swap_page_nolock(int, swp_entry_t, char *, int);
+extern void rw_swap_page(int, struct page *);
+extern void rw_swap_page_nolock(int, swp_entry_t, char *);
/* linux/mm/page_alloc.c */
@@ -121,8 +121,7 @@
extern void add_to_swap_cache(struct page *, swp_entry_t);
extern int swap_check_entry(unsigned long);
extern struct page * lookup_swap_cache(swp_entry_t);
-extern struct page * read_swap_cache_async(swp_entry_t, int);
-#define read_swap_cache(entry) read_swap_cache_async(entry, 1);
+extern struct page * read_swap_cache_async(swp_entry_t);
/* linux/mm/oom_kill.c */
extern int out_of_memory(void);
diff -Nur linux.orig/mm/filemap.c linux/mm/filemap.c
--- linux.orig/mm/filemap.c Wed Apr 25 00:51:35 2001
+++ linux/mm/filemap.c Wed Apr 25 00:53:04 2001
@@ -678,6 +678,34 @@
}
/*
+ * Find a swapcache page (and get a reference) or return NULL.
+ * The SwapCache check is protected by the pagecache lock.
+ */
+struct page * __find_get_swapcache_page(struct address_space *mapping,
+ unsigned long offset, struct page **hash)
+{
+ struct page *page;
+
+ /*
+ * We need the LRU lock to protect against page_launder().
+ */
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(mapping, offset, *hash);
+ if (page) {
+ spin_lock(&pagemap_lru_lock);
+ if (PageSwapCache(page))
+ page_cache_get(page);
+ else
+ page = NULL;
+ spin_unlock(&pagemap_lru_lock);
+ }
+ spin_unlock(&pagecache_lock);
+
+ return page;
+}
+
+/*
* Same as the above, but lock the page too, verifying that
* it's still valid once we own it.
*/
diff -Nur linux.orig/mm/memory.c linux/mm/memory.c
--- linux.orig/mm/memory.c Wed Apr 25 00:51:35 2001
+++ linux/mm/memory.c Wed Apr 25 00:53:04 2001
@@ -1040,7 +1040,7 @@
break;
}
/* Ok, do the async read-ahead now */
- new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
+ new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset));
if (new_page != NULL)
page_cache_release(new_page);
swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
@@ -1063,13 +1063,13 @@
if (!page) {
lock_kernel();
swapin_readahead(entry);
- page = read_swap_cache(entry);
+ page = read_swap_cache_async(entry);
unlock_kernel();
if (!page) {
spin_lock(&mm->page_table_lock);
return -1;
}
-
+ wait_on_page(page);
flush_page_to_ram(page);
flush_icache_page(vma, page);
}
diff -Nur linux.orig/mm/page_io.c linux/mm/page_io.c
--- linux.orig/mm/page_io.c Wed Apr 25 00:51:35 2001
+++ linux/mm/page_io.c Wed Apr 25 00:53:04 2001
@@ -33,7 +33,7 @@
* that shared pages stay shared while being swapped.
*/
-static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page, int wait)
+static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page)
{
unsigned long offset;
int zones[PAGE_SIZE/512];
@@ -41,6 +41,7 @@
kdev_t dev = 0;
int block_size;
struct inode *swapf = 0;
+ int wait = 0;
/* Don't allow too many pending pages in flight.. */
if ((rw == WRITE) && atomic_read(&nr_async_pages) >
@@ -104,7 +105,7 @@
* - it's marked as being swap-cache
* - it's associated with the swap inode
*/
-void rw_swap_page(int rw, struct page *page, int wait)
+void rw_swap_page(int rw, struct page *page)
{
swp_entry_t entry;
@@ -116,7 +117,7 @@
PAGE_BUG(page);
if (page->mapping != &swapper_space)
PAGE_BUG(page);
- if (!rw_swap_page_base(rw, entry, page, wait))
+ if (!rw_swap_page_base(rw, entry, page))
UnlockPage(page);
}
@@ -125,7 +126,7 @@
* Therefore we can't use it. Later when we can remove the need for the
* lock map and we can reduce the number of functions exported.
*/
-void rw_swap_page_nolock(int rw, swp_entry_t entry, char *buf, int wait)
+void rw_swap_page_nolock(int rw, swp_entry_t entry, char *buf)
{
struct page *page = virt_to_page(buf);
@@ -137,7 +138,8 @@
PAGE_BUG(page);
/* needs sync_page to wait I/O completation */
page->mapping = &swapper_space;
- if (!rw_swap_page_base(rw, entry, page, wait))
+ if (!rw_swap_page_base(rw, entry, page))
UnlockPage(page);
+ wait_on_page(page);
page->mapping = NULL;
}
diff -Nur linux.orig/mm/shmem.c linux/mm/shmem.c
--- linux.orig/mm/shmem.c Wed Apr 25 00:51:35 2001
+++ linux/mm/shmem.c Wed Apr 25 00:53:04 2001
@@ -124,6 +124,7 @@
*ptr = (swp_entry_t){0};
freed++;
if ((page = lookup_swap_cache(entry)) != NULL) {
+ wait_on_page(page);
delete_from_swap_cache(page);
page_cache_release(page);
}
@@ -329,10 +330,11 @@
spin_unlock (&info->lock);
lock_kernel();
swapin_readahead(*entry);
- page = read_swap_cache(*entry);
+ page = read_swap_cache_async(*entry);
unlock_kernel();
if (!page)
return ERR_PTR(-ENOMEM);
+ wait_on_page(page);
if (!Page_Uptodate(page)) {
page_cache_release(page);
return ERR_PTR(-EIO);
diff -Nur linux.orig/mm/swap_state.c linux/mm/swap_state.c
--- linux.orig/mm/swap_state.c Wed Apr 25 00:51:35 2001
+++ linux/mm/swap_state.c Wed Apr 25 00:53:04 2001
@@ -34,7 +34,7 @@
return 0;
in_use:
- rw_swap_page(WRITE, page, 0);
+ rw_swap_page(WRITE, page);
return 0;
}
@@ -163,37 +163,18 @@
/*
* Right now the pagecache is 32-bit only. But it's a 32 bit index. =)
*/
-repeat:
- found = find_lock_page(&swapper_space, entry.val);
+ found = find_get_swapcache_page(&swapper_space, entry.val);
if (!found)
return 0;
- /*
- * Though the "found" page was in the swap cache an instant
- * earlier, it might have been removed by refill_inactive etc.
- * Re search ... Since find_lock_page grabs a reference on
- * the page, it can not be reused for anything else, namely
- * it can not be associated with another swaphandle, so it
- * is enough to check whether the page is still in the scache.
- */
- if (!PageSwapCache(found)) {
- UnlockPage(found);
- page_cache_release(found);
- goto repeat;
- }
+ if (!PageSwapCache(found))
+ BUG();
if (found->mapping != &swapper_space)
- goto out_bad;
+ BUG();
#ifdef SWAP_CACHE_INFO
swap_cache_find_success++;
#endif
- UnlockPage(found);
return found;
}
-
-out_bad:
- printk (KERN_ERR "VM: Found a non-swapper swap page!\n");
- UnlockPage(found);
- page_cache_release(found);
- return 0;
}
/*
@@ -205,7 +186,7 @@
* the swap entry is no longer in use.
*/
-struct page * read_swap_cache_async(swp_entry_t entry, int wait)
+struct page * read_swap_cache_async(swp_entry_t entry)
{
struct page *found_page = 0, *new_page;
unsigned long new_page_addr;
@@ -238,7 +219,7 @@
*/
lock_page(new_page);
add_to_swap_cache(new_page, entry);
- rw_swap_page(READ, new_page, wait);
+ rw_swap_page(READ, new_page);
return new_page;
out_free_page:
diff -Nur linux.orig/mm/swapfile.c linux/mm/swapfile.c
--- linux.orig/mm/swapfile.c Wed Apr 25 00:51:35 2001
+++ linux/mm/swapfile.c Wed Apr 25 00:53:24 2001
@@ -369,13 +369,15 @@
/* Get a page for the entry, using the existing swap
cache page if there is one. Otherwise, get a clean
page and read the swap into it. */
- page = read_swap_cache(entry);
+ page = read_swap_cache_async(entry);
if (!page) {
swap_free(entry);
return -ENOMEM;
}
+ lock_page(page);
if (PageSwapCache(page))
- delete_from_swap_cache(page);
+ delete_from_swap_cache_nolock(page);
+ UnlockPage(page);
read_lock(&tasklist_lock);
for_each_task(p)
unuse_process(p->mm, entry, page);
@@ -650,7 +652,7 @@
}
lock_page(virt_to_page(swap_header));
- rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header, 1);
+ rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
swap_header_version = 1;
On Wed, 25 Apr 2001, Marcelo Tosatti wrote:
> On Tue, 24 Apr 2001, Linus Torvalds wrote:
>
> > Basically, I don't want to mix synchronous and asynchronous
> > interfaces. Everything should be asynchronous by default, and waiting
> > should be explicit.
>
> The following patch changes all swap IO functions to be asynchronous by
> default and changes the callers to wait when needed (except
> rw_swap_page_base which will block writers if there are too many in flight
> swap IOs).
>
> Ingo's find_get_swapcache_page() does not wait on locked pages anymore,
> which is now up to the callers.
>
> time make -j32 test with 4 CPUs machine, 128M ram and 128M swap:
>
> pre5
> ----
> 228.04user 28.14system 5:16.52elapsed 80%CPU (0avgtext+0avgdata
> 0maxresident)k
> 0inputs+0outputs (525113major+678617minor)pagefaults 0swaps
>
> pre5 + attached patch
> --------------------
> 227.18user 25.49system 3:40.53elapsed 114%CPU (0avgtext+0avgdata
> 0maxresident)k
> 0inputs+0outputs (495387major+644924minor)pagefaults 0swaps
>
>
> Comments?
More of a question. Neither Ingo's nor your patch makes any difference
on my UP box (128mb PIII/500) doing make -j30. It is taking me 11 1/2
minutes to do this test (that's horrible). Any idea why?
(I can get it to under 9 with MUCH extremely ugly tinkering. I've done
this enough to know that I _should_ be able to do 8 1/2 minutes ~easily)
-Mike
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> > Comments?
>
> More of a question. Neither Ingo's nor your patch makes any difference
> on my UP box (128mb PIII/500) doing make -j30.
Well, my patch incorporates Ingo's patch.
It is now integrated into pre7, btw.
> It is taking me 11 1/2
> minutes to do this test (that's horrible). Any idea why?
Not really.
If you have concurrent swapping activity, pre7 should improve the
performance since all swap IO is asynchronous now. Only paths which really
need to stop and wait for the swap data are doing it. (eg do_swap_page)
> (I can get it to under 9 with MUCH extremely ugly tinkering. I've done
> this enough to know that I _should_ be able to do 8 1/2 minutes ~easily)
What kind of changes are you making to get better performance on this test?
On Thu, 26 Apr 2001, Marcelo Tosatti wrote:
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
>
> > > Comments?
> >
> > More of a question. Neither Ingo's nor your patch makes any difference
> > on my UP box (128mb PIII/500) doing make -j30.
>
> Well, my patch incorporates Ingo's patch.
>
> It is now integrated into pre7, btw.
>
> > It is taking me 11 1/2
> > minutes to do this test (that's horrible). Any idea why?
>
> Not really.
(darn)
> If you have concurrent swapping activity, pre7 should improve the
> performance since all swap IO is asynchronous now. Only paths which really
> need to stop and wait for the swap data are doing it. (eg do_swap_page)
I'll grab virgin pre7 in a few.
> > (I can get it to under 9 with MUCH extremely ugly tinkering. I've done
> > this enough to know that I _should_ be able to do 8 1/2 minutes ~easily)
>
> What kind of changes are you making to get better performance on this test?
Preventing cache collapse at all costs is #1. Matching the deactivation rate
to launder/reclaim.. et al. Trying HARD to give PG_referenced a chance
to happen between aging scans [1].
-Mike
1. pagecache is becoming swapcache and must be aged before anything is
done. Meanwhile we're calling refill_inactive_scan() so fast that no one
has a chance to touch a page. Age becomes a simple counter.. I think.
When you hit a big surge, swap pages are at the back of all lists, so all
of your valuable cache gets reclaimed before we write even one swap page.
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> More of a question. Neither Ingo's nor your patch makes any
> difference on my UP box (128mb PIII/500) doing make -j30. [...]
(the patch Marcelo sent is the -B3 patch plus Linus' suggested async
interface cleanup, so it should be functionally equivalent to the -B3
patch.)
Ingo
On Thu, 26 Apr 2001, Marcelo Tosatti wrote:
> > (I can get it to under 9 with MUCH extremely ugly tinkering. I've done
> > this enough to know that I _should_ be able to do 8 1/2 minutes ~easily)
>
> What kind of changes are you making to get better performance on this test?
:)
2.4.4.pre7.virgin
real 11m33.589s
user 7m57.790s
sys 0m38.730s
2.4.4.pre7.sillyness
real 9m30.336s
user 7m55.270s
sys 0m38.510s
--- mm/vmscan.c.org Thu Apr 26 06:35:19 2001
+++ mm/vmscan.c Thu Apr 26 08:25:52 2001
@@ -72,8 +72,7 @@
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
mm->rss--;
- if (!page->age)
- deactivate_page(page);
+ age_page_down(page);
UnlockPage(page);
page_cache_release(page);
return;
@@ -282,7 +281,7 @@
/* Always start by trying to penalize the process that is allocating memory */
if (mm)
- retval = swap_out_mm(mm, swap_amount(mm));
+ return swap_out_mm(mm, swap_amount(mm));
/* Then, look at the other mm's */
counter = mmlist_nr >> priority;
@@ -642,6 +641,10 @@
struct page * page;
int maxscan, page_active = 0;
int ret = 0;
+ static unsigned long lastscan;
+
+ if (lastscan == jiffies)
+ return 0;
/* Take the lock while messing with the list... */
spin_lock(&pagemap_lru_lock);
@@ -695,6 +698,7 @@
break;
}
}
+ lastscan = jiffies;
spin_unlock(&pagemap_lru_lock);
return ret;
@@ -791,35 +795,13 @@
#define DEF_PRIORITY (6)
static int refill_inactive(unsigned int gfp_mask, int user)
{
- int count, start_count, maxtry;
-
- count = inactive_shortage() + free_shortage();
- if (user)
- count = (1 << page_cluster);
- start_count = count;
-
- maxtry = 6;
- do {
- if (current->need_resched) {
- __set_current_state(TASK_RUNNING);
- schedule();
- }
-
- while (refill_inactive_scan(DEF_PRIORITY, 1)) {
- if (--count <= 0)
- goto done;
- }
+ int shortage = inactive_shortage();
+ if (refill_inactive_scan(DEF_PRIORITY, 0) < shortage)
/* If refill_inactive_scan failed, try to page stuff out.. */
swap_out(DEF_PRIORITY, gfp_mask);
- if (--maxtry <= 0)
- return 0;
-
- } while (inactive_shortage());
-
-done:
- return (count < start_count);
+ return 0;
}
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
On Thu, 26 Apr 2001, Ingo Molnar wrote:
>
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
>
> > More of a question. Neither Ingo's nor your patch makes any
> > difference on my UP box (128mb PIII/500) doing make -j30. [...]
>
> (the patch Marcelo sent is the -B3 patch plus Linus' suggested async
> interface cleanup, so it should be functionally equivalent to the -B3
> patch.)
>
> Ingo
I know.. read 'em all with much interest :)
-Mike
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> On Thu, 26 Apr 2001, Marcelo Tosatti wrote:
>
> > > (I can get it to under 9 with MUCH extremely ugly tinkering. I've done
> > > this enough to know that I _should_ be able to do 8 1/2 minutes ~easily)
> >
> > What kind of changes are you making to get better performance on this test?
>
> :)
>
> 2.4.4.pre7.virgin
> real 11m33.589s
> user 7m57.790s
> sys 0m38.730s
>
> 2.4.4.pre7.sillyness
> real 9m30.336s
> user 7m55.270s
> sys 0m38.510s
Have you tried to tune SWAP_SHIFT and the priority used inside swap_out()
to see if you can make pte deactivation less aggressive ?
If you get the desired effect by tuning those values, and you end up
concluding that this tuning is a good change for most "common workloads",
it can be integrated into the main kernel.
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> 2.4.4.pre7.virgin
> real 11m33.589s
> 2.4.4.pre7.sillyness
> real 9m30.336s
very interesting. Looks like there are still reserves in the VM, for heavy
workloads. (and swapping is all about heavy workloads.)
it would be interesting to see why your patch has such a good effect.
(and it would be nice to get the same improvement in a clean way.)
> - if (!page->age)
> - deactivate_page(page);
> + age_page_down(page);
this one preserves the cache a bit more aggressively.
> /* Always start by trying to penalize the process that is allocating memory */
> if (mm)
> - retval = swap_out_mm(mm, swap_amount(mm));
> + return swap_out_mm(mm, swap_amount(mm));
keep swap-out activity more focused on the process that is generating the
VM pressure. It might make sense to test this single change in isolation.
(While we cannot avoid swapping out other contexts under memory pressure,
we could do something to focus it on the current MM a bit more.)
> + static unsigned long lastscan;
> +
> + if (lastscan == jiffies)
> + return 0;
limit the runtime of refill_inactive_scan(). This is similar to Rik's
reclaim-limit+aging-tuning patch to linux-mm yesterday. could you try
Rik's patch with your patch except this jiffies hack, does it still
achieve the same improvement?
> + int shortage = inactive_shortage();
>
> + if (refill_inactive_scan(DEF_PRIORITY, 0) < shortage)
> /* If refill_inactive_scan failed, try to page stuff out.. */
> swap_out(DEF_PRIORITY, gfp_mask);
>
> + return 0;
(i cannot see how this chunk affects the VM, AFAICS this too makes the
zapping of the cache less aggressive.)
perhaps the best would be to first test Rik's patch on pre7-vanilla, it
should go in the same direction your changes go, i think?
Ingo
On Thu, 26 Apr 2001, Ingo Molnar wrote:
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
>
> > 2.4.4.pre7.virgin
> > real 11m33.589s
>
> > 2.4.4.pre7.sillyness
> > real 9m30.336s
>
> very interesting. Looks like there are still reserves in the VM, for heavy
> workloads. (and swapping is all about heavy workloads.)
>
> it would be interesting to see why your patch has such a good effect.
> (and it would be nice get the same improvement in a clean way.)
It's not good.. it's an ugly beast from hell ;-)
> > - if (!page->age)
> > - deactivate_page(page);
> > + age_page_down(page);
>
> this one preserves the cache a bit more agressively.
(intent)
>
> > /* Always start by trying to penalize the process that is allocating memory */
> > if (mm)
> > - retval = swap_out_mm(mm, swap_amount(mm));
> > + return swap_out_mm(mm, swap_amount(mm));
>
> keep swap-out activity more focused to the process that is generating the
> VM pressure. It might make sense to test this single change in isolation.
> (While we cannot ignore to swap out other contexts under memory pressure,
> we could do something to make it focused on the current MM a bit more.)
(also the intent.. make 'em pagein like a bugger to slow down cache munh)
> > + static unsigned long lastscan;
> > +
> > + if (lastscan == jiffies)
> > + return 0;
>
> limit the runtime of refill_inactive_scan(). This is similar to Rik's
> reclaim-limit+aging-tuning patch to linux-mm yesterday. could you try
> Rik's patch with your patch except this jiffies hack, does it still
> achieve the same improvement?
No. It livelocked on me with almost all active pages exhausted.
> > + int shortage = inactive_shortage();
> >
> > + if (refill_inactive_scan(DEF_PRIORITY, 0) < shortage)
> > /* If refill_inactive_scan failed, try to page stuff out.. */
> > swap_out(DEF_PRIORITY, gfp_mask);
> >
> > + return 0;
>
> (i cannot see how this chunk affects the VM, AFAICS this too makes the
> zapping of the cache less agressive.)
(more folks get snagged on write.. they can't eat cache so fast)
-Mike
On Thu, 26 Apr 2001, Marcelo Tosatti wrote:
> Have you tried to tune SWAP_SHIFT and the priority used inside swap_out()
> to see if you can make pte deactivation less aggressive ?
Many many many times.. no dice.
(more aggressive is much better for surge regulation.. power brakes!)
-Mike
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> > limit the runtime of refill_inactive_scan(). This is similar to Rik's
> > reclaim-limit+aging-tuning patch to linux-mm yesterday. could you try
> > Rik's patch with your patch except this jiffies hack, does it still
> > achieve the same improvement?
>
> No. It livelocked on me with almost all active pages exausted.
Misspoke.. I didn't try the two mixed. Rik's patch livelocked me.
Still want me to try mixing?
-Mike
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> > (i cannot see how this chunk affects the VM, AFAICS this too makes the
> > zapping of the cache less agressive.)
>
> (more folks get snagged on write.. they can't eat cache so fast)
What about GFP_BUFFER allocations ? :)
I suspect the jiffies hack is keeping GFP_BUFFER allocations from eating
cache insanely.
Easy way to confirm that: add the kswapd wait queue again and make
allocators which don't have __GFP_IO set wait on that in
try_to_free_pages().
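Something along these lines, perhaps (sketch only; the kswapd_done queue, the
timeout and the exact placement are made up here, this is not a posted patch):
	static DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
	int try_to_free_pages(unsigned int gfp_mask)
	{
		if (gfp_mask & __GFP_WAIT) {
			if (!(gfp_mask & __GFP_IO)) {
				/* GFP_BUFFER-style allocation: we can't do IO,
				 * so kick kswapd and wait for it to make some
				 * progress instead of eating cache ourselves. */
				wake_up_interruptible(&kswapd_wait);
				sleep_on_timeout(&kswapd_done, HZ/10);
				return 1;
			}
			return do_try_to_free_pages(gfp_mask, 1);
		}
		return 1;
	}
with kswapd doing a wake_up(&kswapd_done) after each pass.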
On Thu, 26 Apr 2001, Marcelo Tosatti wrote:
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
>
> > > (i cannot see how this chunk affects the VM, AFAICS this too makes the
> > > zapping of the cache less agressive.)
> >
> > (more folks get snagged on write.. they can't eat cache so fast)
>
> What about GFP_BUFFER allocations ? :)
>
> I suspect the jiffies hack is avoiding GFP_BUFFER allocations to eat cache
> insanely.
(I think it's aging speed in general. If user tasks aren't doing it,
kswapd is.)
> Easy way to confirm that: add the kswapd wait queue again and make
> allocators which don't have __GFP_IO set wait on that in
> try_to_free_pages().
I've tried not allowing those to enter try_to_free_pages() [nogo].
I'll try a waitqueue. wait_event(waitqueue_active(&kswapd_wait)) ok?
-Mike
On Thu, 26 Apr 2001, Mike Galbraith wrote:
>
> 2.4.4.pre7.virgin
> real 11m33.589s
> user 7m57.790s
> sys 0m38.730s
>
> 2.4.4.pre7.sillyness
> real 9m30.336s
> user 7m55.270s
> sys 0m38.510s
Well, I actually like parts of this. The "always swap out current mm" one
looks rather dangerous, and the lastscan jiffy thing is too ugly for
words, but refill_inactive() looks much nicer. There is beauty in
simplicity.
The page aging in drop_pte feels pretty harsh, though.
Have you looked at "free_pte()"? I don't like that function, and it might
make a difference. There are several small nits with it:
- it should probably try to deactivate the page. If drop_pte does that
when it deactivates a page involuntarily, why not do it for a real "we
just free'd the page voluntarily"?
- swap-cache pages should probably not just be de-activated, but actively
aged down. Right now, they are neither, so we have to work all the
way through refill_inactive() and then page_launder() to clear them
out. Even though the page may be entirely useless by now (we had a
complex special case that caught and short-circuited some of the pages,
and maybe it was worth it. But maybe the right thing is to just age
them down and naturally deactivate them?)
After all, we aged them up for references to this virtual
mapping, and free_pte() just made it go away. Unlike normal page cache
pages, we don't get any advantage from trying to cache the things
across multiple VM's.
- we're dropping the accessed bit on the floor. In the vmscan case the
accessed bit would have aged the page up.
On the other hand, to offset some of these, we actually count the page
accessed _twice_ sometimes: we count it on lookup, and we count it when we
see the accessed bit in vmscan.c. Which results in some pages getting aged
up twice for just one access if we go through the vmscan logic, while if
we just map and unmap them they get counted just once.
Obviously the page aging logic seems to be making a noticeable difference
to you. So looking at page aging logic issues in the bigger picture might
be worthwhile - not just staring at the actual swap-out code. The fact is,
the swap-out-code cannot get the aging right if the rest of the system
ignores it or does it only for some cases.
I _think_ the logic should be something along the lines of: "freeing the
page amounts to an implied down-aging of the page, but the 'accessed' bit
would have aged it up, so the two take each other out". But if so, the
free_pte() logic should have something like
if (page->mapping) {
if (!pte_young(pte) || PageSwapCache(page))
age_page_down_ageonly(page);
if (!page->age)
deactivate_page(page);
}
instead of just ignoring these issues completely.
Comments?
Linus
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> 1. pagecache is becoming swapcache and must be aged before anything is
> done. Meanwhile we're calling refill_inactive_scan() so fast that noone
> has a chance to touch a page. Age becomes a simple counter.. I think.
> When you hit a big surge, swap pages are at the back of all lists, so all
> of your valuable cache gets reclaimed before we write even one swap page.
Does the patch I sent to [email protected] last night help with this?
I found that refill_inactive_scan() and swap_out() aren't being called
equally from the main loop in refill_inactive(), and have fixed that in a
way which (IMHO) also beautifies the code a bit.
(and makes sure background aging doesn't get out of hand with a few
simple checks)
regards,
Rik
--
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...
http://www.surriel.com/
http://www.conectiva.com/ http://distro.conectiva.com.br/
On Thu, 26 Apr 2001, Linus Torvalds wrote:
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
> >
> > 2.4.4.pre7.virgin
> > real 11m33.589s
> > user 7m57.790s
> > sys 0m38.730s
> >
> > 2.4.4.pre7.sillyness
> > real 9m30.336s
> > user 7m55.270s
> > sys 0m38.510s
>
> Well, I actually like parts of this. The "always swap out current mm" one
> looks rather dangerous, and the lastscan jiffy thing is too ugly for
> words,
Compared to my tree of a couple days ago (8m53s) this is fine art ;-)
(snip nice list of things to think about)
> Comments?
If my parser ever comes out of down_trygrok(), I'll put this in
a terminal next to one from Ingo.
-Mike
On Thu, 26 Apr 2001, Linus Torvalds wrote:
>
> On the other hand, to offset some of these, we actually count the page
> accessed _twice_ sometimes: we count it on lookup, and we count it when we
> see the accessed bit in vmscan.c. Which results in some pages getting aged
> up twice for just one access if we go through the vmscan logic, while if
> we just map and unmap them they get counted just once.
And sometimes three times, if you count the PAGE_AGE_START bonus
points you get whenever your age is found to be 0 (or less than
PAGE_AGE_START). I think I see the idea, but seems more voodoo.
If you're looking to _simplify_ in this area, there's a confusing
host (9) of intercoupled age-up-and-down de/activate functions.
Aren't those better decoupled? i.e. the ageing ones ageonly,
the de/activate ones not messing with age at all.
Then I think you're left with just age_page_up() and age_page_down()
(maybe inlines as below, assuming the PAGE_AGE_START voodoo), plus
activate_page(), deactivate_page() and deactivate_page_nolock().
static inline void age_page_up(struct page *page)
{
page->age += PAGE_AGE_ADV;
if (page->age > PAGE_AGE_MAX)
page->age = PAGE_AGE_MAX;
else if (page->age < PAGE_AGE_START + PAGE_AGE_ADV)
page->age = PAGE_AGE_START + PAGE_AGE_ADV;
}
static inline void age_page_down(struct page *page)
{
page->age >>= 1;
}
But this is no more than tidying, don't let me distract you.
Hugh
On Thu, 26 Apr 2001, Rik van Riel wrote:
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
>
> > 1. pagecache is becoming swapcache and must be aged before anything is
> > done. Meanwhile we're calling refill_inactive_scan() so fast that noone
> > has a chance to touch a page. Age becomes a simple counter.. I think.
> > When you hit a big surge, swap pages are at the back of all lists, so all
> > of your valuable cache gets reclaimed before we write even one swap page.
>
> Does the patch I sent to [email protected] last night help in
> this ?
>
> I found that the way refill_inactive_scan() and swap_out() are being
> called from the main loop in refill_inactive() aren't equal and have
> fixed that in a way which (IMHO) also beautifies the code a bit.
>
> (and makes sure background aging doesn't get out of hand with a few
> simple checks)
That patch livelocked my box with only ~1000 pages on any list.
I can go back and test some more if you want. (I've seen this so
many times here that I generally just curse a lot [frustration] and
burn the whole tree to its roots as soon as it shows up)
SysRq: Show Memory
Mem-info:
Free pages: 1404kB ( 0kB HighMem)
( Active: 975, inactive_dirty: 28, inactive_clean: 0, free: 351 (351 702 1053) )
0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 1*512kB 0*1024kB 0*2048kB = 512kB)
1*4kB 5*8kB 1*16kB 0*32kB 1*64kB 0*128kB 1*256kB 1*512kB 0*1024kB 0*2048kB = 892kB)
= 0kB)
Swap cache: add 72, delete 63, find 17/67
Free swap: 264864kB
32752 pages of RAM
0 pages of HIGHMEM
1183 reserved pages
27657 pages shared
9 pages swap cached
0 pages in page table cache
Buffer memory: 112kB
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
>
> > > limit the runtime of refill_inactive_scan(). This is similar to Rik's
> > > reclaim-limit+aging-tuning patch to linux-mm yesterday. could you try
> > > Rik's patch with your patch except this jiffies hack, does it still
> > > achieve the same improvement?
> >
> > No. It livelocked on me with almost all active pages exausted.
>
> Misspoke.. I didn't try the two mixed. Rik's patch livelocked me.
Interesting. The semantics of my patch are practically the same as
those of the stock kernel ... can you get the stock kernel to
livelock on you, too ?
regards,
Rik
--
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...
http://www.surriel.com/
http://www.conectiva.com/ http://distro.conectiva.com.br/
On Thu, 26 Apr 2001, Rik van Riel wrote:
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
> > On Thu, 26 Apr 2001, Mike Galbraith wrote:
> >
> > > > limit the runtime of refill_inactive_scan(). This is similar to Rik's
> > > > reclaim-limit+aging-tuning patch to linux-mm yesterday. could you try
> > > > Rik's patch with your patch except this jiffies hack, does it still
> > > > achieve the same improvement?
> > >
> > > No. It livelocked on me with almost all active pages exausted.
> >
> > Misspoke.. I didn't try the two mixed. Rik's patch livelocked me.
>
> Interesting. The semantics of my patch are practically the same as
> those of the stock kernel ... can you get the stock kernel to
> livelock on you, too ?
Generally no. Let kswapd continue to run? Yes, but not always.
-Mike
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> On Thu, 26 Apr 2001, Rik van Riel wrote:
>
> > On Thu, 26 Apr 2001, Mike Galbraith wrote:
> >
> > > 1. pagecache is becoming swapcache and must be aged before anything is
> > > done. Meanwhile we're calling refill_inactive_scan() so fast that noone
> > > has a chance to touch a page. Age becomes a simple counter.. I think.
> > > When you hit a big surge, swap pages are at the back of all lists, so all
> > > of your valuable cache gets reclaimed before we write even one swap page.
> >
> > Does the patch I sent to [email protected] last night help in
> > this ?
> >
> > I found that the way refill_inactive_scan() and swap_out() are being
> > called from the main loop in refill_inactive() aren't equal and have
> > fixed that in a way which (IMHO) also beautifies the code a bit.
> >
> > (and makes sure background aging doesn't get out of hand with a few
> > simple checks)
>
> That patch livelocked my box with only ~1000 pages on any list.
>
> I can go back and test some more if you want.
I put it back in; the livelock is 100% repeatable (10 repeats). It's
deactivating/laundering itself to death. :) 3mb for my 386-20/0.96.9
would have been enough to frolic (slowly) in.. this box can't live.
procs memory swap io system cpu
r b w swpd free buff cache si so bi bo in cs us sy id
...
37 3 0 988 1940 1912 32368 0 0 40 49 117 293 90 10 0
47 0 0 1348 1940 1320 26816 0 0 80 17 121 415 86 14 0
39 1 0 1364 1972 1076 20728 0 0 184 0 124 294 84 16 0
39 3 2 1456 1940 232 14720 0 572 67 638 159 2225 85 15 0
29 10 2 1456 1612 416 6908 0 0 252 72 235 2253 84 16 0
17 18 2 1292 1420 608 4216 0 60 340 454 279 3289 29 20 51
33 1 2 1296 1408 488 3464 0 4 317 752 295 2432 11 47 42
locked here
On Thu, 26 Apr 2001, Mike Galbraith wrote:
> > > > No. It livelocked on me with almost all active pages exausted.
> > > Misspoke.. I didn't try the two mixed. Rik's patch livelocked me.
> >
> > Interesting. The semantics of my patch are practically the same as
> > those of the stock kernel ... can you get the stock kernel to
> > livelock on you, too ?
>
> Generally no. Let kswapd continue to run? Yes, but not always.
OK, then I guess we should find out WHY the thing livelocked...
I've heard reports that it's possible to livelock the kernel,
but for some reason you find it easier to livelock the kernel
with my patch applied.
Maybe this is enough of a clue to find out some things on why
the kernel livelocked? Maybe we should add some instrumentation
to the kernel to find out why things like this happen?
IMHO having good instrumentation in the kernel makes sense
anyway, since it will allow us to do easier performance analysis
of people's machines, so we'll have less guesswork and it'll be
easier to get the kernel to perform well on more machines...
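Even something as crude as this would be a start (sketch only; the counter
names and the /proc hookup below are made up, not an actual patch):
	#include <linux/proc_fs.h>
	/* bumped from refill_inactive_scan(), page_launder(), swap_out() ... */
	static unsigned long stat_refill_scans, stat_deactivated, stat_laundered;
	static int vm_stats_read(char *page, char **start, off_t off,
				 int count, int *eof, void *data)
	{
		int len = sprintf(page,
			"refill_scans: %lu\ndeactivated: %lu\nlaundered: %lu\n",
			stat_refill_scans, stat_deactivated, stat_laundered);
		*eof = 1;
		return len;
	}
	/* registered once at boot: */
	create_proc_read_entry("vm_stats", 0, NULL, vm_stats_read, NULL);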
regards,
Rik
--
Linux MM bugzilla: http://linux-mm.org/bugzilla.shtml
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...
http://www.surriel.com/
http://www.conectiva.com/ http://distro.conectiva.com/
On Thu, 26 Apr 2001, Rik van Riel wrote:
> On Thu, 26 Apr 2001, Mike Galbraith wrote:
>
> > > > > No. It livelocked on me with almost all active pages exausted.
> > > > Misspoke.. I didn't try the two mixed. Rik's patch livelocked me.
> > >
> > > Interesting. The semantics of my patch are practically the same as
> > > those of the stock kernel ... can you get the stock kernel to
> > > livelock on you, too ?
> >
> > Generally no. Let kswapd continue to run? Yes, but not always.
>
> OK, then I guess we should find out WHY the thing livelocked...
Hi Rik,
I decided to take a break from pondering input and see why the thing
ran itself into the ground. Methinks I was sent the wrooong patch :)
--- linux-2.4.4-pre7/mm/vmscan.c.orig Wed Apr 25 23:59:48 2001
+++ linux-2.4.4-pre7/mm/vmscan.c Thu Apr 26 00:31:31 2001
@@ -24,6 +24,8 @@
#include <asm/pgalloc.h>
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
/*
* The swap-out function returns 1 if it successfully
* scanned all the pages it was asked to (`count').
@@ -631,17 +633,45 @@
/**
* refill_inactive_scan - scan the active list and find pages to deactivate
* @priority: the priority at which to scan
- * @oneshot: exit after deactivating one page
+ * @count: the number of pages to deactivate
*
* This function will scan a portion of the active list to find
* unused pages, those pages will then be moved to the inactive list.
*/
-int refill_inactive_scan(unsigned int priority, int oneshot)
+int refill_inactive_scan(unsigned int priority, int count)
{
struct list_head * page_lru;
struct page * page;
- int maxscan, page_active = 0;
- int ret = 0;
+ int maxscan = nr_active_pages >> priority;
+ int page_active = 0;
+
+ /*
+ * If no count was specified, we do background page aging.
+ * This is done so, after periods of little VM activity, we
+ * know which pages to swap out and we can handle load spikes.
+ * However, if we scan unlimited and deactivate all pages,
+ * we still wouldn't know which pages to swap ...
+ *
+ * The obvious solution is to do less background scanning when
+ * we have lots of inactive pages and to completely stop if we
+ * have tons of them...
+ */
+ if (!count) {
+ int nr_active, nr_inactive;
+
+ /* Active pages can be "hidden" in ptes, take a saner number. */
+ nr_active = MAX(nr_active_pages, num_physpages / 2);
+ nr_inactive = nr_inactive_dirty_pages + nr_free_pages() +
+ nr_inactive_clean_pages();
+
+ if (nr_inactive * 10 < nr_active) {
+ maxscan = nr_active_pages >> 4;
+ } else if (nr_inactive * 3 < nr_active_pages) {
+ maxscan = nr_active >> 8;
+ } else {
+ maxscan = 0;
+ }
+ }
/* Take the lock while messing with the list... */
spin_lock(&pagemap_lru_lock);
@@ -690,14 +720,13 @@
list_del(page_lru);
list_add(page_lru, &active_list);
} else {
- ret = 1;
- if (oneshot)
+ if (--count <= 0)
break;
}
}
spin_unlock(&pagemap_lru_lock);
- return ret;
+ return count;
}
/*
@@ -805,10 +834,9 @@
schedule();
}
- while (refill_inactive_scan(DEF_PRIORITY, 1)) {
- if (--count <= 0)
- goto done;
- }
+ count -= refill_inactive_scan(DEF_PRIORITY, count);
+ if (--count <= 0)
+ goto done;
/* If refill_inactive_scan failed, try to page stuff out.. */
swap_out(DEF_PRIORITY, gfp_mask);
On Fri, 27 Apr 2001, Mike Galbraith wrote:
> On Thu, 26 Apr 2001, Rik van Riel wrote:
>
> > On Thu, 26 Apr 2001, Mike Galbraith wrote:
> >
> > > > > > No. It livelocked on me with almost all active pages exausted.
> > > > > Misspoke.. I didn't try the two mixed. Rik's patch livelocked me.
> > > >
> > > > Interesting. The semantics of my patch are practically the same as
> > > > those of the stock kernel ... can you get the stock kernel to
> > > > livelock on you, too ?
> > >
> > > Generally no. Let kswapd continue to run? Yes, but not always.
> >
> > OK, then I guess we should find out WHY the thing livelocked...
>
> Hi Rik,
>
> I decided to take a break from pondering input and see why the thing
> ran itself into the ground. Methinks I was sent the wrooong patch :)
Mike,
Please apply this patch on top of Rik's v2 patch, otherwise you'll get the
livelock easily:
--- linux.orig/mm/vmscan.c Fri Apr 27 04:32:52 2001
+++ linux/mm/vmscan.c Fri Apr 27 04:32:34 2001
@@ -644,6 +644,7 @@
struct page * page;
int maxscan = nr_active_pages >> priority;
int page_active = 0;
+ int start_count = count;
/*
* If no count was specified, we do background page aging.
@@ -725,7 +726,7 @@
}
spin_unlock(&pagemap_lru_lock);
- return count;
+ return (start_count - count);
}
/*
On Fri, 27 Apr 2001, Marcelo Tosatti wrote:
> On Fri, 27 Apr 2001, Mike Galbraith wrote:
>
> > I decided to take a break from pondering input and see why the thing
> > ran itself into the ground. Methinks I was sent the wrooong patch :)
>
> Mike,
>
> Please apply this patch on top of Rik's v2 patch otherwise you'll get the
> livelock easily:
test results:
real 11m44.088s
user 7m57.720s
sys 0m36.420s
On Thu, 26 Apr 2001, Linus Torvalds wrote:
> Have you looked at "free_pte()"? I don't like that function, and it might
> make a difference. There are several small nits with it:
snip
> I _think_ the logic should be something along the lines of: "freeing the
> page amounts to a implied down-aging of the page, but the 'accessed' bit
> would have aged it up, so the two take each other out". But if so, the
> free_pte() logic should have something like
>
> if (page->mapping) {
> if (!pte_young(pte) || PageSwapCache(page))
> age_page_down_ageonly(page);
> if (!page->age)
> deactivate_page(page);
> }
Hi,
I tried this out today after some more reading.
virgin pre7 +Rik
real 11m44.088s
user 7m57.720s
sys 0m36.420s
+Rik +Linus
real 11m48.597s
user 7m55.620s
sys 0m37.860s
+Rik +Linus +HarshAging
real 11m17.758s
user 7m57.650s
sys 0m36.350s
None of them make much difference.
-Mike
On Fri, 27 Apr 2001, Mike Galbraith wrote:
> virgin pre7 +Rik
> real 11m44.088s
> user 7m57.720s
> sys 0m36.420s
> None of them make much difference.
Good, then I suppose we can put in the cleanup from my code, since
it makes the balancing a bit more predictable and should keep the
background aging within bounds.
I'll send a fixed patch tonight (with that last small thinko you
and Marcelo discovered removed).
regards,
Rik
--
Linux MM bugzilla: http://linux-mm.org/bugzilla.shtml
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...
http://www.surriel.com/
http://www.conectiva.com/ http://distro.conectiva.com/