The special cases in the use-once code have annoyed me for a while,
and I'd like to see if replacing them with something simpler could
be worthwhile.
I haven't actually benchmarked (or even tested) this code yet, but
the basic idea is that we want to ignore multiple references to the
same page if they happen really close to each other, and only keep
a page on the active list if it got accessed again on a time scale
that matters to the pageout code. In other words, filtering out
correlated references in a simpler way.
Opinions ?
Signed-off-by: Rik van Riel <[email protected]>
include/linux/page-flags.h | 7 +++++++
mm/filemap.c | 11 ++---------
mm/shmem.c | 7 ++-----
mm/swap.c | 11 ++---------
mm/vmscan.c | 29 ++++++-----------------------
5 files changed, 19 insertions(+), 46 deletions(-)
Index: linux-2.6.11/include/linux/page-flags.h
===================================================================
--- linux-2.6.11.orig/include/linux/page-flags.h
+++ linux-2.6.11/include/linux/page-flags.h
@@ -77,6 +77,8 @@
#define PG_nosave_free 19 /* Free, should not be written */
#define PG_uncached 20 /* Page has been mapped as uncached */
+#define PG_new 21 /* Newly allocated page */
+
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
* allowed.
@@ -305,6 +307,11 @@ extern void __mod_page_state(unsigned of
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
+#define PageNew(page) test_bit(PG_new, &(page)->flags)
+#define SetPageNew(page) set_bit(PG_new, &(page)->flags)
+#define ClearPageNew(page) clear_bit(PG_new, &(page)->flags)
+#define TestClearPageNew(page) test_and_clear_bit(PG_new, &(page)->flags)
+
struct page; /* forward declaration */
int test_clear_page_dirty(struct page *page);
Index: linux-2.6.11/mm/filemap.c
===================================================================
--- linux-2.6.11.orig/mm/filemap.c
+++ linux-2.6.11/mm/filemap.c
@@ -370,6 +370,7 @@ int add_to_page_cache(struct page *page,
if (!error) {
page_cache_get(page);
SetPageLocked(page);
+ SetPageNew(page);
page->mapping = mapping;
page->index = offset;
mapping->nrpages++;
@@ -710,7 +711,6 @@ void do_generic_mapping_read(struct addr
unsigned long offset;
unsigned long last_index;
unsigned long next_index;
- unsigned long prev_index;
loff_t isize;
struct page *cached_page;
int error;
@@ -719,7 +719,6 @@ void do_generic_mapping_read(struct addr
cached_page = NULL;
index = *ppos >> PAGE_CACHE_SHIFT;
next_index = index;
- prev_index = ra.prev_page;
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
@@ -776,13 +775,7 @@ page_ok:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- /*
- * When (part of) the same page is read multiple times
- * in succession, only mark it as accessed the first time.
- */
- if (prev_index != index)
- mark_page_accessed(page);
- prev_index = index;
+ mark_page_accessed(page);
/*
* Ok, we have the page, and it's up-to-date, so
Index: linux-2.6.11/mm/swap.c
===================================================================
--- linux-2.6.11.orig/mm/swap.c
+++ linux-2.6.11/mm/swap.c
@@ -115,19 +115,11 @@ void fastcall activate_page(struct page
/*
* Mark a page as having seen activity.
- *
- * inactive,unreferenced -> inactive,referenced
- * inactive,referenced -> active,unreferenced
- * active,unreferenced -> active,referenced
*/
void fastcall mark_page_accessed(struct page *page)
{
- if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
- activate_page(page);
- ClearPageReferenced(page);
- } else if (!PageReferenced(page)) {
+ if (!PageReferenced(page))
SetPageReferenced(page);
- }
}
EXPORT_SYMBOL(mark_page_accessed);
@@ -157,6 +149,7 @@ void fastcall lru_cache_add_active(struc
if (!pagevec_add(pvec, page))
__pagevec_lru_add_active(pvec);
put_cpu_var(lru_add_active_pvecs);
+ ClearPageNew(page);
}
void lru_add_drain(void)
Index: linux-2.6.11/mm/shmem.c
===================================================================
--- linux-2.6.11.orig/mm/shmem.c
+++ linux-2.6.11/mm/shmem.c
@@ -1525,11 +1525,8 @@ static void do_shmem_file_read(struct fi
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- /*
- * Mark the page accessed if we read the beginning.
- */
- if (!offset)
- mark_page_accessed(page);
+
+ mark_page_accessed(page);
} else
page = ZERO_PAGE(0);
Index: linux-2.6.11/mm/vmscan.c
===================================================================
--- linux-2.6.11.orig/mm/vmscan.c
+++ linux-2.6.11/mm/vmscan.c
@@ -225,27 +225,6 @@ static int shrink_slab(unsigned long sca
return 0;
}
-/* Called without lock on whether page is mapped, so answer is unstable */
-static inline int page_mapping_inuse(struct page *page)
-{
- struct address_space *mapping;
-
- /* Page is in somebody's page tables. */
- if (page_mapped(page))
- return 1;
-
- /* Be more reluctant to reclaim swapcache than pagecache */
- if (PageSwapCache(page))
- return 1;
-
- mapping = page_mapping(page);
- if (!mapping)
- return 0;
-
- /* File is mmap'd by somebody? */
- return mapping_mapped(mapping);
-}
-
static inline int is_page_cache_freeable(struct page *page)
{
return page_count(page) - !!PagePrivate(page) == 2;
@@ -398,9 +377,13 @@ static int shrink_list(struct list_head
goto keep_locked;
referenced = page_referenced(page, 1, sc->priority <= 0);
- /* In active use or really unfreeable? Activate it. */
- if (referenced && page_mapping_inuse(page))
+
+ if (referenced) {
+ /* New page. Let's see if it'll get used again... */
+ if (TestClearPageNew(page))
+ goto keep_locked;
goto activate_locked;
+ }
#ifdef CONFIG_SWAP
/*
---
linux-2.6-npiggin/include/linux/mm_inline.h | 1
linux-2.6-npiggin/include/linux/page-flags.h | 8 +++
linux-2.6-npiggin/mm/swap.c | 2
linux-2.6-npiggin/mm/vmscan.c | 62 ++++++++++++++++-----------
mm/page_alloc.c | 0
5 files changed, 49 insertions(+), 24 deletions(-)
diff -puN mm/vmscan.c~vm-page-skipped mm/vmscan.c
--- linux-2.6/mm/vmscan.c~vm-page-skipped 2005-02-09 20:48:24.000000000 +1100
+++ linux-2.6-npiggin/mm/vmscan.c 2005-02-11 20:56:44.000000000 +1100
@@ -361,8 +361,20 @@ static int shrink_list(struct list_head
if (PageWriteback(page))
goto keep_locked;
- if (page_referenced(page, 1, sc->priority <= 0))
- goto activate_locked;
+ if (page_referenced(page, 1, sc->priority <= 0)) {
+ /*
+ * Has been referenced. Activate used twice or
+ * mapped pages, otherwise give it another chance
+ * on the inactive list
+ */
+ if (TestSetPageUsedOnce(page) || mapped)
+ goto activate_locked;
+ if (page_test_clear_pte_dirty(page, 1))
+ set_page_dirty(page);
+ if (PageDirty(page))
+ sc->nr_dirty_inactive++;
+ goto keep_locked;
+ }
#ifdef CONFIG_SWAP
/*
@@ -581,9 +593,10 @@ static void shrink_cache(struct zone *zo
if (TestSetPageLRU(page))
BUG();
list_del(&page->lru);
- if (PageActive(page))
+ if (PageActive(page)) {
+ ClearPageUsedOnce(page);
add_page_to_active_list(zone, page);
- else
+ } else
add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
@@ -617,7 +630,7 @@ done:
static void
refill_inactive_zone(struct zone *zone, struct scan_control *sc)
{
- int pgmoved, pgmoved_dirty;
+ int pgmoved;
int pgdeactivate = 0;
int pgscanned = 0;
int nr_pages = sc->nr_to_scan;
@@ -633,7 +646,6 @@ refill_inactive_zone(struct zone *zone,
lru_add_drain();
pgmoved = 0;
- pgmoved_dirty = 0;
spin_lock_irq(&zone->lru_lock);
while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
@@ -717,24 +729,6 @@ refill_inactive_zone(struct zone *zone,
list_add(&page->lru, &l_inactive);
}
- /*
- * Try to write back as many pages as the number of dirty ones
- * we're adding to the inactive list. This tends to cause slow
- * streaming writers to write data to the disk smoothly, at the
- * dirtying rate, which is nice. But that's undesirable in
- * laptop mode, where we *want* lumpy writeout. So in laptop
- * mode, write out the whole world.
- */
- zone->nr_dirty_inactive += pgmoved_dirty;
- pgmoved_dirty = zone->nr_dirty_inactive;
- if (pgmoved_dirty > zone->nr_inactive / 2
- || (!(laptop_mode && !sc->may_writepage)
- && pgmoved_dirty > SWAP_CLUSTER_MAX)) {
- zone->nr_dirty_inactive = 0;
- wakeup_bdflush(laptop_mode ? 0 : pgmoved_dirty*2);
- sc->may_writepage = 1;
- }
-
pagevec_init(&pvec, 1);
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
@@ -799,6 +793,7 @@ shrink_zone(struct zone *zone, struct sc
{
unsigned long nr_active;
unsigned long nr_inactive;
+ unsigned long count;
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
@@ -819,6 +814,7 @@ shrink_zone(struct zone *zone, struct sc
nr_inactive = 0;
sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
+ sc->nr_dirty_inactive = 0;
while (nr_active || nr_inactive) {
if (nr_active) {
@@ -837,6 +833,24 @@ shrink_zone(struct zone *zone, struct sc
break;
}
}
+
+ /*
+ * Try to write back as many pages as the number of dirty ones
+ * we're adding to the inactive list. This tends to cause slow
+ * streaming writers to write data to the disk smoothly, at the
+ * dirtying rate, which is nice. But that's undesirable in
+ * laptop mode, where we *want* lumpy writeout. So in laptop
+ * mode, write out the whole world.
+ */
+ zone->nr_dirty_inactive += sc->nr_dirty_inactive;
+ count = zone->nr_dirty_inactive;
+ if (count > zone->nr_inactive / 2
+ || (!(laptop_mode && !sc->may_writepage)
+ && count > SWAP_CLUSTER_MAX)) {
+ zone->nr_dirty_inactive = 0;
+ wakeup_bdflush(laptop_mode ? 0 : count*2);
+ sc->may_writepage = 1;
+ }
}
/*
diff -puN include/linux/page-flags.h~vm-page-skipped include/linux/page-flags.h
--- linux-2.6/include/linux/page-flags.h~vm-page-skipped 2005-02-09 20:48:24.000000000 +1100
+++ linux-2.6-npiggin/include/linux/page-flags.h 2005-02-11 20:56:43.000000000 +1100
@@ -76,6 +76,8 @@
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_nosave_free 19 /* Free, should not be written */
+#define PG_usedonce 20 /* LRU page has been touched once */
+
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -293,6 +295,12 @@ extern void __mod_page_state(unsigned of
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)
+#define PageUsedOnce(page) test_bit(PG_usedonce, &(page)->flags)
+#define SetPageUsedOnce(page) set_bit(PG_usedonce, &(page)->flags)
+#define TestSetPageUsedOnce(page) test_and_set_bit(PG_usedonce, &(page)->flags)
+#define ClearPageUsedOnce(page) clear_bit(PG_usedonce, &(page)->flags)
+#define TestClearPageUsedOnce(page) test_and_clear_bit(PG_usedonce, &(page)->flags)
+
#ifdef CONFIG_SWAP
#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags)
#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags)
diff -puN mm/truncate.c~vm-page-skipped mm/truncate.c
diff -puN mm/swap.c~vm-page-skipped mm/swap.c
--- linux-2.6/mm/swap.c~vm-page-skipped 2005-02-09 20:48:24.000000000 +1100
+++ linux-2.6-npiggin/mm/swap.c 2005-02-11 20:56:43.000000000 +1100
@@ -267,6 +267,7 @@ void __pagevec_lru_add(struct pagevec *p
}
if (TestSetPageLRU(page))
BUG();
+ ClearPageUsedOnce(page);
add_page_to_inactive_list(zone, page);
}
if (zone)
@@ -296,6 +297,7 @@ void __pagevec_lru_add_active(struct pag
BUG();
if (TestSetPageActive(page))
BUG();
+ ClearPageUsedOnce(page);
add_page_to_active_list(zone, page);
}
if (zone)
diff -puN include/linux/swap.h~vm-page-skipped include/linux/swap.h
diff -puN include/linux/mm_inline.h~vm-page-skipped include/linux/mm_inline.h
--- linux-2.6/include/linux/mm_inline.h~vm-page-skipped 2005-02-09 20:48:24.000000000 +1100
+++ linux-2.6-npiggin/include/linux/mm_inline.h 2005-02-11 20:56:43.000000000 +1100
@@ -35,6 +35,7 @@ del_page_from_lru(struct zone *zone, str
ClearPageActive(page);
zone->nr_active--;
} else {
+ ClearPageUsedOnce(page);
zone->nr_inactive--;
}
}
diff -puN mm/page_alloc.c~vm-page-skipped mm/page_alloc.c
_
Rik van Riel writes:
> The special cases in the use-once code have annoyed me for a while,
> and I'd like to see if replacing them with something simpler could
> be worthwhile.
>
> I haven't actually benchmarked (or even tested) this code yet, but
> the basic idea is that we want to ignore multiple references to the
> same page if they happen really close to each other, and only keep
> a page on the active list if it got accessed again on a time scale
> that matters to the pageout code. In other words, filtering out
> correlated references in a simpler way.
>
> Opinions ?
[...]
> - * active,unreferenced -> active,referenced
> */
> void fastcall mark_page_accessed(struct page *page)
> {
> - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
> - activate_page(page);
> - ClearPageReferenced(page);
> - } else if (!PageReferenced(page)) {
> + if (!PageReferenced(page))
> SetPageReferenced(page);
> - }
> }
So file system pages never get to the active list? Hmm... this doesn't
sound right.
>
> EXPORT_SYMBOL(mark_page_accessed);
> @@ -157,6 +149,7 @@ void fastcall lru_cache_add_active(struc
> if (!pagevec_add(pvec, page))
[...]
> goto keep_locked;
>
> referenced = page_referenced(page, 1, sc->priority <= 0);
> - /* In active use or really unfreeable? Activate it. */
> - if (referenced && page_mapping_inuse(page))
> +
> + if (referenced) {
> + /* New page. Let's see if it'll get used again... */
> + if (TestClearPageNew(page))
> + goto keep_locked;
> goto activate_locked;
> + }
This will screw scanning most likely: no referenced page is ever
reclaimed unless lowest scanning priority is reached---this looks like
sure way to oom and has capacity to increase CPU consumption
significantly.
>
> #ifdef CONFIG_SWAP
Nikita.
On Tue, 3 May 2005, Nick Piggin wrote:
> I think the biggest problem with our twine and duct tape page reclaim
> scheme is that it somehow *works* (for some value of works).
> I think we branch a new tree for all interested VM developers to work
> on and try to get it performing well. Probably try to restrict it to page
> reclaim and related fundamentals so it stays as small as possible and
> worth testing.
Sounds great. I'd be willing to maintain a quilt tree for
this - in fact, I've already got a few patches ;)
Also, we should probably keep track of exactly what we're
working towards. I've put my ideas on a wiki page, feel
free to add yours - probably a new page for stuff that's
not page replacement related ;)
http://wiki.linux-mm.org/wiki/AdvancedPageReplacement
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
Rik van Riel wrote:
> On Tue, 3 May 2005, Nick Piggin wrote:
>
>> I think the biggest problem with our twine and duct tape page reclaim
>> scheme is that it somehow *works* (for some value of works).
>
>
>> I think we branch a new tree for all interested VM developers to work
>> on and try to get it performing well. Probably try to restrict it to page
>> reclaim and related fundamentals so it stays as small as possible and
>> worth testing.
>
>
> Sounds great. I'd be willing to maintain a quilt tree for
> this - in fact, I've already got a few patches ;)
>
OK I'll help maintain and review patches.
Also having a box or two for running regression and stress
testing is a must. I can do a bit here, but unfortunately
"kernel compiles until it hurts" is probably not the best
workload to target.
In general most systems and their workloads aren't constantly
swapping, so we should aim to minimise IO for normal
workloads. Databases that use the pagecache (eg. postgresql)
would be a good test. But again we don't want to focus on one
thing.
That said, of course we don't want to hurt the "really
thrashing" case - and hopefully improve it if possible.
> Also, we should probably keep track of exactly what we're
> working towards. I've put my ideas on a wiki page, feel
> free to add yours - probably a new page for stuff that's
> not page replacement related ;)
>
I guess some of those page replacement algorithms would be
really interesting to explore - and we definitely should be
looking into them at some stage as part of our -vm tree.
Though my initial aims are to first simplify and rationalise
and unify things where possible.
NUMA "issues" related to page reclaim are also interesting to
me.
I'll try to get some time to get my patches into shape in the
next week or so.
--
SUSE Labs, Novell Inc.
On Wed, 4 May 2005, Nick Piggin wrote:
>
> Also having a box or two for running regression and stress
> testing is a must. I can do a bit here, but unfortunately
> "kernel compiles until it hurts" is probably not the best
> workload to target.
>
> In general most systems and their workloads aren't constantly
> swapping, so we should aim to minimise IO for normal
> workloads. Databases that use the pagecache (eg. postgresql)
> would be a good test. But again we don't want to focus on one
> thing.
>
> That said, of course we don't want to hurt the "really
> thrashing" case - and hopefully improve it if possible.
may I suggest using OpenOffice as one test, it can eat up horrendous
amounts of ram in operation (I have one spreadsheet I can send you if
needed that takes 45min of cpu time on an Athlon64 3200 with 1G of ram just
to open, at which time it shows openoffice taking more than 512M of ram)
David Lang
--
There are two ways of constructing a software design. One way is to make it so simple that there are obviously no deficiencies. And the other way is to make it so complicated that there are no obvious deficiencies.
-- C.A.R. Hoare
On Tue, 3 May 2005, Nikita Danilov wrote:
> So file system pages never get to the active list? Hmm... this doesn't
> sound right.
Yes they will, I changed vmscan.c to compensate.
> > + if (referenced) {
> > + /* New page. Let's see if it'll get used again... */
> > + if (TestClearPageNew(page))
> > + goto keep_locked;
> > goto activate_locked;
> > + }
>
> This will screw scanning most likely: no referenced page is ever
> reclaimed unless lowest scanning priority is reached---this looks like
> sure way to oom and has capacity to increase CPU consumption
> significantly.
Doubtful, the VM always manages to clear the referenced bits
faster than they can be set, especially on pages that aren't
even mapped!
The problem you describe should be a lot worse for mapped
pages, where we already do not run into the problem, despite
the VM taking no precautions.
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
On Tue, May 03, 2005 at 05:51:43PM -0700, David Lang wrote:
> On Wed, 4 May 2005, Nick Piggin wrote:
>
> >
> >Also having a box or two for running regression and stress
> >testing is a must. I can do a bit here, but unfortunately
> >"kernel compiles until it hurts" is probably not the best
> >workload to target.
if there are some tests or output (kernel logs, etc)
or proc info or vmstat or whatever, which doesn't take
100% cpu time, I'm able and willing to test it on different
workloads (including compiling the kernel until it hurts ;)
> >In general most systems and their workloads aren't constantly
> >swapping, so we should aim to minimise IO for normal
> >workloads. Databases that use the pagecache (eg. postgresql)
> >would be a good test. But again we don't want to focus on one
> >thing.
> >
> >That said, of course we don't want to hurt the "really
> >thrashing" case - and hopefully improve it if possible.
>
> may I suggest using OpenOffice as one test, it can eat up horrendous
> amounts of ram in operation (I have one spreadsheet I can send you if
> needed that takes 45min of cpu time on an Athlon64 3200 with 1G of ram just
> to open, at which time it shows openoffice taking more than 512M of ram)
cool, looks like they are taking the MS compatibility
really serious nowadays ...
best,
Herbert
> David Lang
>
> --
> There are two ways of constructing a software design. One way is to make it
> so simple that there are obviously no deficiencies. And the other way is to
> make it so complicated that there are no obvious deficiencies.
> -- C.A.R. Hoare
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Fri, 2005-05-20 at 20:16 +0200, Herbert Poetzl wrote:
> cool, looks like they are taking the MS compatibility
> really serious nowadays ...
>
Um... I don't know when you last used Windows, but most Linux desktop
GUI apps are way more bloated than the Windows counterparts. Take a
look at some of the Gnome bounties for reducing bloat - some of them are
just embarrassing.
Quick demo: with a recent Gnome, open the file selector dialog, and
browse to /usr/bin. The disk goes nuts for 20 seconds before the file
list is even displayed. Now hit cancel and try it again. No disk
activity this time, but the CPU pegs for 7-8 seconds before the files
are displayed.
Now try the same on Windows. It's instantaneous.
Lee
--On Friday, May 20, 2005 20:16:06 +0200 Herbert Poetzl <[email protected]> wrote:
> On Tue, May 03, 2005 at 05:51:43PM -0700, David Lang wrote:
>> On Wed, 4 May 2005, Nick Piggin wrote:
>>
>> >
>> > Also having a box or two for running regression and stress
>> > testing is a must. I can do a bit here, but unfortunately
>> > "kernel compiles until it hurts" is probably not the best
>> > workload to target.
>
> if there are some tests or output (kernel logs, etc)
> or proc info or vmstat or whatever, which doesn't take
> 100% cpu time, I'm able and willing to test it on different
> workloads (including compiling the kernel until it hurts ;)
I did take that patch and run a bunch of tests on it across a few
different architectures. Everything worked fine, no perf differences
either way ... but then I may not have actually put it under memory
pressure, so it might not be ideal testing ;-)
M.