2005-12-30 22:40:18

by Peter Zijlstra

Subject: [PATCH] vm: page-replace and clockpro

Hi All,

These two patch sets implement a new page replacement algorithm based on
CLOCK-Pro.

The first patch set, page-replace-*, abstracts the current page replacement code
and moves it into its own file, mm/page_replace.c.

The second patch set, clockpro-*, then implements a new replacement algorithm by
reimplementing the hooks introduced in the previous set.
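
For orientation, the interface that the page-replace-* set carves out looks
roughly like this (a condensed sketch collected from the patches below, not a
literal header excerpt):

    /* include/linux/mm_page_replace.h, sketch */
    void page_replace_init_zone(struct zone *zone);         /* set up per-zone state */
    void __page_replace_insert(struct zone *zone, struct page *page); /* insert a page */
    void page_replace_activate(struct page *page);          /* record a reference */
    void page_replace_candidates(struct zone *zone, int nr_to_scan,
                                 struct list_head *page_list); /* grab reclaim candidates */
    void page_replace_reinsert(struct zone *zone, struct list_head *page_list); /* put back unfreeable pages */

The clockpro-* set provides its own implementation behind the same interface.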


Andrew, Nick, the kswapd-incmin patch is there again ;-)
I know there is still some disagreement on this patch; however, without
it reclaim truly sucks rocks with this code.
What happens is that ZONE_DMA is severely overscanned and the clockpro
implementation cannot handle this nicely.


PeterZ


2005-12-30 22:40:38

by Peter Zijlstra

Subject: [PATCH 02/14] page-replace-try_pageout.patch


From: Peter Zijlstra <[email protected]>

Move the functionality of the shrink_list() loop into its own function.

Signed-off-by: Peter Zijlstra <[email protected]>

mm/vmscan.c | 324 ++++++++++++++++++++++++++++++++----------------------------
1 file changed, 174 insertions(+), 150 deletions(-)
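
In short, the resulting loop body in shrink_list() becomes (condensed from the
hunks below):

    switch (try_pageout(page, sc)) {
    case PAGEOUT_ACTIVATE:          /* page is in active use */
            SetPageActive(page);
            pgactivate++;
            /* fall through */
    case PAGEOUT_KEEP:              /* could not reclaim, put it back */
            list_add(&page->lru, &ret_pages);
            break;
    case PAGEOUT_SUCCESS:           /* page was reclaimed and freed */
            reclaimed++;
            if (!pagevec_add(&freed_pvec, page))
                    __pagevec_release_nonlru(&freed_pvec);
            break;
    }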

Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -51,6 +51,16 @@ typedef enum {
PAGE_CLEAN,
} pageout_t;

+/* possible outcome of try_pageout() */
+typedef enum {
+ /* unable to perform pageout */
+ PAGEOUT_KEEP,
+ /* unable to perform pageout, page is in active use */
+ PAGEOUT_ACTIVATE,
+ /* pageout succeeded, page is gone */
+ PAGEOUT_SUCCESS,
+} try_pageout_t;
+
struct scan_control {
/* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
unsigned long nr_to_scan;
@@ -382,189 +392,203 @@ static pageout_t pageout(struct page *pa
return PAGE_CLEAN;
}

-/*
- * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
- */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static try_pageout_t try_pageout(struct page *page, struct scan_control *sc)
{
- LIST_HEAD(ret_pages);
- struct pagevec freed_pvec;
- int pgactivate = 0;
- int reclaimed = 0;
-
- cond_resched();
-
- pagevec_init(&freed_pvec, 1);
- while (!list_empty(page_list)) {
- struct address_space *mapping;
- struct page *page;
- int may_enter_fs;
- int referenced;
-
- cond_resched();
-
- page = lru_to_page(page_list);
- list_del(&page->lru);
+ struct address_space *mapping;
+ int may_enter_fs;
+ int referenced;

- if (TestSetPageLocked(page))
- goto keep;
+ if (TestSetPageLocked(page))
+ goto keep;

- BUG_ON(PageActive(page));
+ BUG_ON(PageActive(page));

+ sc->nr_scanned++;
+ /* Double the slab pressure for mapped and swapcache pages */
+ if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
- /* Double the slab pressure for mapped and swapcache pages */
- if (page_mapped(page) || PageSwapCache(page))
- sc->nr_scanned++;

- if (PageWriteback(page))
- goto keep_locked;
+ if (PageWriteback(page))
+ goto keep_locked;

- referenced = page_referenced(page, 1);
- /* In active use or really unfreeable? Activate it. */
- if (referenced && page_mapping_inuse(page))
- goto activate_locked;
+ referenced = page_referenced(page, 1);
+ /* In active use or really unfreeable? Activate it. */
+ if (referenced && page_mapping_inuse(page))
+ goto activate_locked;

#ifdef CONFIG_SWAP
- /*
- * Anonymous process memory has backing store?
- * Try to allocate it some swap space here.
- */
- if (PageAnon(page) && !PageSwapCache(page)) {
- if (!sc->may_swap)
- goto keep_locked;
- if (!add_to_swap(page))
- goto activate_locked;
- }
+ /*
+ * Anonymous process memory has backing store?
+ * Try to allocate it some swap space here.
+ */
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!sc->may_swap)
+ goto keep_locked;
+ if (!add_to_swap(page))
+ goto activate_locked;
+ }
#endif /* CONFIG_SWAP */

- mapping = page_mapping(page);
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+ mapping = page_mapping(page);
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

- /*
- * The page is mapped into the page tables of one or more
- * processes. Try to unmap it here.
- */
- if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page)) {
- case SWAP_FAIL:
- goto activate_locked;
- case SWAP_AGAIN:
- goto keep_locked;
- case SWAP_SUCCESS:
- ; /* try to free the page below */
- }
+ /*
+ * The page is mapped into the page tables of one or more
+ * processes. Try to unmap it here.
+ */
+ if (page_mapped(page) && mapping) {
+ switch (try_to_unmap(page)) {
+ case SWAP_FAIL:
+ goto activate_locked;
+ case SWAP_AGAIN:
+ goto keep_locked;
+ case SWAP_SUCCESS:
+ ; /* try to free the page below */
}
+ }

- if (PageDirty(page)) {
- if (referenced)
- goto keep_locked;
- if (!may_enter_fs)
- goto keep_locked;
- if (laptop_mode && !sc->may_writepage)
- goto keep_locked;
+ if (PageDirty(page)) {
+ if (referenced)
+ goto keep_locked;
+ if (!may_enter_fs)
+ goto keep_locked;
+ if (laptop_mode && !sc->may_writepage)
+ goto keep_locked;

- /* Page is dirty, try to write it out here */
- switch(pageout(page, mapping)) {
- case PAGE_KEEP:
- goto keep_locked;
- case PAGE_ACTIVATE:
- goto activate_locked;
- case PAGE_SUCCESS:
- if (PageWriteback(page) || PageDirty(page))
- goto keep;
+ /* Page is dirty, try to write it out here */
+ switch(pageout(page, mapping)) {
+ case PAGE_KEEP:
+ goto keep_locked;
+ case PAGE_ACTIVATE:
+ goto activate_locked;
+ case PAGE_SUCCESS:
+ if (PageWriteback(page) || PageDirty(page))
+ goto keep;
/*
- * A synchronous write - probably a ramdisk. Go
- * ahead and try to reclaim the page.
- */
- if (TestSetPageLocked(page))
- goto keep;
- if (PageDirty(page) || PageWriteback(page))
- goto keep_locked;
- mapping = page_mapping(page);
- case PAGE_CLEAN:
- ; /* try to free the page below */
- }
+ * A synchronous write - probably a ramdisk. Go
+ * ahead and try to reclaim the page.
+ */
+ if (TestSetPageLocked(page))
+ goto keep;
+ if (PageDirty(page) || PageWriteback(page))
+ goto keep_locked;
+ mapping = page_mapping(page);
+ case PAGE_CLEAN:
+ ; /* try to free the page below */
}
+ }

- /*
- * If the page has buffers, try to free the buffer mappings
- * associated with this page. If we succeed we try to free
- * the page as well.
- *
- * We do this even if the page is PageDirty().
- * try_to_release_page() does not perform I/O, but it is
- * possible for a page to have PageDirty set, but it is actually
- * clean (all its buffers are clean). This happens if the
- * buffers were written out directly, with submit_bh(). ext3
- * will do this, as well as the blockdev mapping.
- * try_to_release_page() will discover that cleanness and will
- * drop the buffers and mark the page clean - it can be freed.
- *
- * Rarely, pages can have buffers and no ->mapping. These are
- * the pages which were not successfully invalidated in
- * truncate_complete_page(). We try to drop those buffers here
- * and if that worked, and the page is no longer mapped into
- * process address space (page_count == 1) it can be freed.
- * Otherwise, leave the page on the LRU so it is swappable.
- */
- if (PagePrivate(page)) {
- if (!try_to_release_page(page, sc->gfp_mask))
- goto activate_locked;
- if (!mapping && page_count(page) == 1)
- goto free_it;
- }
+ /*
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we try to free
+ * the page as well.
+ *
+ * We do this even if the page is PageDirty().
+ * try_to_release_page() does not perform I/O, but it is
+ * possible for a page to have PageDirty set, but it is actually
+ * clean (all its buffers are clean). This happens if the
+ * buffers were written out directly, with submit_bh(). ext3
+ * will do this, as well as the blockdev mapping.
+ * try_to_release_page() will discover that cleanness and will
+ * drop the buffers and mark the page clean - it can be freed.
+ *
+ * Rarely, pages can have buffers and no ->mapping. These are
+ * the pages which were not successfully invalidated in
+ * truncate_complete_page(). We try to drop those buffers here
+ * and if that worked, and the page is no longer mapped into
+ * process address space (page_count == 1) it can be freed.
+ * Otherwise, leave the page on the LRU so it is swappable.
+ */
+ if (PagePrivate(page)) {
+ if (!try_to_release_page(page, sc->gfp_mask))
+ goto activate_locked;
+ if (!mapping && page_count(page) == 1)
+ goto free_it;
+ }

- if (!mapping)
- goto keep_locked; /* truncate got there first */
+ if (!mapping)
+ goto keep_locked; /* truncate got there first */

- write_lock_irq(&mapping->tree_lock);
+ write_lock_irq(&mapping->tree_lock);

- /*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
- */
- if (unlikely(page_count(page) != 2))
- goto cannot_free;
- smp_rmb();
- if (unlikely(PageDirty(page)))
- goto cannot_free;
+ /*
+ * The non-racy check for busy page. It is critical to check
+ * PageDirty _after_ making sure that the page is freeable and
+ * not in use by anybody. (pagecache + us == 2)
+ */
+ if (unlikely(page_count(page) != 2))
+ goto cannot_free;
+ smp_rmb();
+ if (unlikely(PageDirty(page)))
+ goto cannot_free;

#ifdef CONFIG_SWAP
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page_private(page) };
- __delete_from_swap_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- swap_free(swap);
- __put_page(page); /* The pagecache ref */
- goto free_it;
- }
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) };
+ __delete_from_swap_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ swap_free(swap);
+ __put_page(page); /* The pagecache ref */
+ goto free_it;
+ }
#endif /* CONFIG_SWAP */

- __remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- __put_page(page);
+ __remove_from_page_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ __put_page(page);

free_it:
- unlock_page(page);
- reclaimed++;
- if (!pagevec_add(&freed_pvec, page))
- __pagevec_release_nonlru(&freed_pvec);
- continue;
+ unlock_page(page);
+ return PAGEOUT_SUCCESS;

cannot_free:
- write_unlock_irq(&mapping->tree_lock);
- goto keep_locked;
+ write_unlock_irq(&mapping->tree_lock);
+ goto keep_locked;

activate_locked:
- SetPageActive(page);
- pgactivate++;
+ unlock_page(page);
+ return PAGEOUT_ACTIVATE;
keep_locked:
- unlock_page(page);
+ unlock_page(page);
keep:
- list_add(&page->lru, &ret_pages);
- BUG_ON(PageLRU(page));
+ return PAGEOUT_KEEP;
+}
+
+static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+{
+ LIST_HEAD(ret_pages);
+ struct pagevec freed_pvec;
+ int pgactivate = 0;
+ int reclaimed = 0;
+
+ cond_resched();
+
+ pagevec_init(&freed_pvec, 1);
+ while (!list_empty(page_list)) {
+ struct page *page;
+
+ cond_resched();
+
+ page = lru_to_page(page_list);
+ list_del(&page->lru);
+
+ switch(try_pageout(page, sc)) {
+ case PAGEOUT_ACTIVATE:
+ SetPageActive(page);
+ pgactivate++;
+ /* fall through */
+ case PAGEOUT_KEEP:
+ list_add(&page->lru, &ret_pages);
+ BUG_ON(PageLRU(page));
+ break;
+
+ case PAGEOUT_SUCCESS:
+ reclaimed++;
+ if (!pagevec_add(&freed_pvec, page))
+ __pagevec_release_nonlru(&freed_pvec);
+ break;
+ }
}
list_splice(&ret_pages, page_list);
if (pagevec_count(&freed_pvec))

2005-12-30 22:40:28

by Peter Zijlstra

Subject: [PATCH 01/14] page-replace-single-batch-insert.patch


From: Peter Zijlstra <[email protected]>

page-replace interface function:
__page_replace_insert()

This function inserts a page into the page replace data structure.

Unify the active and inactive per-CPU page lists. For now, provide insertion
hints using the LRU-specific page flags.

Signed-off-by: Peter Zijlstra <[email protected]>

fs/exec.c | 3 +-
include/linux/mm_page_replace.h | 12 ++++++++++
include/linux/swap.h | 2 -
mm/Makefile | 2 -
mm/memory.c | 9 +++++--
mm/page_replace.c | 11 +++++++++
mm/swap.c | 47 +---------------------------------------
mm/swap_state.c | 3 +-
8 files changed, 36 insertions(+), 53 deletions(-)
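
The caller-side pattern, as in the hunks below:

    /* old */
    lru_cache_add_active(page);

    /* new: the activeness hint travels as a page flag */
    SetPageActive(page);
    lru_cache_add(page);

and on the insertion side the flag selects the list:

    void __page_replace_insert(struct zone *zone, struct page *page)
    {
            if (PageActive(page))
                    add_page_to_active_list(zone, page);
            else
                    add_page_to_inactive_list(zone, page);
    }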

Index: linux-2.6-git/fs/exec.c
===================================================================
--- linux-2.6-git.orig/fs/exec.c
+++ linux-2.6-git/fs/exec.c
@@ -321,7 +321,8 @@ void install_arg_page(struct vm_area_str
goto out;
}
inc_mm_counter(mm, anon_rss);
- lru_cache_add_active(page);
+ SetPageActive(page);
+ lru_cache_add(page);
set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
page_add_anon_rmap(page, vma, address);
Index: linux-2.6-git/include/linux/swap.h
===================================================================
--- linux-2.6-git.orig/include/linux/swap.h
+++ linux-2.6-git/include/linux/swap.h
@@ -163,8 +163,6 @@ extern unsigned int nr_free_pagecache_pa

/* linux/mm/swap.c */
extern void FASTCALL(lru_cache_add(struct page *));
-extern void FASTCALL(lru_cache_add_active(struct page *));
-extern void FASTCALL(activate_page(struct page *));
extern void FASTCALL(mark_page_accessed(struct page *));
extern void lru_add_drain(void);
extern int rotate_reclaimable_page(struct page *page);
Index: linux-2.6-git/mm/memory.c
===================================================================
--- linux-2.6-git.orig/mm/memory.c
+++ linux-2.6-git/mm/memory.c
@@ -1497,7 +1497,8 @@ gotten:
ptep_establish(vma, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- lru_cache_add_active(new_page);
+ SetPageActive(new_page);
+ lru_cache_add(new_page);
page_add_anon_rmap(new_page, vma, address);

/* Free the old page.. */
@@ -1953,7 +1954,8 @@ static int do_anonymous_page(struct mm_s
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
- lru_cache_add_active(page);
+ SetPageActive(page);
+ lru_cache_add(page);
SetPageReferenced(page);
page_add_anon_rmap(page, vma, address);
} else {
@@ -2085,7 +2087,8 @@ retry:
set_pte_at(mm, address, page_table, entry);
if (anon) {
inc_mm_counter(mm, anon_rss);
- lru_cache_add_active(new_page);
+ SetPageActive(new_page);
+ lru_cache_add(new_page);
page_add_anon_rmap(new_page, vma, address);
} else {
inc_mm_counter(mm, file_rss);
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -30,6 +30,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/init.h>
+#include <linux/mm_page_replace.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;
@@ -134,7 +135,6 @@ EXPORT_SYMBOL(mark_page_accessed);
* @page: the page to add
*/
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };

void fastcall lru_cache_add(struct page *page)
{
@@ -146,25 +146,12 @@ void fastcall lru_cache_add(struct page
put_cpu_var(lru_add_pvecs);
}

-void fastcall lru_cache_add_active(struct page *page)
-{
- struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
-
- page_cache_get(page);
- if (!pagevec_add(pvec, page))
- __pagevec_lru_add_active(pvec);
- put_cpu_var(lru_add_active_pvecs);
-}
-
void lru_add_drain(void)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &__get_cpu_var(lru_add_active_pvecs);
- if (pagevec_count(pvec))
- __pagevec_lru_add_active(pvec);
put_cpu_var(lru_add_pvecs);
}

@@ -301,7 +288,7 @@ void __pagevec_lru_add(struct pagevec *p
}
if (TestSetPageLRU(page))
BUG();
- add_page_to_inactive_list(zone, page);
+ __page_replace_insert(zone, page);
}
if (zone)
spin_unlock_irq(&zone->lru_lock);
@@ -311,33 +298,6 @@ void __pagevec_lru_add(struct pagevec *p

EXPORT_SYMBOL(__pagevec_lru_add);

-void __pagevec_lru_add_active(struct pagevec *pvec)
-{
- int i;
- struct zone *zone = NULL;
-
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- if (TestSetPageLRU(page))
- BUG();
- if (TestSetPageActive(page))
- BUG();
- add_page_to_active_list(zone, page);
- }
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
- pagevec_reinit(pvec);
-}
-
/*
* Try to drop buffers from the pages in a pagevec
*/
@@ -419,9 +379,6 @@ static void lru_drain_cache(unsigned int
/* CPU is dead, so no locking needed. */
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &per_cpu(lru_add_active_pvecs, cpu);
- if (pagevec_count(pvec))
- __pagevec_lru_add_active(pvec);
}

/* Drop the CPU's cached committed space back into the central pool. */
Index: linux-2.6-git/mm/swap_state.c
===================================================================
--- linux-2.6-git.orig/mm/swap_state.c
+++ linux-2.6-git/mm/swap_state.c
@@ -353,7 +353,8 @@ struct page *read_swap_cache_async(swp_e
/*
* Initiate read into locked page and return.
*/
- lru_cache_add_active(new_page);
+ SetPageActive(new_page);
+ lru_cache_add(new_page);
swap_readpage(NULL, new_page);
return new_page;
}
Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- /dev/null
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_MM_PAGE_REPLACE_H
+#define _LINUX_MM_PAGE_REPLACE_H
+
+#ifdef __KERNEL__
+
+#include <linux/mmzone.h>
+#include <linux/mm.h>
+
+void __page_replace_insert(struct zone *, struct page *);
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_MM_PAGE_REPLACE_H */
Index: linux-2.6-git/mm/page_replace.c
===================================================================
--- /dev/null
+++ linux-2.6-git/mm/page_replace.c
@@ -0,0 +1,11 @@
+#include <linux/mm_page_replace.h>
+#include <linux/mm_inline.h>
+
+
+void __page_replace_insert(struct zone *zone, struct page *page)
+{
+ if (PageActive(page))
+ add_page_to_active_list(zone, page);
+ else
+ add_page_to_inactive_list(zone, page);
+}
Index: linux-2.6-git/mm/Makefile
===================================================================
--- linux-2.6-git.orig/mm/Makefile
+++ linux-2.6-git/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
readahead.o slab.o swap.o truncate.o vmscan.o \
- prio_tree.o $(mmu-y)
+ prio_tree.o page_replace.o $(mmu-y)

obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o

2005-12-30 22:40:55

by Peter Zijlstra

Subject: [PATCH 03/14] page-replace-remove-sc-from-refill.patch


From: Peter Zijlstra <[email protected]>

Remove the dependency on struct scan_control from
refill_inactive_zone() so it can be moved into the page replacement
file, which doesn't know anything about scan_control.

Signed-off-by: Peter Zijlstra <[email protected]>

mm/vmscan.c | 12 +++---------
1 files changed, 3 insertions(+), 9 deletions(-)
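
In short (from the hunks below), the function now takes the scan count directly
and reads the mapped-page count from the global page state:

    static void refill_inactive_zone(struct zone *zone, int nr_pages);

    /* inside refill_inactive_zone(): */
    mapped_ratio = (read_page_state(nr_mapped) * 100) / total_memory;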

Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c 2005-12-10 17:13:56.000000000 +0100
+++ linux-2.6-git/mm/vmscan.c 2005-12-10 18:19:39.000000000 +0100
@@ -71,8 +71,6 @@ struct scan_control {
/* Incremented by the number of pages reclaimed */
unsigned long nr_reclaimed;

- unsigned long nr_mapped; /* From page_state */
-
/* How many pages shrink_cache() should reclaim */
int nr_to_reclaim;

@@ -730,12 +728,11 @@ done:
* But we had to alter page->flags anyway.
*/
static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+refill_inactive_zone(struct zone *zone, int nr_pages)
{
int pgmoved;
int pgdeactivate = 0;
int pgscanned;
- int nr_pages = sc->nr_to_scan;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
LIST_HEAD(l_active); /* Pages to go onto the active_list */
@@ -765,7 +762,7 @@ refill_inactive_zone(struct zone *zone,
* mapped memory instead of just pagecache. Work out how much memory
* is mapped.
*/
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+ mapped_ratio = (read_page_state(nr_mapped) * 100) / total_memory;

/*
* Now decide how much we really want to unmap some pages. The mapped
@@ -892,7 +889,7 @@ shrink_zone(struct zone *zone, struct sc
sc->nr_to_scan = min(nr_active,
(unsigned long)sc->swap_cluster_max);
nr_active -= sc->nr_to_scan;
- refill_inactive_zone(zone, sc);
+ refill_inactive_zone(zone, sc->nr_to_scan);
}

if (nr_inactive) {
@@ -991,7 +988,6 @@ int try_to_free_pages(struct zone **zone
}

for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
sc.priority = priority;
@@ -1080,7 +1076,6 @@ loop_again:
sc.gfp_mask = GFP_KERNEL;
sc.may_writepage = 0;
sc.may_swap = 1;
- sc.nr_mapped = read_page_state(nr_mapped);

inc_page_state(pageoutrun);

@@ -1397,7 +1392,6 @@ int zone_reclaim(struct zone *zone, gfp_
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
sc.may_swap = 0;
- sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
/* scan at the highest priority */

2005-12-30 22:41:39

by Peter Zijlstra

Subject: [PATCH 04/14] page-replace-activate_page.patch


From: Peter Zijlstra <[email protected]>

page-replace interface function:
page_replace_activate()

This function will modify the page state for a reference action.

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_page_replace.h | 4 ++++
mm/vmscan.c | 3 ++-
2 files changed, 6 insertions(+), 1 deletion(-)

Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h 2005-12-10 17:13:56.000000000 +0100
+++ linux-2.6-git/include/linux/mm_page_replace.h 2005-12-10 18:19:30.000000000 +0100
@@ -7,6 +7,10 @@
#include <linux/mm.h>

void __page_replace_insert(struct zone *, struct page *);
+static inline void page_replace_activate(struct page *page)
+{
+ SetPageActive(page);
+}

#endif /* __KERNEL__ */
#endif /* _LINUX_MM_PAGE_REPLACE_H */
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c 2005-12-10 17:13:57.000000000 +0100
+++ linux-2.6-git/mm/vmscan.c 2005-12-10 18:19:34.000000000 +0100
@@ -33,6 +33,7 @@
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
+#include <linux/mm_page_replace.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -573,7 +574,7 @@ static int shrink_list(struct list_head

switch(try_pageout(page, sc)) {
case PAGEOUT_ACTIVATE:
- SetPageActive(page);
+ page_replace_activate(page);
pgactivate++;
/* fall through */
case PAGEOUT_KEEP:

2005-12-30 22:41:12

by Peter Zijlstra

Subject: [PATCH 05/14] page-replace-remove-loop.patch


From: Peter Zijlstra <[email protected]>

This patch removes the loop in shrink_cache() by directly taking
sc->nr_to_scan pages.

Kudos to Wu Fengguang, who did a similar patch.

Signed-off-by: Peter Zijlstra <[email protected]>

mm/vmscan.c | 88 +++++++++++++++++++++++++++---------------------------------
1 files changed, 41 insertions(+), 47 deletions(-)

Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c 2005-12-10 17:13:57.000000000 +0100
+++ linux-2.6-git/mm/vmscan.c 2005-12-10 18:19:30.000000000 +0100
@@ -653,61 +653,55 @@ static void shrink_cache(struct zone *zo
{
LIST_HEAD(page_list);
struct pagevec pvec;
- int max_scan = sc->nr_to_scan;
-
- pagevec_init(&pvec, 1);
+ struct page *page;
+ int nr_taken;
+ int nr_scan;
+ int nr_freed;

lru_add_drain();
+
spin_lock_irq(&zone->lru_lock);
- while (max_scan > 0) {
- struct page *page;
- int nr_taken;
- int nr_scan;
- int nr_freed;
-
- nr_taken = isolate_lru_pages(sc->swap_cluster_max,
- &zone->inactive_list,
- &page_list, &nr_scan);
- zone->nr_inactive -= nr_taken;
- zone->pages_scanned += nr_scan;
- spin_unlock_irq(&zone->lru_lock);
+ nr_taken = isolate_lru_pages(sc->nr_to_scan,
+ &zone->inactive_list,
+ &page_list, &nr_scan);
+ zone->nr_inactive -= nr_taken;
+ zone->pages_scanned += nr_scan;
+ spin_unlock_irq(&zone->lru_lock);

- if (nr_taken == 0)
- goto done;
+ if (nr_taken == 0)
+ return;

- max_scan -= nr_scan;
- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
- nr_freed = shrink_list(&page_list, sc);
- if (current_is_kswapd())
- mod_page_state(kswapd_steal, nr_freed);
- mod_page_state_zone(zone, pgsteal, nr_freed);
- sc->nr_to_reclaim -= nr_freed;
+ if (current_is_kswapd())
+ mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+ else
+ mod_page_state_zone(zone, pgscan_direct, nr_scan);
+ nr_freed = shrink_list(&page_list, sc);
+ if (current_is_kswapd())
+ mod_page_state(kswapd_steal, nr_freed);
+ mod_page_state_zone(zone, pgsteal, nr_freed);
+ sc->nr_to_reclaim -= nr_freed;

- spin_lock_irq(&zone->lru_lock);
- /*
- * Put back any unfreeable pages.
- */
- while (!list_empty(&page_list)) {
- page = lru_to_page(&page_list);
- if (TestSetPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (PageActive(page))
- add_page_to_active_list(zone, page);
- else
- add_page_to_inactive_list(zone, page);
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
+ /*
+ * Put back any unfreeable pages.
+ */
+ pagevec_init(&pvec, 1);
+ spin_lock_irq(&zone->lru_lock);
+ while (!list_empty(&page_list)) {
+ page = lru_to_page(&page_list);
+ if (TestSetPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (PageActive(page))
+ add_page_to_active_list(zone, page);
+ else
+ add_page_to_inactive_list(zone, page);
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
}
- }
+ }
spin_unlock_irq(&zone->lru_lock);
-done:
pagevec_release(&pvec);
}

2005-12-30 22:42:51

by Peter Zijlstra

Subject: [PATCH 13/14] page-replace-init.patch


From: Peter Zijlstra <[email protected]>

page-replace interface function:
page_replace_init_zone()

This function initializes the page-replace-specific members of
struct zone.

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_page_replace.h | 1 +
mm/page_alloc.c | 6 ++----
mm/page_replace.c | 8 ++++++++
3 files changed, 11 insertions(+), 4 deletions(-)

Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -38,6 +38,7 @@
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

+void __init page_replace_init_zone(struct zone *);
void __page_replace_insert(struct zone *, struct page *);
void page_replace_candidates(struct zone *, int, struct list_head *);

Index: linux-2.6-git/mm/page_alloc.c
===================================================================
--- linux-2.6-git.orig/mm/page_alloc.c
+++ linux-2.6-git/mm/page_alloc.c
@@ -36,6 +36,7 @@
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/mm_page_replace.h>

#include <asm/tlbflush.h>
#include "internal.h"
@@ -2007,11 +2008,7 @@ static void __init free_area_init_core(s
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;

zone_pcp_init(zone);
- INIT_LIST_HEAD(&zone->active_list);
- INIT_LIST_HEAD(&zone->inactive_list);
- zone->nr_scan_active = 0;
- zone->nr_active = 0;
- zone->nr_inactive = 0;
+ page_replace_init_zone(zone);
atomic_set(&zone->reclaim_in_progress, 0);
if (!size)
continue;
Index: linux-2.6-git/mm/page_replace.c
===================================================================
--- linux-2.6-git.orig/mm/page_replace.c
+++ linux-2.6-git/mm/page_replace.c
@@ -22,6 +22,15 @@ static int __init page_replace_init(void

module_init(page_replace_init)

+void __init page_replace_init_zone(struct zone *zone)
+{
+ INIT_LIST_HEAD(&zone->active_list);
+ INIT_LIST_HEAD(&zone->inactive_list);
+ zone->nr_active = 0;
+ zone->nr_inactive = 0;
+ zone->nr_scan_active = 0;
+}
+
static inline void
add_page_to_inactive_list(struct zone *zone, struct page *page)
{

2005-12-30 22:42:51

by Peter Zijlstra

Subject: [PATCH 14/14] page-replace-kswapd-incmin.patch


From: Nick Piggin <[email protected]>

Explicitly teach kswapd about the incremental min logic instead of just scanning
all zones under the first low zone. This should apply more even pressure across
the zones.

The new shrink_zone() logic exposes the very worst side of the current
balance_pgdat() function. Without this patch reclaim is limited to ZONE_DMA.

Signed-off-by: Nick Piggin <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>


mm/vmscan.c | 97 +++++++++++++++++++++---------------------------------------
1 file changed, 34 insertions(+), 63 deletions(-)
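
Conceptually, the new balance_pgdat() inner loop looks like this (a condensed
sketch of the hunks below; the swsusp, priority and all_unreclaimable handling
is omitted). Each zone is checked against its own high watermark and only zones
that are actually low get scanned:

    for (i = 0; i < pgdat->nr_zones; i++) {
            struct zone *zone = pgdat->node_zones + i;

            if (zone->present_pages == 0)
                    continue;
            /* skip zones that already have enough free pages */
            if (zone_watermark_ok(zone, order, zone->pages_high, 0, 0))
                    continue;
            all_zones_ok = 0;

            lru_pages += zone->nr_active + zone->nr_inactive;
            shrink_zone(zone, &sc);
    }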

Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -790,46 +790,18 @@ loop_again:
}

for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;

+ all_zones_ok = 1;
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
+ sc.priority = priority;
+ sc.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX;
+
/* The swap token gets in the way of swapout... */
if (!priority)
disable_swap_token();

- all_zones_ok = 1;
-
- if (nr_pages == 0) {
- /*
- * Scan in the highmem->dma direction for the highest
- * zone which needs scanning
- */
- for (i = pgdat->nr_zones - 1; i >= 0; i--) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (zone->present_pages == 0)
- continue;
-
- if (zone->all_unreclaimable &&
- priority != DEF_PRIORITY)
- continue;
-
- if (!zone_watermark_ok(zone, order,
- zone->pages_high, 0, 0)) {
- end_zone = i;
- goto scan;
- }
- }
- goto out;
- } else {
- end_zone = pgdat->nr_zones - 1;
- }
-scan:
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- lru_pages += zone->nr_active + zone->nr_inactive;
- }

/*
* Now scan the zone in the dma->highmem direction, stopping
@@ -840,51 +812,51 @@ scan:
* pages behind kswapd's direction of progress, which would
* cause too much scanning of the lower zones.
*/
- for (i = 0; i <= end_zone; i++) {
+ for (i = 0; i <= pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
- int nr_slab;

if (zone->present_pages == 0)
continue;

+ if (nr_pages == 0) { /* Not software suspend */
+ if (zone_watermark_ok(zone, order,
+ zone->pages_high, 0, 0))
+ continue;
+
+ all_zones_ok = 0;
+ }
+
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;

- if (nr_pages == 0) { /* Not software suspend */
- if (!zone_watermark_ok(zone, order,
- zone->pages_high, end_zone, 0))
- all_zones_ok = 0;
- }
zone->temp_priority = priority;
if (zone->prev_priority > priority)
zone->prev_priority = priority;
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = priority;
- sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
+ lru_pages += zone->nr_active + zone->nr_inactive;
+
atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
atomic_dec(&zone->reclaim_in_progress);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_reclaimed += sc.nr_reclaimed;
- total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
- continue;
- if (nr_slab == 0 && zone->pages_scanned >=
+
+ if (zone->pages_scanned >=
(zone->nr_active + zone->nr_inactive) * 4)
zone->all_unreclaimable = 1;
- /*
- * If we've done a decent amount of scanning and
- * the reclaim ratio is low, start doing writepage
- * even in laptop mode
- */
- if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > total_reclaimed+total_reclaimed/2)
- sc.may_writepage = 1;
}
+ reclaim_state->reclaimed_slab = 0;
+ shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_reclaimed += sc.nr_reclaimed;
+ total_scanned += sc.nr_scanned;
+
+ /*
+ * If we've done a decent amount of scanning and
+ * the reclaim ratio is low, start doing writepage
+ * even in laptop mode
+ */
+ if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+ total_scanned > total_reclaimed+total_reclaimed/2)
+ sc.may_writepage = 1;
+
if (nr_pages && to_free > total_reclaimed)
continue; /* swsusp: need to do more work */
if (all_zones_ok)
@@ -905,7 +877,6 @@ scan:
if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
break;
}
-out:
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;

2005-12-30 22:42:00

by Peter Zijlstra

Subject: [PATCH 08/14] page-replace-candidates.patch


From: Peter Zijlstra <[email protected]>

page-replace interface function:
page_replace_candidates()

Abstract the taking of candidate reclaim pages and place the new function
in page_replace.c, the place where all list manipulation happens.

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_page_replace.h | 2 ++
mm/page_replace.c | 20 ++++++++++++++++++++
mm/vmscan.c | 17 ++---------------
3 files changed, 24 insertions(+), 15 deletions(-)
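
The shrink_cache() side then reduces to (from the mm/vmscan.c hunk below):

    page_replace_candidates(zone, sc->nr_to_scan, &page_list);
    if (list_empty(&page_list))
            return;

    nr_freed = shrink_list(&page_list, sc);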

Index: linux-2.6-git-2/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git-2.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git-2/include/linux/mm_page_replace.h
@@ -5,6 +5,7 @@

#include <linux/mmzone.h>
#include <linux/mm.h>
+#include <linux/list.h>

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

@@ -37,6 +38,7 @@
#endif

void __page_replace_insert(struct zone *, struct page *);
+void page_replace_candidates(struct zone *, int, struct list_head *);
static inline void page_replace_activate(struct page *page)
{
SetPageActive(page);
Index: linux-2.6-git-2/mm/page_replace.c
===================================================================
--- linux-2.6-git-2.orig/mm/page_replace.c
+++ linux-2.6-git-2/mm/page_replace.c
@@ -1,5 +1,6 @@
#include <linux/mm_page_replace.h>
#include <linux/mm_inline.h>
+#include <linux/swap.h>


void __page_replace_insert(struct zone *zone, struct page *page)
@@ -58,3 +59,22 @@ int isolate_lru_pages(int nr_to_scan, st
*scanned = scan;
return nr_taken;
}
+
+void page_replace_candidates(struct zone *zone, int nr_to_scan, struct list_head *page_list)
+{
+ int nr_taken;
+ int nr_scan;
+
+ spin_lock_irq(&zone->lru_lock);
+ nr_taken = isolate_lru_pages(nr_to_scan, &zone->inactive_list,
+ page_list, &nr_scan);
+ zone->nr_inactive -= nr_taken;
+ zone->pages_scanned += nr_scan;
+ spin_unlock_irq(&zone->lru_lock);
+
+ if (current_is_kswapd())
+ mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+ else
+ mod_page_state_zone(zone, pgscan_direct, nr_scan);
+}
+
Index: linux-2.6-git-2/mm/vmscan.c
===================================================================
--- linux-2.6-git-2.orig/mm/vmscan.c
+++ linux-2.6-git-2/mm/vmscan.c
@@ -575,27 +575,14 @@ static void shrink_cache(struct zone *zo
LIST_HEAD(page_list);
struct pagevec pvec;
struct page *page;
- int nr_taken;
- int nr_scan;
int nr_freed;

lru_add_drain();

- spin_lock_irq(&zone->lru_lock);
- nr_taken = isolate_lru_pages(sc->nr_to_scan,
- &zone->inactive_list,
- &page_list, &nr_scan);
- zone->nr_inactive -= nr_taken;
- zone->pages_scanned += nr_scan;
- spin_unlock_irq(&zone->lru_lock);
-
- if (nr_taken == 0)
+ page_replace_candidates(zone, sc->nr_to_scan, &page_list);
+ if (list_empty(&page_list))
return;

- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
nr_freed = shrink_list(&page_list, sc);
if (current_is_kswapd())
mod_page_state(kswapd_steal, nr_freed);

2005-12-30 22:42:03

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 06/14] page-replace-move-macros.patch


From: Peter Zijlstra <[email protected]>

Move some utility macros to the new common header so they can be used
by the code moved over to page_replace.c.

Signed-off-by: Peter Zijlstra <[email protected]>

--- linux-2.6-git.orig/include/linux/mm_page_replace.h 2005-12-10 20:50:44.000000000 +0100
+++ linux-2.6-git/include/linux/mm_page_replace.h 2005-12-10 20:53:12.000000000 +0100
@@ -6,6 +6,36 @@
#include <linux/mmzone.h>
#include <linux/mm.h>

+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+#ifdef ARCH_HAS_PREFETCH
+#define prefetch_prev_lru_page(_page, _base, _field) \
+ do { \
+ if ((_page)->lru.prev != _base) { \
+ struct page *prev; \
+ \
+ prev = lru_to_page(&(_page->lru)); \
+ prefetch(&prev->_field); \
+ } \
+ } while (0)
+#else
+#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+#ifdef ARCH_HAS_PREFETCHW
+#define prefetchw_prev_lru_page(_page, _base, _field) \
+ do { \
+ if ((_page)->lru.prev != _base) { \
+ struct page *prev; \
+ \
+ prev = lru_to_page(&(_page->lru)); \
+ prefetchw(&prev->_field); \
+ } \
+ } while (0)
+#else
+#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
void __page_replace_insert(struct zone *, struct page *);
static inline void page_replace_activate(struct page *page)
{
--- linux-2.6-git.orig/mm/vmscan.c 2005-12-10 20:50:44.000000000 +0100
+++ linux-2.6-git/mm/vmscan.c 2005-12-10 20:53:12.000000000 +0100
@@ -104,36 +104,6 @@
long nr; /* objs pending delete */
};

-#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
-
-#ifdef ARCH_HAS_PREFETCH
-#define prefetch_prev_lru_page(_page, _base, _field) \
- do { \
- if ((_page)->lru.prev != _base) { \
- struct page *prev; \
- \
- prev = lru_to_page(&(_page->lru)); \
- prefetch(&prev->_field); \
- } \
- } while (0)
-#else
-#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
-#endif
-
-#ifdef ARCH_HAS_PREFETCHW
-#define prefetchw_prev_lru_page(_page, _base, _field) \
- do { \
- if ((_page)->lru.prev != _base) { \
- struct page *prev; \
- \
- prev = lru_to_page(&(_page->lru)); \
- prefetchw(&prev->_field); \
- } \
- } while (0)
-#else
-#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
-#endif
-
/*
* From 0 .. 100. Higher means more swappy.
*/

2005-12-30 22:42:11

by Peter Zijlstra

Subject: [PATCH 10/14] page-replace-remove-mm_inline.patch


From: Peter Zijlstra <[email protected]>

Move the few mm_inline functions to the new files.

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_inline.h | 40 ----------------------------------------
include/linux/mm_page_replace.h | 33 +++++++++++++++++++++++++++++++++
mm/page_replace.c | 7 ++++++-
mm/swap.c | 1 -
mm/vmscan.c | 1 -
5 files changed, 39 insertions(+), 43 deletions(-)

Index: linux-2.6-git/include/linux/mm_inline.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_inline.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-static inline void
-add_page_to_active_list(struct zone *zone, struct page *page)
-{
- list_add(&page->lru, &zone->active_list);
- zone->nr_active++;
-}
-
-static inline void
-add_page_to_inactive_list(struct zone *zone, struct page *page)
-{
- list_add(&page->lru, &zone->inactive_list);
- zone->nr_inactive++;
-}
-
-static inline void
-del_page_from_active_list(struct zone *zone, struct page *page)
-{
- list_del(&page->lru);
- zone->nr_active--;
-}
-
-static inline void
-del_page_from_inactive_list(struct zone *zone, struct page *page)
-{
- list_del(&page->lru);
- zone->nr_inactive--;
-}
-
-static inline void
-del_page_from_lru(struct zone *zone, struct page *page)
-{
- list_del(&page->lru);
- if (PageActive(page)) {
- ClearPageActive(page);
- zone->nr_active--;
- } else {
- zone->nr_inactive--;
- }
-}
Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -47,5 +47,38 @@ void page_replace_reinsert(struct zone *

int isolate_lru_pages(int, struct list_head *, struct list_head *, int *);

+static inline void
+add_page_to_active_list(struct zone *zone, struct page *page)
+{
+ list_add(&page->lru, &zone->active_list);
+ zone->nr_active++;
+}
+
+static inline void
+del_page_from_active_list(struct zone *zone, struct page *page)
+{
+ list_del(&page->lru);
+ zone->nr_active--;
+}
+
+static inline void
+del_page_from_inactive_list(struct zone *zone, struct page *page)
+{
+ list_del(&page->lru);
+ zone->nr_inactive--;
+}
+
+static inline void
+del_page_from_lru(struct zone *zone, struct page *page)
+{
+ list_del(&page->lru);
+ if (PageActive(page)) {
+ ClearPageActive(page);
+ zone->nr_active--;
+ } else {
+ zone->nr_inactive--;
+ }
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_PAGE_REPLACE_H */
Index: linux-2.6-git/mm/page_replace.c
===================================================================
--- linux-2.6-git.orig/mm/page_replace.c
+++ linux-2.6-git/mm/page_replace.c
@@ -1,8 +1,13 @@
#include <linux/mm_page_replace.h>
-#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/pagevec.h>

+static inline void
+add_page_to_inactive_list(struct zone *zone, struct page *page)
+{
+ list_add(&page->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+}

void __page_replace_insert(struct zone *zone, struct page *page)
{
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -22,7 +22,6 @@
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/mm_inline.h>
#include <linux/buffer_head.h> /* for try_to_release_page() */
#include <linux/module.h>
#include <linux/percpu_counter.h>
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -24,7 +24,6 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
-#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>

2005-12-30 22:43:39

by Peter Zijlstra

Subject: [PATCH 2/9] clockpro-nonresident-del.patch


From: Peter Zijlstra <[email protected]>

Since the coupling of the resident and nonresident clocks depends on the
actual number of nonresident pages present, stale entries influence the
actual behaviour. Hence we remove the nonresident pages from the map.

Signed-off-by: Peter Zijlstra <[email protected]>

mm/memory.c | 24 ++++++++++++++++++++++++
mm/swapfile.c | 12 ++++++++++--
2 files changed, 34 insertions(+), 2 deletions(-)
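
The pattern used below: once swap_entry_free() reports that the slot has no
users left, the corresponding entry is taken out of the nonresident map
(nonresident_get() is introduced earlier in this series):

    p = swap_info_get(entry);
    if (p) {
            if (!swap_entry_free(p, swp_offset(entry)))     /* last user gone */
                    nonresident_get(&swapper_space, entry.val); /* drop the stale entry */
            spin_unlock(&swap_lock);
    }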

Index: linux-2.6-git/mm/swapfile.c
===================================================================
--- linux-2.6-git.orig/mm/swapfile.c
+++ linux-2.6-git/mm/swapfile.c
@@ -278,7 +278,8 @@ void swap_free(swp_entry_t entry)

p = swap_info_get(entry);
if (p) {
- swap_entry_free(p, swp_offset(entry));
+ if (!swap_entry_free(p, swp_offset(entry)))
+ nonresident_get(&swapper_space, entry.val);
spin_unlock(&swap_lock);
}
}
@@ -375,8 +376,15 @@ void free_swap_and_cache(swp_entry_t ent

p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, swp_offset(entry)) == 1)
+ switch (swap_entry_free(p, swp_offset(entry))) {
+ case 1:
page = find_trylock_page(&swapper_space, entry.val);
+ break;
+
+ case 0:
+ nonresident_get(&swapper_space, entry.val);
+ break;
+ }
spin_unlock(&swap_lock);
}
if (page) {
Index: linux-2.6-git/mm/memory.c
===================================================================
--- linux-2.6-git.orig/mm/memory.c
+++ linux-2.6-git/mm/memory.c
@@ -595,6 +595,27 @@ int copy_page_range(struct mm_struct *ds
return 0;
}

+static void free_file(struct vm_area_struct *vma,
+ unsigned long offset)
+{
+ struct address_space *mapping;
+ struct page *page;
+
+ if (!vma ||
+ !vma->vm_file ||
+ !vma->vm_file->f_mapping)
+ return;
+
+ mapping = vma->vm_file->f_mapping;
+ page = find_get_page(mapping, offset);
+ if (page) {
+ page_cache_release(page);
+ return;
+ }
+
+ nonresident_get(mapping, offset);
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -610,6 +631,7 @@ static unsigned long zap_pte_range(struc
do {
pte_t ptent = *pte;
if (pte_none(ptent)) {
+ free_file(vma, pte_to_pgoff(ptent));
(*zap_work)--;
continue;
}
@@ -668,6 +690,8 @@ static unsigned long zap_pte_range(struc
continue;
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
+ else
+ free_file(vma, pte_to_pgoff(ptent));
pte_clear_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

2005-12-30 22:44:25

by Peter Zijlstra

Subject: [PATCH 7/9] clockpro-remove-old.patch


From: Peter Zijlstra <[email protected]>

Remove the old page replacement code, unused now.

Signed-off-by: Peter Zijlstra <[email protected]>

mm/page_replace.c | 298 ------------------------------------------------------
1 file changed, 298 deletions(-)

Index: linux-2.6-git/mm/page_replace.c
===================================================================
--- linux-2.6-git.orig/mm/page_replace.c
+++ /dev/null
@@ -1,298 +0,0 @@
-#include <linux/mm_page_replace.h>
-#include <linux/swap.h>
-#include <linux/pagevec.h>
-#include <linux/init.h>
-#include <linux/rmap.h>
-#include <linux/buffer_head.h> /* for try_to_release_page(),
- buffer_heads_over_limit */
-
-/*
- * From 0 .. 100. Higher means more swappy.
- */
-int vm_swappiness = 60;
-static long total_memory;
-
-static void refill_inactive_zone(struct zone *, int);
-
-static int __init page_replace_init(void)
-{
- total_memory = nr_free_pagecache_pages();
- return 0;
-}
-
-module_init(page_replace_init)
-
-void __init page_replace_init_zone(struct zone *zone)
-{
- INIT_LIST_HEAD(&zone->active_list);
- INIT_LIST_HEAD(&zone->inactive_list);
- zone->nr_active = 0;
- zone->nr_inactive = 0;
- zone->nr_scan_active = 0;
-}
-
-static inline void
-add_page_to_inactive_list(struct zone *zone, struct page *page)
-{
- list_add(&page->lru, &zone->inactive_list);
- zone->nr_inactive++;
-}
-
-void __page_replace_insert(struct zone *zone, struct page *page)
-{
- if (PageActive(page))
- add_page_to_active_list(zone, page);
- else
- add_page_to_inactive_list(zone, page);
-}
-
-/*
- * zone->lru_lock is heavily contended. Some of the functions that
- * shrink the lists perform better by taking out a batch of pages
- * and working on them outside the LRU lock.
- *
- * For pagecache intensive workloads, this function is the hottest
- * spot in the kernel (apart from copy_*_user functions).
- *
- * Appropriate locks must be held before calling this function.
- *
- * @nr_to_scan: The number of pages to look through on the list.
- * @src: The LRU list to pull pages off.
- * @dst: The temp list to put pages on to.
- * @scanned: The number of pages that were scanned.
- *
- * returns how many pages were moved onto *@dst.
- */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
-{
- int nr_taken = 0;
- struct page *page;
- int scan = 0;
-
- while (scan++ < nr_to_scan && !list_empty(src)) {
- page = lru_to_page(src);
- prefetchw_prev_lru_page(page, src, flags);
-
- if (!TestClearPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (get_page_testone(page)) {
- /*
- * It is being freed elsewhere
- */
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, src);
- continue;
- } else {
- list_add(&page->lru, dst);
- nr_taken++;
- }
- }
-
- *scanned = scan;
- return nr_taken;
-}
-
-void page_replace_candidates(struct zone *zone, int nr_to_scan, struct list_head *page_list)
-{
- int nr_taken;
- int nr_scan;
- unsigned long long nr_scan_active;
-
- spin_lock_irq(&zone->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, &zone->inactive_list,
- page_list, &nr_scan);
- zone->nr_inactive -= nr_taken;
- zone->pages_scanned += nr_scan;
- spin_unlock_irq(&zone->lru_lock);
-
- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
-
- /*
- * Add one to `nr_to_scan' just to make sure that the kernel will
- * slowly sift through the active list.
- */
- nr_scan_active = (nr_scan + 1ULL) * zone->nr_active * 1024ULL;
- do_div(nr_scan_active, zone->nr_inactive + nr_taken + 1UL);
- zone->nr_scan_active += nr_scan_active;
- while (zone->nr_scan_active >= SWAP_CLUSTER_MAX * 1024UL) {
- zone->nr_scan_active -= SWAP_CLUSTER_MAX * 1024UL;
- refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
- }
-}
-
-/*
- * Put back any unfreeable pages.
- */
-void page_replace_reinsert(struct zone *zone, struct list_head *page_list)
-{
- struct pagevec pvec;
-
- pagevec_init(&pvec, 1);
- spin_lock_irq(&zone->lru_lock);
- while (!list_empty(page_list)) {
- struct page *page = lru_to_page(page_list);
- BUG_ON(PageLRU(page));
- SetPageLRU(page);
- list_del(&page->lru);
- if (PageActive(page))
- add_page_to_active_list(zone, page);
- else
- add_page_to_inactive_list(zone, page);
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
-}
-
-/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_count against each page.
- * But we had to alter page->flags anyway.
- */
-static void refill_inactive_zone(struct zone *zone, int nr_pages)
-{
- int pgmoved;
- int pgdeactivate = 0;
- int pgscanned;
- LIST_HEAD(l_hold); /* The pages which were snipped off */
- LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
- LIST_HEAD(l_active); /* Pages to go onto the active_list */
- struct page *page;
- struct pagevec pvec;
- int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
-
- lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
- pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned);
- zone->pages_scanned += pgscanned;
- zone->nr_active -= pgmoved;
- spin_unlock_irq(&zone->lru_lock);
-
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
-
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (read_page_state(nr_mapped) * 100) / total_memory;
-
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
-
- while (!list_empty(&l_hold)) {
- cond_resched();
- page = lru_to_page(&l_hold);
- list_del(&page->lru);
- if (page_mapped(page)) {
- if (!reclaim_mapped ||
- (total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0, 0)) {
- list_add(&page->lru, &l_active);
- continue;
- }
- }
- list_add(&page->lru, &l_inactive);
- }
-
- pagevec_init(&pvec, 1);
- pgmoved = 0;
- spin_lock_irq(&zone->lru_lock);
- while (!list_empty(&l_inactive)) {
- page = lru_to_page(&l_inactive);
- prefetchw_prev_lru_page(page, &l_inactive, flags);
- if (TestSetPageLRU(page))
- BUG();
- if (!TestClearPageActive(page))
- BUG();
- list_move(&page->lru, &zone->inactive_list);
- pgmoved++;
- if (!pagevec_add(&pvec, page)) {
- zone->nr_inactive += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pgdeactivate += pgmoved;
- pgmoved = 0;
- if (buffer_heads_over_limit)
- pagevec_strip(&pvec);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- zone->nr_inactive += pgmoved;
- pgdeactivate += pgmoved;
- if (buffer_heads_over_limit) {
- spin_unlock_irq(&zone->lru_lock);
- pagevec_strip(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
-
- pgmoved = 0;
- while (!list_empty(&l_active)) {
- page = lru_to_page(&l_active);
- prefetchw_prev_lru_page(page, &l_active, flags);
- if (TestSetPageLRU(page))
- BUG();
- BUG_ON(!PageActive(page));
- list_move(&page->lru, &zone->active_list);
- pgmoved++;
- if (!pagevec_add(&pvec, page)) {
- zone->nr_active += pgmoved;
- pgmoved = 0;
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- zone->nr_active += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
-
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
-}
-

2005-12-30 22:43:40

by Peter Zijlstra

Subject: [PATCH 4/9] clockpro-use-once.patch


From: Rik van Riel <[email protected]>

Simplify the use-once code. I have not benchmarked this change yet,
but I expect it to have little impact on most workloads. It gets rid
of some magic code though, which is nice.

Modified to work with this CLOCK-Pro implementation.

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_page_replace.h | 14 --------------
mm/filemap.c | 11 ++---------
mm/shmem.c | 6 +-----
mm/swap.c | 27 +--------------------------
mm/swapfile.c | 4 ++--
mm/vmscan.c | 25 ++-----------------------
6 files changed, 8 insertions(+), 79 deletions(-)
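
After this patch the use-once machinery reduces to the following (from the
mm/swap.c and mm/filemap.c hunks below): a reference only sets PG_referenced,
and pages newly added to the page cache start out with the (clockpro-specific,
introduced earlier in this series) test bit set:

    void fastcall mark_page_accessed(struct page *page)
    {
            if (!PageReferenced(page))
                    SetPageReferenced(page);
    }

    /* in add_to_page_cache(): */
    SetPageTest(page);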

Index: linux-2.6-git/mm/filemap.c
===================================================================
--- linux-2.6-git.orig/mm/filemap.c
+++ linux-2.6-git/mm/filemap.c
@@ -387,6 +387,7 @@ int add_to_page_cache(struct page *page,
if (!error) {
page_cache_get(page);
SetPageLocked(page);
+ SetPageTest(page);
page->mapping = mapping;
page->index = offset;
mapping->nrpages++;
@@ -726,7 +727,6 @@ void do_generic_mapping_read(struct addr
unsigned long offset;
unsigned long last_index;
unsigned long next_index;
- unsigned long prev_index;
loff_t isize;
struct page *cached_page;
int error;
@@ -735,7 +735,6 @@ void do_generic_mapping_read(struct addr
cached_page = NULL;
index = *ppos >> PAGE_CACHE_SHIFT;
next_index = index;
- prev_index = ra.prev_page;
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;

@@ -782,13 +781,7 @@ page_ok:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);

- /*
- * When (part of) the same page is read multiple times
- * in succession, only mark it as accessed the first time.
- */
- if (prev_index != index)
- mark_page_accessed(page);
- prev_index = index;
+ mark_page_accessed(page);

/*
* Ok, we have the page, and it's up-to-date, so
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -90,37 +90,12 @@ int rotate_reclaimable_page(struct page
}

/*
- * FIXME: speed this up?
- */
-void fastcall activate_page(struct page *page)
-{
- struct zone *zone = page_zone(page);
-
- spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && !PageActive(page)) {
- del_page_from_inactive_list(zone, page);
- SetPageActive(page);
- add_page_to_active_list(zone, page);
- inc_page_state(pgactivate);
- }
- spin_unlock_irq(&zone->lru_lock);
-}
-
-/*
* Mark a page as having seen activity.
- *
- * inactive,unreferenced -> inactive,referenced
- * inactive,referenced -> active,unreferenced
- * active,unreferenced -> active,referenced
*/
void fastcall mark_page_accessed(struct page *page)
{
- if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
- activate_page(page);
- ClearPageReferenced(page);
- } else if (!PageReferenced(page)) {
+ if (!PageReferenced(page))
SetPageReferenced(page);
- }
}

EXPORT_SYMBOL(mark_page_accessed);
Index: linux-2.6-git/mm/swapfile.c
===================================================================
--- linux-2.6-git.orig/mm/swapfile.c
+++ linux-2.6-git/mm/swapfile.c
@@ -421,7 +421,7 @@ static void unuse_pte(struct vm_area_str
* Move the page to the active list so it is not
* immediately swapped out again after swapon.
*/
- activate_page(page);
+ mark_page_accessed(page);
}

static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -523,7 +523,7 @@ static int unuse_mm(struct mm_struct *mm
* Activate page so shrink_cache is unlikely to unmap its
* ptes while lock is dropped, so swapoff can make progress.
*/
- activate_page(page);
+ mark_page_accessed(page);
unlock_page(page);
down_read(&mm->mmap_sem);
lock_page(page);
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -219,27 +219,6 @@ static int shrink_slab(unsigned long sca
return ret;
}

-/* Called without lock on whether page is mapped, so answer is unstable */
-static inline int page_mapping_inuse(struct page *page)
-{
- struct address_space *mapping;
-
- /* Page is in somebody's page tables. */
- if (page_mapped(page))
- return 1;
-
- /* Be more reluctant to reclaim swapcache than pagecache */
- if (PageSwapCache(page))
- return 1;
-
- mapping = page_mapping(page);
- if (!mapping)
- return 0;
-
- /* File is mmap'd by somebody? */
- return mapping_mapped(mapping);
-}
-
static inline int is_page_cache_freeable(struct page *page)
{
return page_count(page) - !!PagePrivate(page) == 2;
@@ -374,8 +353,8 @@ static try_pageout_t try_pageout(struct
goto keep_locked;

referenced = page_referenced(page, 1);
- /* In active use or really unfreeable? Activate it. */
- if (referenced && page_mapping_inuse(page))
+
+ if (referenced)
goto activate_locked;

#ifdef CONFIG_SWAP
Index: linux-2.6-git/mm/shmem.c
===================================================================
--- linux-2.6-git.orig/mm/shmem.c
+++ linux-2.6-git/mm/shmem.c
@@ -1499,11 +1499,7 @@ static void do_shmem_file_read(struct fi
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- /*
- * Mark the page accessed if we read the beginning.
- */
- if (!offset)
- mark_page_accessed(page);
+ mark_page_accessed(page);
} else {
page = ZERO_PAGE(0);
page_cache_get(page);
Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -67,20 +67,6 @@ add_page_to_active_list(struct zone *zon
}

static inline void
-del_page_from_active_list(struct zone *zone, struct page *page)
-{
- list_del(&page->lru);
- zone->nr_active--;
-}
-
-static inline void
-del_page_from_inactive_list(struct zone *zone, struct page *page)
-{
- list_del(&page->lru);
- zone->nr_inactive--;
-}
-
-static inline void
del_page_from_lru(struct zone *zone, struct page *page)
{
list_del(&page->lru);

2005-12-30 22:44:05

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 6/9] clockpro-clockpro.patch


From: Peter Zijlstra <[email protected]>

The flesh of the clockpro implementation.

The paper: http://www.cs.wm.edu/hpcs/WWW/HTML/publications/abs05-3.html
describes the algorithm approximated here. It describes a clock with three
hands: hand cold, hand hot and hand test. The table below lists the actions
each hand performs for every page state; a small illustrative sketch of the
hand-cold column follows the table.

res | hot | tst | ref || Hcold | Hhot | Htst || Flt
----+-----+-----+-----++-------+------+------++-----
1 | 1 | 0 | 1 ||=1101 | 1100 |=1101 ||
1 | 1 | 0 | 0 ||=1100 | 1000 |=1100 ||
----+-----+-----+-----++-------+------+------++-----
1 | 0 | 1 | 1 || 1100 | 1001 | 1001 ||
1 | 0 | 1 | 0 ||X0010 | 1000 | 1000 ||
1 | 0 | 0 | 1 || 1010 |=1001 |=1001 ||
1 | 0 | 0 | 0 ||X0000 |=1000 |=1000 ||
----+-----+-----+-----++-------+------+------++-----
----+-----+-----+-----++-------+------+------++-----
0 | 0 | 1 | 1 || | | || 1100
0 | 0 | 1 | 0 ||=0010 |X0000 |X0000 ||
0 | 0 | 0 | 1 || | | || 1010
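
To make the Hcold column concrete, here is a tiny standalone sketch -- not
part of the patches; the bit encoding and the helper name are made up for
illustration -- of what hand cold does to a resident page. The real code
spreads this logic over page_replace_activate() and page_replace_remember()
below:

	/* state bits, in the table's res|hot|tst|ref order */
	enum { REF = 1, TST = 2, HOT = 4, RES = 8 };

	static int hand_cold(int state)
	{
		if (!(state & RES) || (state & HOT))
			return state;		/* '=' rows: left untouched */
		if (state & REF) {
			if (state & TST)
				return RES | HOT;	/* 1011 -> 1100 */
			return RES | TST;		/* 1001 -> 1010 */
		}
		if (state & TST)
			return TST;	/* 1010 -> X0010: evict, remember */
		return 0;		/* 1000 -> X0000: evict and forget */
	}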

The approximation made is the removal of the nonresident pages from the one
clock. The conceptual model is two clocks superimposed, one containing the
resident and one containing the nonresident pages.

Implementation-wise I use something based on Rik van Riel's nonresident code,
which actually approximates a clock with reduced order.

The resident clock with two hands is implemented using two lists, which should
be seen as laid head to tail to form the clock. When one hand laps the other,
the lists are swapped.

Each page has 3 state bits:

hot -> PageHot()
test -> PageTest()
ref -> page_referenced()

(PG_active will be renamed to PG_hot in a following patch; since the semantics
changed, the name is changed as well in order to avoid confusion.)

The HandCold rotation is driven by page reclaim needs. HandCold in turn
drives HandHot: for every page HandCold promotes to hot, HandHot needs to
degrade one hot page to cold.

Changing the cold page target number also influences the HandHot rotation
speed: when it is incremented, the actual number of cold pages will be less
than the desired number and hence we need to degrade some extra hot pages.
When it is decreased, the actual number of cold pages is too large, so we
need to inhibit the degradation of hot pages.

The cold page target count is maintained in zone->nr_cold_target; it is
incremented when a page is referenced in its test period and decremented
when a page's test period expires.

The nonresident CLOCK is coupled to HandHot and is rotated so that when all
resident zone CLOCKs have made one revolution, it too has made one whole
revolution: each time HandHot moves dh pages, the nonresident hand is advanced
by roughly dh * |B|/|R| slots (see __nonres_term()).

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_page_replace.h | 40 +-
include/linux/mmzone.h | 13
mm/Makefile | 2
mm/clockpro.c | 557 ++++++++++++++++++++++++++++++++++++++++
mm/page_alloc.c | 20 -
mm/vmscan.c | 15 -
6 files changed, 603 insertions(+), 44 deletions(-)

Index: linux-2.6-git/include/linux/mmzone.h
===================================================================
--- linux-2.6-git.orig/include/linux/mmzone.h
+++ linux-2.6-git/include/linux/mmzone.h
@@ -140,12 +140,13 @@ struct zone {
ZONE_PADDING(_pad1_)

/* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
- struct list_head active_list;
- struct list_head inactive_list;
- unsigned long nr_scan_active;
- unsigned long nr_active;
- unsigned long nr_inactive;
+ spinlock_t lru_lock;
+ struct list_head list_hand[2];
+ unsigned long nr_resident;
+ unsigned long nr_cold;
+ unsigned long nr_cold_target;
+ unsigned long nr_nonresident_scale;
+
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */

Index: linux-2.6-git/mm/clockpro.c
===================================================================
--- /dev/null
+++ linux-2.6-git/mm/clockpro.c
@@ -0,0 +1,557 @@
+/*
+ * mm/clockpro.c
+ *
+ * Written by Peter Zijlstra <[email protected]>
+ * Released under the GPLv2, see the file COPYING for details.
+ *
+ * res | h/c | tst | ref || Hcold | Hhot | Htst || Flt
+ * ----+-----+-----+-----++-------+------+------++-----
+ * 1 | 1 | 0 | 1 ||=1101 | 1100 |=1101 ||
+ * 1 | 1 | 0 | 0 ||=1100 | 1000 |=1100 ||
+ * ----+-----+-----+-----++-------+------+------++-----
+ * 1 | 0 | 1 | 1 || 1100 | 1001 | 1001 ||
+ * 1 | 0 | 1 | 0 ||X0010 | 1000 | 1000 ||
+ * 1 | 0 | 0 | 1 || 1010 |=1001 |=1001 ||
+ * 1 | 0 | 0 | 0 ||X0000 |=1000 |=1000 ||
+ * ----+-----+-----+-----++-------+------+------++-----
+ * ----+-----+-----+-----++-------+------+------++-----
+ * 0 | 0 | 1 | 1 || | | || 1100
+ * 0 | 0 | 1 | 0 ||=0010 |X0000 |X0000 ||
+ * 0 | 0 | 0 | 1 || | | || 1010
+ *
+ * h/c -> PageHot()
+ * tst -> PageTest()
+ * ref -> page_referenced()
+ *
+ * The HandCold rotation is driven by page reclaim needs. HandCold in turn
+ * drives HandHot, for every page HandCold promotes to hot HandHot needs to
+ * degrade one hot page to cold.
+ *
+ * Changing the cold page target number also has influence on the HandHot
+ * rotation speed, when incremented the actual number of cold pages will be
+ * less than the desired number and hence we need to degrade some extra hot
+ * pages. When decreased, the actual number of cold pages is too large, so
+ * we would need to inhibit the degradation of hot pages.
+ *
+ * The cold page target count is maintained in zone->nr_cold_target; it is
+ * incremented when a page is referenced in its test period and decremented
+ * when a page's test period expires.
+ *
+ * The nonresident CLOCK is coupled to HandHot and is rotated so that when
+ * all resident zone CLOCKs have made one revolution, it too has made one
+ * whole revolution (see __nonres_term()).
+ *
+ * All functions that are prefixed with '__' assume that zone->lru_lock is taken.
+ */
+
+#include <linux/mm_page_replace.h>
+#include <linux/rmap.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+
+#include <asm/div64.h>
+
+/*
+ * From 0 .. 100. Higher means more swappy.
+ */
+int vm_swappiness = 100;
+static long total_memory;
+
+static int __init page_replace_init(void)
+{
+ total_memory = nr_free_pagecache_pages();
+ return 0;
+}
+
+module_init(page_replace_init)
+
+/* Called to initialize the clockpro parameters */
+void __init page_replace_init_zone(struct zone *zone)
+{
+ INIT_LIST_HEAD(&zone->list_hand[0]);
+ INIT_LIST_HEAD(&zone->list_hand[1]);
+ zone->nr_resident = 0;
+ zone->nr_cold = 0;
+ zone->nr_cold_target = zone->pages_high;
+ zone->nr_nonresident_scale = 0;
+}
+
+/*
+ * Increase the cold pages target; limit it to the total number of resident
+ * pages present in the current zone.
+ *
+ * @zone: current zone
+ * @dct: intended increase
+ */
+static void __cold_target_inc(struct zone *zone, unsigned long dct)
+{
+ if (zone->nr_cold_target < zone->nr_resident - dct)
+ zone->nr_cold_target += dct;
+ else
+ zone->nr_cold_target = zone->nr_resident;
+}
+
+/*
+ * Decrease the cold pages target; limit it to the high watermark in order
+ * to always have some pages available for quick reclaim.
+ *
+ * @zone: current zone
+ * @dct: intended decrease
+ */
+static void __cold_target_dec(struct zone *zone, unsigned long dct)
+{
+ if (zone->nr_cold_target > zone->pages_high + dct)
+ zone->nr_cold_target -= dct;
+ else
+ zone->nr_cold_target = zone->pages_high;
+}
+
+static void swap_lists(struct zone *zone)
+{
+ LIST_HEAD(tmp);
+ list_splice_init(&zone->list_hand[0], &tmp);
+ list_splice_init(&zone->list_hand[1], &zone->list_hand[0]);
+ list_splice(&tmp, &zone->list_hand[1]);
+}
+
+static inline
+void __select_list_hand(struct zone *zone, struct list_head *list)
+{
+ if (list_empty(list))
+ swap_lists(zone);
+}
+
+/*
+ * Insert page into @zones clock and update adaptive parameters.
+ *
+ * Several page flags are used for insertion hints:
+ * PG_active - insert as an active page
+ * PG_test - use the use-once logic
+ *
+ * For now we will ignore the active hint; the use once logic is
+ * explained below.
+ *
+ * @zone: target zone.
+ * @page: new page.
+ */
+void __page_replace_insert(struct zone *zone, struct page *page)
+{
+ unsigned int rflags;
+
+ rflags = nonresident_get(page_mapping(page), page_index(page));
+
+ /* ignore the PG_active hint */
+ ClearPageActive(page);
+
+ /* abuse the PG_test flag for pagecache use-once */
+ if (!TestClearPageTest(page)) {
+ /*
+ * Insert (hot) when found in the nonresident list, otherwise
+ * insert as (cold,test). Insert at the head of the Hhot list,
+ * ie. right behind Hcold.
+ */
+ if (rflags & NR_found) {
+ SetPageActive(page);
+ __cold_target_inc(zone, 1);
+ } else {
+ SetPageTest(page);
+ ++zone->nr_cold;
+ }
+ ++zone->nr_resident;
+ __select_list_hand(zone, &zone->list_hand[hand_hot]);
+ list_add(&page->lru, &zone->list_hand[hand_hot]);
+ } else {
+ /*
+ * Pagecache insert; we want to avoid activation on the first
+ * reference (which we know will come); use-once logic.
+ *
+ * This is accomplished by inserting the page one state lower
+ * than usual so the activation that does come ups it to the
+ * normal insert state. Also we insert right behind Hhot so
+ * 1) Hhot cannot interfere; and 2) we lose the first reference
+ * quicker.
+ *
+ * Insert (cold,test)/(cold) so the following activation will
+ * elevate the state to (hot)/(cold,test). (NOTE: the activation
+ * will take care of the cold target increment).
+ */
+ BUG_ON(PageTest(page));
+
+ if (rflags & NR_found) {
+ SetPageTest(page);
+ }
+ ++zone->nr_cold;
+ ++zone->nr_resident;
+ __select_list_hand(zone, &zone->list_hand[hand_cold]);
+ list_add(&page->lru, &zone->list_hand[hand_cold]);
+ }
+
+ BUG_ON(!PageLRU(page));
+}
+
+/*
+ * zone->lru_lock is heavily contended. Some of the functions that
+ * shrink the lists perform better by taking out a batch of pages
+ * and working on them outside the LRU lock.
+ *
+ * For pagecache intensive workloads, this function is the hottest
+ * spot in the kernel (apart from copy_*_user functions).
+ *
+ * @nr_to_scan: The number of pages to look through on the list.
+ * @src: The LRU list to pull pages off.
+ * @dst: The temp list to put pages on to.
+ * @scanned: The number of pages that were scanned.
+ *
+ * returns how many pages were moved onto *@dst.
+ */
+static int isolate_lru_pages(struct zone * zone, int nr_to_scan,
+ struct list_head *src, struct list_head *dst, int *scanned)
+{
+ int nr_taken = 0;
+ struct page *page;
+ int scan = 0;
+
+ spin_lock_irq(&zone->lru_lock);
+ __select_list_hand(zone, src);
+ while (scan++ < nr_to_scan && !list_empty(src)) {
+ page = lru_to_page(src);
+ prefetchw_prev_lru_page(page, src, flags);
+
+ if (!TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (get_page_testone(page)) {
+ /*
+ * It is being freed elsewhere
+ */
+ __put_page(page);
+ SetPageLRU(page);
+ list_add(&page->lru, src);
+ continue;
+ } else {
+ list_add(&page->lru, dst);
+ nr_taken++;
+ if (!PageActive(page))
+ --zone->nr_cold;
+ }
+ }
+ zone->nr_resident -= nr_taken;
+ zone->pages_scanned += scan;
+ spin_unlock_irq(&zone->lru_lock);
+
+ *scanned = scan;
+ return nr_taken;
+}
+
+/*
+ * Add page to a release pagevec, temp. drop zone lock to release pagevec if full.
+ * Set PG_lru, update zone->nr_cold and zone->nr_resident.
+ *
+ * @zone: @pages zone.
+ * @page: page to be released.
+ * @pvec: pagevec to collect pages in.
+ */
+static void __page_release(struct zone *zone, struct page *page,
+ struct pagevec *pvec)
+{
+ if (TestSetPageLRU(page))
+ BUG();
+ if (!PageActive(page))
+ ++zone->nr_cold;
+ ++zone->nr_resident;
+
+ if (!pagevec_add(pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ if (buffer_heads_over_limit)
+ pagevec_strip(pvec);
+ __pagevec_release(pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+}
+
+/*
+ * Try to reclaim a specified number of pages.
+ *
+ * Reclaim cadidates have:
+ * - PG_lru cleared
+ * - 1 extra ref
+ *
+ * NOTE: hot pages are also returned but will be spit back by try_pageout()
+ * this to preserve CLOCK order.
+ *
+ * @zone: target zone to reclaim pages from.
+ * @nr_to_scan: nr of pages to try for reclaim.
+ *
+ * returns candidate list.
+ */
+void page_replace_candidates(struct zone *zone, int nr_to_scan, struct list_head *page_list)
+{
+ int nr_scan;
+
+ isolate_lru_pages(zone, nr_to_scan,
+ &zone->list_hand[hand_cold],
+ page_list, &nr_scan);
+
+ if (current_is_kswapd())
+ mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+ else
+ mod_page_state_zone(zone, pgscan_direct, nr_scan);
+}
+
+/*
+ * Activate a cold page:
+ * cold, !test -> cold, test
+ * cold, test -> hot
+ *
+ * @page: page to activate
+ */
+void page_replace_activate(struct page *page)
+{
+ int hot, test;
+
+ hot = PageActive(page);
+ test = PageTest(page);
+
+ if (hot) {
+ BUG_ON(test);
+ } else {
+ if (test) {
+ SetPageActive(page);
+ /*
+ * Leave PG_test set for new hot pages in order to
+ * recognise then in reinsert() and do accounting.
+ */
+ } else {
+ SetPageTest(page);
+ }
+ }
+}
+
+static int reclaim_mapped(struct zone *);
+static void rotate_hot(struct zone *, int, int, struct pagevec *);
+
+/*
+ * Reinsert those candidate pages that were not freed by try_pageout().
+ * Account pages that were promoted to hot by page_replace_activate().
+ * Rotate hand hot to balance the new hot and lost cold pages vs.
+ * the cold pages target.
+ *
+ * Candidate pages have:
+ * - PG_lru cleared
+ * - 1 extra ref
+ * undo that.
+ *
+ * @zone: zone we're working on.
+ * @page_list: the left over pages.
+ */
+void page_replace_reinsert(struct zone *zone, struct list_head *page_list)
+{
+ struct pagevec pvec;
+ unsigned long dct = 0;
+
+ pagevec_init(&pvec, 1);
+ spin_lock_irq(&zone->lru_lock);
+ __select_list_hand(zone, &zone->list_hand[hand_hot]);
+ while (!list_empty(page_list)) {
+ struct page *page = lru_to_page(page_list);
+ prefetchw_prev_lru_page(page, page_list, flags);
+
+ if (PageActive(page) && PageTest(page)) {
+ ClearPageTest(page);
+ ++dct;
+ }
+
+ list_move(&page->lru, &zone->list_hand[hand_hot]);
+ __page_release(zone, page, &pvec);
+ }
+ __cold_target_inc(zone, dct);
+ spin_unlock_irq(&zone->lru_lock);
+
+ /*
+ * Limit the hot hand to a full revolution.
+ */
+ if (zone->nr_cold < zone->nr_cold_target) {
+ int i, nr = zone->nr_resident / SWAP_CLUSTER_MAX;
+ int rm = reclaim_mapped(zone);
+ for (i = 0; zone->nr_cold < zone->nr_cold_target && i < nr; ++i)
+ rotate_hot(zone, SWAP_CLUSTER_MAX, rm, &pvec);
+ }
+
+ pagevec_release(&pvec);
+}
+
+/*
+ * Puts cold pages that have their test bit set on the non-resident lists.
+ *
+ * @zone: dead pages zone.
+ * @page: dead page.
+ */
+void page_replace_remember(struct zone *zone, struct page *page)
+{
+ if (TestClearPageTest(page)) {
+ int list = nonresident_put(page_mapping(page),
+ page_index(page), NR_b1, NR_b1);
+ if (list != NR_free)
+ __cold_target_dec(zone, 1);
+ }
+}
+
+static unsigned long estimate_pageable_memory(void)
+{
+#if 0
+ static unsigned long next_check;
+ static unsigned long total = 0;
+
+ if (!total || time_after(jiffies, next_check)) {
+ struct zone *z;
+ total = 0;
+ for_each_zone(z)
+ total += z->nr_resident;
+ next_check = jiffies + HZ/10;
+ }
+
+ // gave 0 first time, SIGFPE in kernel sucks
+ // hence the !total
+#else
+ unsigned long total = 0;
+ struct zone *z;
+ for_each_zone(z)
+ total += z->nr_resident;
+#endif
+ return total;
+}
+
+static int reclaim_mapped(struct zone *zone)
+{
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
+
+ /*
+ * `distress' is a measure of how much trouble we're having reclaiming
+ * pages. 0 -> no problems. 100 -> great trouble.
+ */
+ distress = 100 >> zone->prev_priority;
+
+ /*
+ * The point of this algorithm is to decide when to start reclaiming
+ * mapped memory instead of just pagecache. Work out how much memory
+ * is mapped.
+ */
+ mapped_ratio = (read_page_state(nr_mapped) * 100) / total_memory;
+
+ /*
+ * Now decide how much we really want to unmap some pages. The mapped
+ * ratio is downgraded - just because there's a lot of mapped memory
+ * doesn't necessarily mean that page reclaim isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+ /*
+ * Now use this metric to decide whether to start moving mapped memory
+ * onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Rotate the non-resident hand; scale the rotation speed so that when all
+ * hot hands have made one full revolution the non-resident hand will have
+ * too.
+ *
+ * @zone: current zone
+ * @dh: number of pages the hot hand has moved
+ */
+static void __nonres_term(struct zone *zone, unsigned long dh)
+{
+ unsigned long long cycles;
+ /*
+ * |b1| Rhot |B| Rhot
+ * Rtest = ----------- ~ ----------
+ * |r1| |R|
+ *
+ * NOTE depends on |B|, hence include the nonresident_del patch
+ */
+ cycles = zone->nr_nonresident_scale + (unsigned long long)dh * nonresident_estimate();
+ zone->nr_nonresident_scale = do_div(cycles, estimate_pageable_memory() + 1UL);
+ __get_cpu_var(nonres_cycle) += (u32)cycles;
+ __cold_target_dec(zone, cycles);
+}
+
+/*
+ * Rotate hand hot;
+ *
+ * @zone: current zone
+ * @nr_to_scan: batch quanta
+ * @reclaim_mapped: whether to demote mapped pages too
+ * @pvec: release pagevec
+ */
+static void rotate_hot(struct zone *zone, int nr_to_scan, int reclaim_mapped,
+ struct pagevec *pvec)
+{
+ LIST_HEAD(l_hold);
+ LIST_HEAD(l_tmp);
+ unsigned long dh = 0, dct = 0;
+ int pgscanned;
+ int pgdeactivate = 0;
+ int nr_taken;
+
+ nr_taken = isolate_lru_pages(zone, nr_to_scan,
+ &zone->list_hand[hand_hot],
+ &l_hold, &pgscanned);
+
+ mod_page_state_zone(zone, pgrefill, pgscanned);
+
+ while (!list_empty(&l_hold)) {
+ struct page *page = lru_to_page(&l_hold);
+ prefetchw_prev_lru_page(page, &l_hold, flags);
+
+ if (PageActive(page)) {
+ BUG_ON(PageTest(page));
+
+ /*
+ * Ignore the swap token; this is not actual reclaim
+ * and it will give a better reflection of the actual
+ * hotness of pages.
+ *
+ * XXX do something with this reclaim_mapped stuff.
+ */
+ if (/*(((reclaim_mapped && mapped) || !mapped) ||
+ (total_swap_pages == 0 && PageAnon(page))) && */
+ !page_referenced(page, 0, 1)) {
+ ClearPageActive(page);
+ ++pgdeactivate;
+ }
+
+ ++dh;
+ } else {
+ if (TestClearPageTest(page))
+ ++dct;
+ }
+ list_move(&page->lru, &l_tmp);
+
+ cond_resched();
+ }
+
+ spin_lock_irq(&zone->lru_lock);
+ while (!list_empty(&l_tmp)) {
+ struct page *page = lru_to_page(&l_tmp);
+ prefetchw_prev_lru_page(page, &l_tmp, flags);
+ list_move(&page->lru, &zone->list_hand[hand_cold]);
+ __page_release(zone, page, pvec);
+ }
+ __nonres_term(zone, nr_taken);
+ __cold_target_dec(zone, dct);
+ spin_unlock_irq(&zone->lru_lock);
+
+ mod_page_state(pgdeactivate, pgdeactivate);
+}
Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/list.h>
#include <linux/page-flags.h>
+#include <linux/swap.h>

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

@@ -38,44 +39,41 @@
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

+enum {
+ hand_hot = 0,
+ hand_cold = 1
+};
+
void __init page_replace_init_zone(struct zone *);
void __page_replace_insert(struct zone *, struct page *);
void page_replace_candidates(struct zone *, int, struct list_head *);
-
-static inline
-void page_replace_activate(struct page *page)
-{
- SetPageActive(page);
-}
-
+void page_replace_activate(struct page *);
void page_replace_reinsert(struct zone *, struct list_head *);
+void page_replace_remember(struct zone *, struct page *);

+
+/*
+ * Make page available for direct reclaim.
+ *
+ * @zone: page's zone.
+ * @page: page.
+ */
static inline
void __page_replace_rotate_reclaimable(struct zone *zone, struct page *page)
{
if (PageLRU(page) && !PageActive(page)) {
- list_move_tail(&page->lru, &zone->inactive_list);
+ list_move_tail(&page->lru, &zone->list_hand[hand_cold]);
inc_page_state(pgrotated);
}
}

static inline void
-add_page_to_active_list(struct zone *zone, struct page *page)
-{
- list_add(&page->lru, &zone->active_list);
- zone->nr_active++;
-}
-
-static inline void
del_page_from_lru(struct zone *zone, struct page *page)
{
list_del(&page->lru);
- if (PageActive(page)) {
- ClearPageActive(page);
- zone->nr_active--;
- } else {
- zone->nr_inactive--;
- }
+ --zone->nr_resident;
+ if (!TestClearPageActive(page))
+ --zone->nr_cold;
}

#endif /* __KERNEL__ */
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -339,10 +339,11 @@ static try_pageout_t try_pageout(struct
int may_enter_fs;
int referenced;

- if (TestSetPageLocked(page))
+ if (PageActive(page))
goto keep;

- BUG_ON(PageActive(page));
+ if (TestSetPageLocked(page))
+ goto keep;

sc->nr_scanned++;
/* Double the slab pressure for mapped and swapcache pages */
@@ -467,6 +468,7 @@ static try_pageout_t try_pageout(struct
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
+ page_replace_remember(page_zone(page), page);
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);
@@ -475,6 +477,7 @@ static try_pageout_t try_pageout(struct
}
#endif /* CONFIG_SWAP */

+ page_replace_remember(page_zone(page), page);
__remove_from_page_cache(page);
write_unlock_irq(&mapping->tree_lock);
__put_page(page);
@@ -572,7 +575,7 @@ shrink_zone(struct zone *zone, struct sc

atomic_inc(&zone->reclaim_in_progress);

- nr_inactive = (zone->nr_inactive >> sc->priority) + SWAP_CLUSTER_MAX;
+ nr_inactive = (zone->nr_resident >> sc->priority) + SWAP_CLUSTER_MAX;
nr_inactive &= ~(SWAP_CLUSTER_MAX - 1);

sc->nr_to_scan = SWAP_CLUSTER_MAX;
@@ -667,7 +670,7 @@ int try_to_free_pages(struct zone **zone
continue;

zone->temp_priority = DEF_PRIORITY;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ lru_pages += zone->nr_resident;
}

for (priority = DEF_PRIORITY; priority >= 0; priority--) {
@@ -811,14 +814,14 @@ loop_again:
zone->temp_priority = priority;
if (zone->prev_priority > priority)
zone->prev_priority = priority;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ lru_pages += zone->nr_resident;

atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
atomic_dec(&zone->reclaim_in_progress);

if (zone->pages_scanned >=
- (zone->nr_active + zone->nr_inactive) * 4)
+ (zone->nr_resident) * 4)
zone->all_unreclaimable = 1;
}
reclaim_state->reclaimed_slab = 0;
Index: linux-2.6-git/mm/Makefile
===================================================================
--- linux-2.6-git.orig/mm/Makefile
+++ linux-2.6-git/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
readahead.o slab.o swap.o truncate.o vmscan.o \
- prio_tree.o page_replace.o $(mmu-y)
+ prio_tree.o clockpro.o $(mmu-y)

obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o \
nonresident.o
Index: linux-2.6-git/mm/page_alloc.c
===================================================================
--- linux-2.6-git.orig/mm/page_alloc.c
+++ linux-2.6-git/mm/page_alloc.c
@@ -1263,8 +1263,8 @@ void __get_zone_counts(unsigned long *ac
*inactive = 0;
*free = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
- *active += zones[i].nr_active;
- *inactive += zones[i].nr_inactive;
+ *active += zones[i].nr_resident - zones[i].nr_cold;
+ *inactive += zones[i].nr_cold;
*free += zones[i].free_pages;
}
}
@@ -1387,8 +1387,8 @@ void show_free_areas(void)
" min:%lukB"
" low:%lukB"
" high:%lukB"
- " active:%lukB"
- " inactive:%lukB"
+ " resident:%lukB"
+ " cold:%lukB"
" present:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
@@ -1398,8 +1398,8 @@ void show_free_areas(void)
K(zone->pages_min),
K(zone->pages_low),
K(zone->pages_high),
- K(zone->nr_active),
- K(zone->nr_inactive),
+ K(zone->nr_resident),
+ K(zone->nr_cold),
K(zone->present_pages),
zone->pages_scanned,
(zone->all_unreclaimable ? "yes" : "no")
@@ -2156,8 +2156,8 @@ static int zoneinfo_show(struct seq_file
"\n min %lu"
"\n low %lu"
"\n high %lu"
- "\n active %lu"
- "\n inactive %lu"
+ "\n resident %lu"
+ "\n cold %lu"
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
@@ -2165,8 +2165,8 @@ static int zoneinfo_show(struct seq_file
zone->pages_min,
zone->pages_low,
zone->pages_high,
- zone->nr_active,
- zone->nr_inactive,
+ zone->nr_resident,
+ zone->nr_cold,
zone->pages_scanned,
zone->spanned_pages,
zone->present_pages);

2005-12-30 22:43:26

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 12/14] page-replace-rotate.patch


From: Peter Zijlstra <[email protected]>

page-replace interface function:
__page_replace_rotate_reclaimable()

This function moves the page so that it will be available to the next
candidate scan.

This removes the knowledge of the page-reclaim lists from
rotate_reclaimable_page() itself.

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_page_replace.h | 15 ++++++++++++++-
mm/page_replace.c | 1 -
mm/swap.c | 6 +-----
3 files changed, 15 insertions(+), 7 deletions(-)

Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -6,6 +6,7 @@
#include <linux/mmzone.h>
#include <linux/mm.h>
#include <linux/list.h>
+#include <linux/page-flags.h>

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

@@ -39,12 +40,24 @@

void __page_replace_insert(struct zone *, struct page *);
void page_replace_candidates(struct zone *, int, struct list_head *);
-static inline void page_replace_activate(struct page *page)
+
+static inline
+void page_replace_activate(struct page *page)
{
SetPageActive(page);
}
+
void page_replace_reinsert(struct zone *, struct list_head *);

+static inline
+void __page_replace_rotate_reclaimable(struct zone *zone, struct page *page)
+{
+ if (PageLRU(page) && !PageActive(page)) {
+ list_move_tail(&page->lru, &zone->inactive_list);
+ inc_page_state(pgrotated);
+ }
+}
+
static inline void
add_page_to_active_list(struct zone *zone, struct page *page)
{
Index: linux-2.6-git/mm/page_replace.c
===================================================================
--- linux-2.6-git.orig/mm/page_replace.c
+++ linux-2.6-git/mm/page_replace.c
@@ -1,7 +1,6 @@
#include <linux/mm_page_replace.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
-#include <linux/page-flags.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -82,11 +82,7 @@ int rotate_reclaimable_page(struct page

zone = page_zone(page);
spin_lock_irqsave(&zone->lru_lock, flags);
- if (PageLRU(page) && !PageActive(page)) {
- list_del(&page->lru);
- list_add_tail(&page->lru, &zone->inactive_list);
- inc_page_state(pgrotated);
- }
+ __page_replace_rotate_reclaimable(zone, page);
if (!test_clear_page_writeback(page))
BUG();
spin_unlock_irqrestore(&zone->lru_lock, flags);

2005-12-30 22:44:05

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 8/9] clockpro-rename_PG_active.patch


From: Peter Zijlstra <[email protected]>

New semantics, new name. Since the semantics of PG_active changed
drastically with the clockpro code, change its name.

Signed-off-by: Peter Zijlstra <[email protected]>

fs/exec.c | 2 +-
include/linux/mm_page_replace.h | 4 ++--
include/linux/page-flags.h | 12 ++++++------
mm/clockpro.c | 22 +++++++++++-----------
mm/hugetlb.c | 2 +-
mm/memory.c | 6 +++---
mm/page_alloc.c | 6 +++---
mm/swap.c | 2 +-
mm/swap_state.c | 2 +-
mm/vmscan.c | 2 +-
10 files changed, 30 insertions(+), 30 deletions(-)

Index: linux-2.6-git/fs/exec.c
===================================================================
--- linux-2.6-git.orig/fs/exec.c
+++ linux-2.6-git/fs/exec.c
@@ -321,7 +321,7 @@ void install_arg_page(struct vm_area_str
goto out;
}
inc_mm_counter(mm, anon_rss);
- SetPageActive(page);
+ SetPageHot(page);
lru_cache_add(page);
set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -61,7 +61,7 @@ void page_replace_remember(struct zone *
static inline
void __page_replace_rotate_reclaimable(struct zone *zone, struct page *page)
{
- if (PageLRU(page) && !PageActive(page)) {
+ if (PageLRU(page) && !PageHot(page)) {
list_move_tail(&page->lru, &zone->list_hand[hand_cold]);
inc_page_state(pgrotated);
}
@@ -72,7 +72,7 @@ del_page_from_lru(struct zone *zone, str
{
list_del(&page->lru);
--zone->nr_resident;
- if (!TestClearPageActive(page))
+ if (!TestClearPageHot(page))
--zone->nr_cold;
}

Index: linux-2.6-git/include/linux/page-flags.h
===================================================================
--- linux-2.6-git.orig/include/linux/page-flags.h
+++ linux-2.6-git/include/linux/page-flags.h
@@ -58,7 +58,7 @@

#define PG_dirty 4
#define PG_lru 5
-#define PG_active 6
+#define PG_hot 6
#define PG_slab 7 /* slab debug (Suparna wants this) */

#define PG_checked 8 /* kill me in 2.5.<early>. */
@@ -205,11 +205,11 @@ extern void __mod_page_state(unsigned lo
#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags)
#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags)

-#define PageActive(page) test_bit(PG_active, &(page)->flags)
-#define SetPageActive(page) set_bit(PG_active, &(page)->flags)
-#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags)
-#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags)
-#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
+#define PageHot(page) test_bit(PG_hot, &(page)->flags)
+#define SetPageHot(page) set_bit(PG_hot, &(page)->flags)
+#define ClearPageHot(page) clear_bit(PG_hot, &(page)->flags)
+#define TestClearPageHot(page) test_and_clear_bit(PG_hot, &(page)->flags)
+#define TestSetPageHot(page) test_and_set_bit(PG_hot, &(page)->flags)

#define PageSlab(page) test_bit(PG_slab, &(page)->flags)
#define SetPageSlab(page) set_bit(PG_slab, &(page)->flags)
Index: linux-2.6-git/mm/clockpro.c
===================================================================
--- linux-2.6-git.orig/mm/clockpro.c
+++ linux-2.6-git/mm/clockpro.c
@@ -127,7 +127,7 @@ void __select_list_hand(struct zone *zon
* Insert page into @zones clock and update adaptive parameters.
*
* Several page flags are used for insertion hints:
- * PG_active - insert as an active page
+ * PG_hot - insert as an active page
* PG_test - use the use-once logic
*
* For now we will ignore the active hint; the use once logic is
@@ -142,8 +142,8 @@ void __page_replace_insert(struct zone *

rflags = nonresident_get(page_mapping(page), page_index(page));

- /* ignore the PG_active hint */
- ClearPageActive(page);
+ /* ignore the PG_hot hint */
+ ClearPageHot(page);

/* abuse the PG_test flag for pagecache use-once */
if (!TestClearPageTest(page)) {
@@ -153,7 +153,7 @@ void __page_replace_insert(struct zone *
* ie. right behind Hcold.
*/
if (rflags & NR_found) {
- SetPageActive(page);
+ SetPageHot(page);
__cold_target_inc(zone, 1);
} else {
SetPageTest(page);
@@ -233,7 +233,7 @@ static int isolate_lru_pages(struct zone
} else {
list_add(&page->lru, dst);
nr_taken++;
- if (!PageActive(page))
+ if (!PageHot(page))
--zone->nr_cold;
}
}
@@ -258,7 +258,7 @@ static void __page_release(struct zone *
{
if (TestSetPageLRU(page))
BUG();
- if (!PageActive(page))
+ if (!PageHot(page))
++zone->nr_cold;
++zone->nr_resident;

@@ -311,14 +311,14 @@ void page_replace_activate(struct page *
{
int hot, test;

- hot = PageActive(page);
+ hot = PageHot(page);
test = PageTest(page);

if (hot) {
BUG_ON(test);
} else {
if (test) {
- SetPageActive(page);
+ SetPageHot(page);
/*
* Leave PG_test set for new hot pages in order to
* recognise then in reinsert() and do accounting.
@@ -358,7 +358,7 @@ void page_replace_reinsert(struct zone *
struct page *page = lru_to_page(page_list);
prefetchw_prev_lru_page(page, page_list, flags);

- if (PageActive(page) && PageTest(page)) {
+ if (PageHot(page) && PageTest(page)) {
ClearPageTest(page);
++dct;
}
@@ -515,7 +515,7 @@ static void rotate_hot(struct zone *zone
struct page *page = lru_to_page(&l_hold);
prefetchw_prev_lru_page(page, &l_hold, flags);

- if (PageActive(page)) {
+ if (PageHot(page)) {
BUG_ON(PageTest(page));

/*
@@ -528,7 +528,7 @@ static void rotate_hot(struct zone *zone
if (/*(((reclaim_mapped && mapped) || !mapped) ||
(total_swap_pages == 0 && PageAnon(page))) && */
!page_referenced(page, 0, 1)) {
- ClearPageActive(page);
+ ClearPageHot(page);
++pgdeactivate;
}

Index: linux-2.6-git/mm/hugetlb.c
===================================================================
--- linux-2.6-git.orig/mm/hugetlb.c
+++ linux-2.6-git/mm/hugetlb.c
@@ -145,7 +145,7 @@ static void update_and_free_page(struct
nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
- 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_dirty | 1 << PG_hot | 1 << PG_reserved |
1 << PG_private | 1<< PG_writeback);
set_page_count(&page[i], 0);
}
Index: linux-2.6-git/mm/memory.c
===================================================================
--- linux-2.6-git.orig/mm/memory.c
+++ linux-2.6-git/mm/memory.c
@@ -1521,7 +1521,7 @@ gotten:
ptep_establish(vma, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- SetPageActive(new_page);
+ SetPageHot(new_page);
lru_cache_add(new_page);
page_add_anon_rmap(new_page, vma, address);

@@ -1978,7 +1978,7 @@ static int do_anonymous_page(struct mm_s
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
- SetPageActive(page);
+ SetPageHot(page);
lru_cache_add(page);
SetPageReferenced(page);
page_add_anon_rmap(page, vma, address);
@@ -2111,7 +2111,7 @@ retry:
set_pte_at(mm, address, page_table, entry);
if (anon) {
inc_mm_counter(mm, anon_rss);
- SetPageActive(new_page);
+ SetPageHot(new_page);
lru_cache_add(new_page);
page_add_anon_rmap(new_page, vma, address);
} else {
Index: linux-2.6-git/mm/page_alloc.c
===================================================================
--- linux-2.6-git.orig/mm/page_alloc.c
+++ linux-2.6-git/mm/page_alloc.c
@@ -136,7 +136,7 @@ static void bad_page(const char *functio
page->flags &= ~(1 << PG_lru |
1 << PG_private |
1 << PG_locked |
- 1 << PG_active |
+ 1 << PG_hot |
1 << PG_dirty |
1 << PG_reclaim |
1 << PG_slab |
@@ -344,7 +344,7 @@ static inline int free_pages_check(const
1 << PG_lru |
1 << PG_private |
1 << PG_locked |
- 1 << PG_active |
+ 1 << PG_hot |
1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
@@ -481,7 +481,7 @@ static int prep_new_page(struct page *pa
1 << PG_lru |
1 << PG_private |
1 << PG_locked |
- 1 << PG_active |
+ 1 << PG_hot |
1 << PG_dirty |
1 << PG_reclaim |
1 << PG_slab |
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -75,7 +75,7 @@ int rotate_reclaimable_page(struct page
return 1;
if (PageDirty(page))
return 1;
- if (PageActive(page))
+ if (PageHot(page))
return 1;
if (!PageLRU(page))
return 1;
Index: linux-2.6-git/mm/swap_state.c
===================================================================
--- linux-2.6-git.orig/mm/swap_state.c
+++ linux-2.6-git/mm/swap_state.c
@@ -353,7 +353,7 @@ struct page *read_swap_cache_async(swp_e
/*
* Initiate read into locked page and return.
*/
- SetPageActive(new_page);
+ SetPageHot(new_page);
lru_cache_add(new_page);
swap_readpage(NULL, new_page);
return new_page;
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -339,7 +339,7 @@ static try_pageout_t try_pageout(struct
int may_enter_fs;
int referenced;

- if (PageActive(page))
+ if (PageHot(page))
goto keep;

if (TestSetPageLocked(page))

2005-12-30 22:44:48

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 9/9] clockpro-clockpro-stats.patch


From: Peter Zijlstra <[email protected]>

Adds some /proc debugging information (/proc/clockpro) to the clockpro patch.

TODO:
- use debugfs?

Signed-off-by: Peter Zijlstra <[email protected]>

fs/proc/proc_misc.c | 15 +++++++++++++
mm/clockpro.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 75 insertions(+)

Index: linux-2.6-git/fs/proc/proc_misc.c
===================================================================
--- linux-2.6-git.orig/fs/proc/proc_misc.c
+++ linux-2.6-git/fs/proc/proc_misc.c
@@ -220,6 +220,20 @@
.release = seq_release,
};

+extern struct seq_operations clockpro_op;
+static int clockpro_open(struct inode *inode, struct file *file)
+{
+ (void)inode;
+ return seq_open(file, &clockpro_op);
+}
+
+static struct file_operations clockpro_file_operations = {
+ .open = clockpro_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
extern struct seq_operations zoneinfo_op;
static int zoneinfo_open(struct inode *inode, struct file *file)
{
@@ -602,6 +616,7 @@
create_seq_entry("interrupts", 0, &proc_interrupts_operations);
create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
+ create_seq_entry("clockpro",S_IRUGO, &clockpro_file_operations);
create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
create_seq_entry("diskstats", 0, &proc_diskstats_operations);
Index: linux-2.6-git/mm/clockpro.c
===================================================================
--- linux-2.6-git.orig/mm/clockpro.c
+++ linux-2.6-git/mm/clockpro.c
@@ -555,3 +555,62 @@

mod_page_state(pgdeactivate, pgdeactivate);
}
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *stats_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos != 0)
+ return NULL;
+
+ lru_add_drain();
+
+ return pos;
+}
+
+static void *stats_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ return NULL;
+}
+
+static void stats_stop(struct seq_file *m, void *arg)
+{
+}
+
+static int stats_show(struct seq_file *m, void *arg)
+{
+ struct zone *zone;
+ for_each_zone(zone) {
+ seq_printf(m, "\n\n======> zone: %lu <=====\n", (unsigned long)zone);
+ seq_printf(m, "struct zone values:\n");
+ seq_printf(m, " zone->nr_resident: %lu\n", zone->nr_resident);
+ seq_printf(m, " zone->nr_cold: %lu\n", zone->nr_cold);
+ seq_printf(m, " zone->nr_cold_target: %lu\n", zone->nr_cold_target);
+ seq_printf(m, " zone->nr_nonresident_scale: %lu\n", zone->nr_nonresident_scale);
+ seq_printf(m, " zone->present_pages: %lu\n", zone->present_pages);
+ seq_printf(m, " zone->free_pages: %lu\n", zone->free_pages);
+ seq_printf(m, " zone->pages_min: %lu\n", zone->pages_min);
+ seq_printf(m, " zone->pages_low: %lu\n", zone->pages_low);
+ seq_printf(m, " zone->pages_high: %lu\n", zone->pages_high);
+
+ seq_printf(m, "\n");
+ seq_printf(m, "nonresident values:\n");
+ seq_printf(m, " nonres_cycle: %lu\n", __sum_cpu_var(unsigned long, nonres_cycle));
+ seq_printf(m, " T3-raw: %lu\n", __sum_cpu_var(unsigned long, nonres_count[NR_b1]));
+ seq_printf(m, " T3-est: %u\n", nonresident_estimate());
+
+ }
+
+ return 0;
+}
+
+struct seq_operations clockpro_op = {
+ .start = stats_start,
+ .next = stats_next,
+ .stop = stats_stop,
+ .show = stats_show,
+};
+
+#endif /* CONFIG_PROC_FS */

2005-12-30 22:43:39

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 3/9] clockpro-PG_test.patch


From: Peter Zijlstra <[email protected]>

Introduce a new page flag, PG_test, needed for the clockpro work.

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/page-flags.h | 8 ++++++++
mm/page_alloc.c | 3 ++-
2 files changed, 10 insertions(+), 1 deletion(-)

Index: linux-2.6-git/include/linux/page-flags.h
===================================================================
--- linux-2.6-git.orig/include/linux/page-flags.h
+++ linux-2.6-git/include/linux/page-flags.h
@@ -76,6 +76,8 @@
#define PG_nosave_free 18 /* Free, should not be written */
#define PG_uncached 19 /* Page has been mapped as uncached */

+#define PG_test 20 /* Page is in its test period */
+
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
* allowed.
@@ -303,6 +305,12 @@ extern void __mod_page_state(unsigned lo
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)

+#define PageTest(page) test_bit(PG_test, &(page)->flags)
+#define SetPageTest(page) set_bit(PG_test, &(page)->flags)
+#define TestSetPageTest(page) test_and_set_bit(PG_test, &(page)->flags)
+#define ClearPageTest(page) clear_bit(PG_test, &(page)->flags)
+#define TestClearPageTest(page) test_and_clear_bit(PG_test, &(page)->flags)
+
struct page; /* forward declaration */

int test_clear_page_dirty(struct page *page);
Index: linux-2.6-git/mm/page_alloc.c
===================================================================
--- linux-2.6-git.orig/mm/page_alloc.c
+++ linux-2.6-git/mm/page_alloc.c
@@ -499,7 +499,8 @@ static int prep_new_page(struct page *pa

page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
- 1 << PG_checked | 1 << PG_mappedtodisk);
+ 1 << PG_checked | 1 << PG_mappedtodisk |
+ 1 << PG_test);
set_page_private(page, 0);
set_page_refs(page, order);
kernel_map_pages(page, 1 << order, 1);

2005-12-30 22:46:52

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 1/9] clockpro-nonresident.patch


From: Peter Zijlstra <[email protected]>

Originally started by Rik van Riel; I have heavily modified the code
to suit my needs.

The nonresident code approximates a clock but sacrifices precision in order
to accomplish faster lookups.

The actual data structure is a hash of small clocks so that, assuming an
even distribution by the hash function, each clock has comparable order.
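
For reference, this is roughly how the clockpro side is expected to drive this
code -- a sketch based on the callers added in clockpro-clockpro.patch, not
part of this patch itself:

	/* on evicting a cold page still in its test period: remember it on B1 */
	nonresident_put(page_mapping(page), page_index(page), NR_b1, NR_b1);

	/* on (re)insertion: was the page evicted recently? */
	rflags = nonresident_get(page_mapping(page), page_index(page));
	if (rflags & NR_found)
		SetPageActive(page);	/* refault in test period: insert hot */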

TODO:
- remove the ARC requirements.

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/swap.h | 32 ++++
init/main.c | 2
mm/Makefile | 3
mm/nonresident.c | 391 +++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 427 insertions(+), 1 deletion(-)

Index: linux-2.6-git/mm/nonresident.c
===================================================================
--- /dev/null
+++ linux-2.6-git/mm/nonresident.c
@@ -0,0 +1,391 @@
+/*
+ * mm/nonresident.c
+ * (C) 2004,2005 Red Hat, Inc
+ * Written by Rik van Riel <[email protected]>
+ * Released under the GPL, see the file COPYING for details.
+ * Adapted by Peter Zijlstra <[email protected]> for use by ARC
+ * like algorithms.
+ *
+ * Keeps track of whether a non-resident page was recently evicted
+ * and should be immediately promoted to the active list. This also
+ * helps automatically tune the inactive target.
+ *
+ * The pageout code stores a recently evicted page in this cache
+ * by calling nonresident_put(mapping/mm, index/vaddr)
+ * and can look it up in the cache by calling nonresident_find()
+ * with the same arguments.
+ *
+ * Note that there is no way to invalidate pages after eg. truncate
+ * or exit, we let the pages fall out of the non-resident set through
+ * normal replacement.
+ *
+ *
+ * Modified to work with ARC like algorithms who:
+ * - need to balance two FIFOs; |b1| + |b2| = c,
+ *
+ * The bucket contains four single linked cyclic lists (CLOCKS) and each
+ * clock has a tail hand. By selecting a victim clock upon insertion it
+ * is possible to balance them.
+ *
+ * The first two lists are used for B1/B2 and a third for a free slot list.
+ * The fourth list is unused.
+ *
+ * The slot looks like this:
+ * struct slot_t {
+ * u32 cookie : 24; // LSB
+ * u32 index : 6;
+ * u32 listid : 2;
+ * };
+ *
+ * The bucket is guarded by a spinlock.
+ */
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/kernel.h>
+
+#define TARGET_SLOTS 64
+#define NR_CACHELINES (TARGET_SLOTS*sizeof(u32) / L1_CACHE_BYTES)
+#define NR_SLOTS (((NR_CACHELINES * L1_CACHE_BYTES) - sizeof(spinlock_t) - 4*sizeof(u8) - sizeof(u32)) / sizeof(u32))
+#if 0
+#if NR_SLOTS < (TARGET_SLOTS / 2)
+#warning very small slot size
+#if NR_SLOTS <= 0
+#error no room for slots left
+#endif
+#endif
+#endif
+
+#define BUILD_MASK(bits, shift) (((1 << (bits)) - 1) << (shift))
+
+#define LISTID_BITS 2
+#define LISTID_SHIFT (sizeof(u32)*8 - LISTID_BITS)
+#define LISTID_MASK BUILD_MASK(LISTID_BITS, LISTID_SHIFT)
+
+#define SET_LISTID(x, flg) ((x) = ((x) & ~LISTID_MASK) | ((flg) << LISTID_SHIFT))
+#define GET_LISTID(x) (((x) & LISTID_MASK) >> LISTID_SHIFT)
+
+#define INDEX_BITS 6 /* ceil(log2(NR_SLOTS)) */
+#define INDEX_SHIFT (LISTID_SHIFT - INDEX_BITS)
+#define INDEX_MASK BUILD_MASK(INDEX_BITS, INDEX_SHIFT)
+
+#define SET_INDEX(x, idx) ((x) = ((x) & ~INDEX_MASK) | ((idx) << INDEX_SHIFT))
+#define GET_INDEX(x) (((x) & INDEX_MASK) >> INDEX_SHIFT)
+
+#define COOKIE_MASK BUILD_MASK(sizeof(u32)*8 - LISTID_BITS - INDEX_BITS, 0)
+
+struct nr_bucket
+{
+ spinlock_t lock;
+ u8 hand[4];
+ u32 cycle;
+ u32 slot[NR_SLOTS];
+} ____cacheline_aligned;
+
+/* The non-resident page hash table. */
+static struct nr_bucket * nonres_table;
+static unsigned int nonres_shift;
+static unsigned int nonres_mask;
+
+/* hash the address into a bucket */
+static struct nr_bucket * nr_hash(void * mapping, unsigned long index)
+{
+ unsigned long bucket;
+ unsigned long hash;
+
+ hash = (unsigned long)mapping + 37 * index;
+ bucket = hash_long(hash, nonres_shift);
+
+ return nonres_table + bucket;
+}
+
+/* hash the address and inode into a cookie */
+static u32 nr_cookie(struct address_space * mapping, unsigned long index)
+{
+ unsigned long hash;
+
+ hash = 37 * (unsigned long)mapping + index;
+
+ if (mapping && mapping->host)
+ hash = 37 * hash + mapping->host->i_ino;
+
+ return hash_long(hash, sizeof(u32)*8 - LISTID_BITS - INDEX_BITS);
+}
+
+DEFINE_PER_CPU(unsigned long[4], nonres_count);
+
+/*
+ * remove current (b from 'abc'):
+ *
+ * initial swap(2,3)
+ *
+ * 1: -> [2],a 1: -> [2],a
+ * * 2: -> [3],b 2: -> [1],c
+ * 3: -> [1],c * 3: -> [3],b
+ *
+ * 3 is now free for use.
+ *
+ * @nr_bucket: bucket to operate in
+ * @listid: list that the deletee belongs to
+ * @pos: slot position of deletee
+ * @slot: possible pointer to slot
+ *
+ * returns pointer to removed slot, NULL when list empty.
+ */
+static u32 * __nonresident_del(struct nr_bucket *nr_bucket, int listid, u8 pos, u32 *slot)
+{
+ int next_pos;
+ u32 *next;
+
+ if (slot == NULL) {
+ slot = &nr_bucket->slot[pos];
+ if (GET_LISTID(*slot) != listid)
+ return NULL;
+ }
+
+ --__get_cpu_var(nonres_count[listid]);
+
+ next_pos = GET_INDEX(*slot);
+ if (pos == next_pos) {
+ next = slot;
+ goto out;
+ }
+
+ next = &nr_bucket->slot[next_pos];
+ *next = xchg(slot, *next);
+
+ if (next_pos == nr_bucket->hand[listid])
+ nr_bucket->hand[listid] = pos;
+out:
+ BUG_ON(GET_INDEX(*next) != next_pos);
+ return next;
+}
+
+static inline u32 * __nonresident_pop(struct nr_bucket *nr_bucket, int listid)
+{
+ return __nonresident_del(nr_bucket, listid, nr_bucket->hand[listid], NULL);
+}
+
+/*
+ * insert before (d before b in 'abc')
+ *
+ * initial set 4 swap(2,4)
+ *
+ * 1: -> [2],a 1: -> [2],a 1: -> [2],a
+ * * 2: -> [3],b 2: -> [3],b 2: -> [4],d
+ * 3: -> [1],c 3: -> [1],c 3: -> [1],c
+ * 4: -> [4],nil 4: -> [4],d * 4: -> [3],b
+ *
+ * leaving us with 'adbc'.
+ *
+ * @nr_bucket: bucket to operator in
+ * @listid: list to insert into
+ * @pos: position to insert before
+ * @slot: slot to insert
+ */
+static void __nonresident_insert(struct nr_bucket *nr_bucket, int listid, u8 *pos, u32 *slot)
+{
+ u32 *head;
+
+ SET_LISTID(*slot, listid);
+
+ head = &nr_bucket->slot[*pos];
+
+ *pos = GET_INDEX(*slot);
+ if (GET_LISTID(*head) == listid)
+ *slot = xchg(head, *slot);
+
+ ++__get_cpu_var(nonres_count[listid]);
+}
+
+static inline void __nonresident_push(struct nr_bucket *nr_bucket, int listid, u32 *slot)
+{
+ __nonresident_insert(nr_bucket, listid, &nr_bucket->hand[listid], slot);
+}
+
+
+DEFINE_PER_CPU(u32, nonres_cycle);
+static DEFINE_PER_CPU(u32, nonres_delay);
+
+static void __nonresident_rotate(struct nr_bucket *nr_bucket)
+{
+ u32 nr_cycle = __sum_cpu_var(u32, nonres_cycle) & ~((1 << nonres_shift) - 1);
+ u32 * slot;
+ while (nr_bucket->cycle != nr_cycle) {
+ ++__get_cpu_var(nonres_delay);
+ nr_bucket->cycle += (1 << nonres_shift);
+ slot = __nonresident_pop(nr_bucket, NR_b1);
+ if (slot)
+ __nonresident_push(nr_bucket, NR_free, slot);
+ }
+}
+
+/*
+ * Remembers a page by putting a hash-cookie on the @listid list.
+ *
+ * @mapping: page_mapping()
+ * @index: page_index()
+ * @listid: list to put the page on (NR_b1, NR_b2 and NR_free).
+ * @listid_evict: list to get a free page from when NR_free is empty.
+ *
+ * returns the list an empty page was taken from.
+ */
+int nonresident_put(struct address_space * mapping, unsigned long index, int listid, int listid_evict)
+{
+ struct nr_bucket *nr_bucket;
+ u32 cookie;
+ unsigned long flags;
+ u32 *slot;
+ int evict = NR_free;
+
+ prefetch(mapping->host);
+ nr_bucket = nr_hash(mapping, index);
+
+ spin_lock_prefetch(nr_bucket); // prefetchw_range(nr_bucket, NR_CACHELINES);
+ cookie = nr_cookie(mapping, index);
+
+ spin_lock_irqsave(&nr_bucket->lock, flags);
+ __nonresident_rotate(nr_bucket);
+ slot = __nonresident_pop(nr_bucket, evict);
+ if (!slot) {
+ evict = listid_evict;
+ slot = __nonresident_pop(nr_bucket, evict);
+ }
+ BUG_ON(!slot);
+ SET_INDEX(cookie, GET_INDEX(*slot));
+ cookie = xchg(slot, cookie);
+ __nonresident_push(nr_bucket, listid, slot);
+ spin_unlock_irqrestore(&nr_bucket->lock, flags);
+
+ return evict;
+}
+
+/*
+ * Searches a page on the first two lists, and places it on the free list.
+ *
+ * @mapping: page_mapping()
+ * @index: page_index()
+ *
+ * returns listid of the list the item was found on with NR_found set if found.
+ */
+int nonresident_get(struct address_space * mapping, unsigned long index)
+{
+ struct nr_bucket * nr_bucket;
+ u32 wanted;
+ int j;
+ unsigned long flags;
+ int ret = 0;
+
+ if (mapping)
+ prefetch(mapping->host);
+ nr_bucket = nr_hash(mapping, index);
+
+ spin_lock_prefetch(nr_bucket); // prefetch_range(nr_bucket, NR_CACHELINES);
+ wanted = nr_cookie(mapping, index) & COOKIE_MASK;
+
+ spin_lock_irqsave(&nr_bucket->lock, flags);
+ __nonresident_rotate(nr_bucket);
+ j = nr_bucket->hand[NR_b1];
+ do {
+ u32 *slot = &nr_bucket->slot[j];
+ if (GET_LISTID(*slot) != NR_b1)
+ break;
+
+ if ((*slot & COOKIE_MASK) == wanted) {
+ slot = __nonresident_del(nr_bucket, NR_b1, j, slot);
+ __nonresident_push(nr_bucket, NR_free, slot);
+ ret = NR_b1 | NR_found;
+ break;
+ }
+
+ j = GET_INDEX(*slot);
+ } while (j != nr_bucket->hand[NR_b1]);
+ spin_unlock_irqrestore(&nr_bucket->lock, flags);
+
+ return ret;
+}
+
+unsigned int nonresident_total(void)
+{
+ return (1 << nonres_shift) * NR_SLOTS;
+}
+
+unsigned int nonresident_estimate(void)
+{
+ u32 count, cycle, delay, diff;
+
+ preempt_disable();
+ count = __sum_cpu_var(u32, nonres_count[NR_b1]);
+ cycle = __sum_cpu_var(u32, nonres_cycle);
+ delay = __sum_cpu_var(u32, nonres_delay);
+ preempt_enable();
+
+ diff = cycle - delay;
+
+ if (diff > count)
+ return 0;
+
+ return count - diff;
+}
+
+/*
+ * For interactive workloads, we remember about as many non-resident pages
+ * as we have actual memory pages. For server workloads with large inter-
+ * reference distances we could benefit from remembering more.
+ */
+static __initdata unsigned long nonresident_factor = 1;
+void __init nonresident_init(void)
+{
+ int target;
+ int i, j;
+
+ /*
+ * Calculate the non-resident hash bucket target. Use a power of
+ * two for the division because alloc_large_system_hash rounds up.
+ */
+ target = nr_all_pages * nonresident_factor;
+ target /= (sizeof(struct nr_bucket) / sizeof(u32));
+
+ nonres_table = alloc_large_system_hash("Non-resident page tracking",
+ sizeof(struct nr_bucket),
+ target,
+ 0,
+ HASH_EARLY | HASH_HIGHMEM,
+ &nonres_shift,
+ &nonres_mask,
+ 0);
+
+ for (i = 0; i < (1 << nonres_shift); i++) {
+ spin_lock_init(&nonres_table[i].lock);
+ for (j = 0; j < 4; ++j)
+ nonres_table[i].hand[j] = 0;
+
+ for (j = 0; j < NR_SLOTS; ++j) {
+ nonres_table[i].slot[j] = 0;
+ SET_LISTID(nonres_table[i].slot[j], NR_free);
+ if (j < NR_SLOTS - 1)
+ SET_INDEX(nonres_table[i].slot[j], j+1);
+ else /* j == NR_SLOTS - 1 */
+ SET_INDEX(nonres_table[i].slot[j], 0);
+ }
+ }
+
+ for_each_cpu(i) {
+ for (j=0; j<4; ++j)
+ per_cpu(nonres_count[j], i) = 0;
+ }
+}
+
+static int __init set_nonresident_factor(char * str)
+{
+ if (!str)
+ return 0;
+ nonresident_factor = simple_strtoul(str, &str, 0);
+ return 1;
+}
+
+__setup("nonresident_factor=", set_nonresident_factor);
Index: linux-2.6-git/include/linux/swap.h
===================================================================
--- linux-2.6-git.orig/include/linux/swap.h
+++ linux-2.6-git/include/linux/swap.h
@@ -152,6 +152,31 @@ extern void out_of_memory(gfp_t gfp_mask
/* linux/mm/memory.c */
extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);

+/* linux/mm/nonresident.c */
+#define NR_b1 0
+#define NR_b2 1
+#define NR_free 2
+#define NR_lost 3
+
+#define NR_listid 3
+#define NR_found 0x80000000
+
+
+extern int nonresident_put(struct address_space *, unsigned long, int, int);
+extern int nonresident_get(struct address_space *, unsigned long);
+extern unsigned int nonresident_total(void);
+extern unsigned int nonresident_estimate(void);
+extern void nonresident_init(void);
+
+DECLARE_PER_CPU(unsigned long[4], nonres_count);
+DECLARE_PER_CPU(u32, nonres_cycle);
+
+#define __sum_cpu_var(type, var) ({ __typeof__(type) sum = 0; \
+ int cpu; \
+ for_each_cpu(cpu) sum += per_cpu(var, cpu); \
+ sum; })
+
+
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalhigh_pages;
@@ -288,6 +313,13 @@ static inline swp_entry_t get_swap_page(
#define has_swap_token(x) 0
#define disable_swap_token() do { } while(0)

+/* linux/mm/nonresident.c */
+#define nonresident_put(w,x,y,z) 0
+#define nonresident_get(x,y) 0
+#define nonresident_estimate() 0
+#define nonresident_total() 0
+#define nonresident_init() do { } while (0)
+
#endif /* CONFIG_SWAP */
#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */
Index: linux-2.6-git/init/main.c
===================================================================
--- linux-2.6-git.orig/init/main.c
+++ linux-2.6-git/init/main.c
@@ -46,6 +46,7 @@
#include <linux/unistd.h>
#include <linux/rmap.h>
#include <linux/mempolicy.h>
+#include <linux/swap.h>
#include <linux/key.h>
#include <net/sock.h>

@@ -509,6 +510,7 @@ asmlinkage void __init start_kernel(void
}
#endif
vfs_caches_init_early();
+ nonresident_init();
mem_init();
kmem_cache_init();
setup_per_cpu_pageset();
Index: linux-2.6-git/mm/Makefile
===================================================================
--- linux-2.6-git.orig/mm/Makefile
+++ linux-2.6-git/mm/Makefile
@@ -12,7 +12,8 @@ obj-y := bootmem.o filemap.o mempool.o
readahead.o slab.o swap.o truncate.o vmscan.o \
prio_tree.o page_replace.o $(mmu-y)

-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o \
+ nonresident.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o

2005-12-30 22:42:47

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 11/14] page-replace-move-refill.patch


From: Peter Zijlstra <[email protected]>

Move refill_inactive_zone() to the new mm/page_replace.c file and
couple its invocation to the page_replace_candidates() function.
Keep the active:inactive scan ratio equal to nr_active/nr_inactive.
Also make sure we only scan in full multiples of swap_cluster_max.

Kudos to Wu Fengguang for showing me the way to decouple the active
and inactive scans.
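
For reference, a toy model (illustration only, not part of the patch) of the
fixed-point credit scheme used below: active-list scan credit is accumulated
scaled by 1024 so that fractional amounts are not lost between invocations,
and refill_inactive_zone() is only invoked in whole SWAP_CLUSTER_MAX batches.
The numbers in main() are made up for the example.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long nr_scan_active;	/* carries fractional credit, scaled by 1024 */

/* returns how many SWAP_CLUSTER_MAX-sized refill batches would be issued */
static unsigned long refill_credit(unsigned long nr_scan, unsigned long nr_taken,
				   unsigned long nr_active, unsigned long nr_inactive)
{
	unsigned long long credit;
	unsigned long batches = 0;

	/* the +1 guarantees we keep sifting through the active list, however slowly */
	credit = (nr_scan + 1ULL) * nr_active * 1024ULL;
	credit /= nr_inactive + nr_taken + 1UL;
	nr_scan_active += credit;

	while (nr_scan_active >= SWAP_CLUSTER_MAX * 1024UL) {
		nr_scan_active -= SWAP_CLUSTER_MAX * 1024UL;
		batches++;	/* one refill_inactive_zone(zone, SWAP_CLUSTER_MAX) call */
	}
	return batches;
}

int main(void)
{
	/* 32 inactive pages scanned against a zone with a 2:1 active:inactive ratio */
	printf("%lu batches\n", refill_credit(32, 32, 20000, 10000));
	return 0;
}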

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_page_replace.h | 2
include/linux/mmzone.h | 1
mm/page_alloc.c | 4
mm/page_replace.c | 180 ++++++++++++++++++++++++++++++++++++
mm/vmscan.c | 196 ++--------------------------------------
5 files changed, 189 insertions(+), 194 deletions(-)

Index: linux-2.6-git/mm/page_replace.c
===================================================================
--- linux-2.6-git.orig/mm/page_replace.c
+++ linux-2.6-git/mm/page_replace.c
@@ -1,6 +1,27 @@
#include <linux/mm_page_replace.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
+#include <linux/page-flags.h>
+#include <linux/init.h>
+#include <linux/rmap.h>
+#include <linux/buffer_head.h> /* for try_to_release_page(),
+ buffer_heads_over_limit */
+
+/*
+ * From 0 .. 100. Higher means more swappy.
+ */
+int vm_swappiness = 60;
+static long total_memory;
+
+static void refill_inactive_zone(struct zone *, int);
+
+static int __init page_replace_init(void)
+{
+ total_memory = nr_free_pagecache_pages();
+ return 0;
+}
+
+module_init(page_replace_init)

static inline void
add_page_to_inactive_list(struct zone *zone, struct page *page)
@@ -34,8 +55,8 @@ void __page_replace_insert(struct zone *
*
* returns how many pages were moved onto *@dst.
*/
-int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
+static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+ struct list_head *dst, int *scanned)
{
int nr_taken = 0;
struct page *page;
@@ -70,6 +91,7 @@ void page_replace_candidates(struct zone
{
int nr_taken;
int nr_scan;
+ unsigned long long nr_scan_active;

spin_lock_irq(&zone->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, &zone->inactive_list,
@@ -82,6 +104,18 @@ void page_replace_candidates(struct zone
mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
else
mod_page_state_zone(zone, pgscan_direct, nr_scan);
+
+ /*
+ * Add one to `nr_to_scan' just to make sure that the kernel will
+ * slowly sift through the active list.
+ */
+ nr_scan_active = (nr_scan + 1ULL) * zone->nr_active * 1024ULL;
+ do_div(nr_scan_active, zone->nr_inactive + nr_taken + 1UL);
+ zone->nr_scan_active += nr_scan_active;
+ while (zone->nr_scan_active >= SWAP_CLUSTER_MAX * 1024UL) {
+ zone->nr_scan_active -= SWAP_CLUSTER_MAX * 1024UL;
+ refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
+ }
}

/*
@@ -112,3 +146,145 @@ void page_replace_reinsert(struct zone *
pagevec_release(&pvec);
}

+/*
+ * This moves pages from the active list to the inactive list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone->lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone->lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_count against each page.
+ * But we had to alter page->flags anyway.
+ */
+static void refill_inactive_zone(struct zone *zone, int nr_pages)
+{
+ int pgmoved;
+ int pgdeactivate = 0;
+ int pgscanned;
+ LIST_HEAD(l_hold); /* The pages which were snipped off */
+ LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
+ LIST_HEAD(l_active); /* Pages to go onto the active_list */
+ struct page *page;
+ struct pagevec pvec;
+ int reclaim_mapped = 0;
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
+
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
+ pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
+ &l_hold, &pgscanned);
+ zone->pages_scanned += pgscanned;
+ zone->nr_active -= pgmoved;
+ spin_unlock_irq(&zone->lru_lock);
+
+ /*
+ * `distress' is a measure of how much trouble we're having reclaiming
+ * pages. 0 -> no problems. 100 -> great trouble.
+ */
+ distress = 100 >> zone->prev_priority;
+
+ /*
+ * The point of this algorithm is to decide when to start reclaiming
+ * mapped memory instead of just pagecache. Work out how much memory
+ * is mapped.
+ */
+ mapped_ratio = (read_page_state(nr_mapped) * 100) / total_memory;
+
+ /*
+ * Now decide how much we really want to unmap some pages. The mapped
+ * ratio is downgraded - just because there's a lot of mapped memory
+ * doesn't necessarily mean that page reclaim isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+ /*
+ * Now use this metric to decide whether to start moving mapped memory
+ * onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+ reclaim_mapped = 1;
+
+ while (!list_empty(&l_hold)) {
+ cond_resched();
+ page = lru_to_page(&l_hold);
+ list_del(&page->lru);
+ if (page_mapped(page)) {
+ if (!reclaim_mapped ||
+ (total_swap_pages == 0 && PageAnon(page)) ||
+ page_referenced(page, 0)) {
+ list_add(&page->lru, &l_active);
+ continue;
+ }
+ }
+ list_add(&page->lru, &l_inactive);
+ }
+
+ pagevec_init(&pvec, 1);
+ pgmoved = 0;
+ spin_lock_irq(&zone->lru_lock);
+ while (!list_empty(&l_inactive)) {
+ page = lru_to_page(&l_inactive);
+ prefetchw_prev_lru_page(page, &l_inactive, flags);
+ if (TestSetPageLRU(page))
+ BUG();
+ if (!TestClearPageActive(page))
+ BUG();
+ list_move(&page->lru, &zone->inactive_list);
+ pgmoved++;
+ if (!pagevec_add(&pvec, page)) {
+ zone->nr_inactive += pgmoved;
+ spin_unlock_irq(&zone->lru_lock);
+ pgdeactivate += pgmoved;
+ pgmoved = 0;
+ if (buffer_heads_over_limit)
+ pagevec_strip(&pvec);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ }
+ zone->nr_inactive += pgmoved;
+ pgdeactivate += pgmoved;
+ if (buffer_heads_over_limit) {
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_strip(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+
+ pgmoved = 0;
+ while (!list_empty(&l_active)) {
+ page = lru_to_page(&l_active);
+ prefetchw_prev_lru_page(page, &l_active, flags);
+ if (TestSetPageLRU(page))
+ BUG();
+ BUG_ON(!PageActive(page));
+ list_move(&page->lru, &zone->active_list);
+ pgmoved++;
+ if (!pagevec_add(&pvec, page)) {
+ zone->nr_active += pgmoved;
+ pgmoved = 0;
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ }
+ zone->nr_active += pgmoved;
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+
+ mod_page_state_zone(zone, pgrefill, pgscanned);
+ mod_page_state(pgdeactivate, pgdeactivate);
+}
+
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -103,12 +103,6 @@ struct shrinker {
long nr; /* objs pending delete */
};

-/*
- * From 0 .. 100. Higher means more swappy.
- */
-int vm_swappiness = 60;
-static long total_memory;
-
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

@@ -590,200 +584,31 @@ static void shrink_cache(struct zone *zo
}

/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_count against each page.
- * But we had to alter page->flags anyway.
- */
-static void
-refill_inactive_zone(struct zone *zone, int nr_pages)
-{
- int pgmoved;
- int pgdeactivate = 0;
- int pgscanned;
- LIST_HEAD(l_hold); /* The pages which were snipped off */
- LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
- LIST_HEAD(l_active); /* Pages to go onto the active_list */
- struct page *page;
- struct pagevec pvec;
- int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
-
- lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
- pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned);
- zone->pages_scanned += pgscanned;
- zone->nr_active -= pgmoved;
- spin_unlock_irq(&zone->lru_lock);
-
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
-
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (read_page_state(nr_mapped) * 100) / total_memory;
-
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
-
- while (!list_empty(&l_hold)) {
- cond_resched();
- page = lru_to_page(&l_hold);
- list_del(&page->lru);
- if (page_mapped(page)) {
- if (!reclaim_mapped ||
- (total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0)) {
- list_add(&page->lru, &l_active);
- continue;
- }
- }
- list_add(&page->lru, &l_inactive);
- }
-
- pagevec_init(&pvec, 1);
- pgmoved = 0;
- spin_lock_irq(&zone->lru_lock);
- while (!list_empty(&l_inactive)) {
- page = lru_to_page(&l_inactive);
- prefetchw_prev_lru_page(page, &l_inactive, flags);
- if (TestSetPageLRU(page))
- BUG();
- if (!TestClearPageActive(page))
- BUG();
- list_move(&page->lru, &zone->inactive_list);
- pgmoved++;
- if (!pagevec_add(&pvec, page)) {
- zone->nr_inactive += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pgdeactivate += pgmoved;
- pgmoved = 0;
- if (buffer_heads_over_limit)
- pagevec_strip(&pvec);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- zone->nr_inactive += pgmoved;
- pgdeactivate += pgmoved;
- if (buffer_heads_over_limit) {
- spin_unlock_irq(&zone->lru_lock);
- pagevec_strip(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
-
- pgmoved = 0;
- while (!list_empty(&l_active)) {
- page = lru_to_page(&l_active);
- prefetchw_prev_lru_page(page, &l_active, flags);
- if (TestSetPageLRU(page))
- BUG();
- BUG_ON(!PageActive(page));
- list_move(&page->lru, &zone->active_list);
- pgmoved++;
- if (!pagevec_add(&pvec, page)) {
- zone->nr_active += pgmoved;
- pgmoved = 0;
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- zone->nr_active += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
-
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
-}
-
-/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void
shrink_zone(struct zone *zone, struct scan_control *sc)
{
- unsigned long nr_active;
unsigned long nr_inactive;

atomic_inc(&zone->reclaim_in_progress);

- /*
- * Add one to `nr_to_scan' just to make sure that the kernel will
- * slowly sift through the active list.
- */
- zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
- nr_active = zone->nr_scan_active;
- if (nr_active >= sc->swap_cluster_max)
- zone->nr_scan_active = 0;
- else
- nr_active = 0;
-
- zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
- nr_inactive = zone->nr_scan_inactive;
- if (nr_inactive >= sc->swap_cluster_max)
- zone->nr_scan_inactive = 0;
- else
- nr_inactive = 0;
+ nr_inactive = (zone->nr_inactive >> sc->priority) + SWAP_CLUSTER_MAX;
+ nr_inactive &= ~(SWAP_CLUSTER_MAX - 1);

+ sc->nr_to_scan = SWAP_CLUSTER_MAX;
sc->nr_to_reclaim = sc->swap_cluster_max;

- while (nr_active || nr_inactive) {
- if (nr_active) {
- sc->nr_to_scan = min(nr_active,
- (unsigned long)sc->swap_cluster_max);
- nr_active -= sc->nr_to_scan;
- refill_inactive_zone(zone, sc->nr_to_scan);
- }
-
- if (nr_inactive) {
- sc->nr_to_scan = min(nr_inactive,
- (unsigned long)sc->swap_cluster_max);
- nr_inactive -= sc->nr_to_scan;
- shrink_cache(zone, sc);
- if (sc->nr_to_reclaim <= 0)
- break;
- }
+ while (nr_inactive >= SWAP_CLUSTER_MAX) {
+ nr_inactive -= SWAP_CLUSTER_MAX;
+ shrink_cache(zone, sc);
+ if (sc->nr_to_reclaim <= 0)
+ break;
}

- throttle_vm_writeout();
-
atomic_dec(&zone->reclaim_in_progress);
+
+ throttle_vm_writeout();
}

/*
@@ -1245,7 +1070,6 @@ static int __init kswapd_init(void)
for_each_pgdat(pgdat)
pgdat->kswapd
= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
- total_memory = nr_free_pagecache_pages();
hotcpu_notifier(cpu_callback, 0);
return 0;
}
Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -45,8 +45,6 @@ static inline void page_replace_activate
}
void page_replace_reinsert(struct zone *, struct list_head *);

-int isolate_lru_pages(int, struct list_head *, struct list_head *, int *);
-
static inline void
add_page_to_active_list(struct zone *zone, struct page *page)
{
Index: linux-2.6-git/include/linux/mmzone.h
===================================================================
--- linux-2.6-git.orig/include/linux/mmzone.h
+++ linux-2.6-git/include/linux/mmzone.h
@@ -144,7 +144,6 @@ struct zone {
struct list_head active_list;
struct list_head inactive_list;
unsigned long nr_scan_active;
- unsigned long nr_scan_inactive;
unsigned long nr_active;
unsigned long nr_inactive;
unsigned long pages_scanned; /* since last reclaim */
Index: linux-2.6-git/mm/page_alloc.c
===================================================================
--- linux-2.6-git.orig/mm/page_alloc.c
+++ linux-2.6-git/mm/page_alloc.c
@@ -2010,7 +2010,6 @@ static void __init free_area_init_core(s
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
zone->nr_scan_active = 0;
- zone->nr_scan_inactive = 0;
zone->nr_active = 0;
zone->nr_inactive = 0;
atomic_set(&zone->reclaim_in_progress, 0);
@@ -2161,7 +2160,7 @@ static int zoneinfo_show(struct seq_file
"\n high %lu"
"\n active %lu"
"\n inactive %lu"
- "\n scanned %lu (a: %lu i: %lu)"
+ "\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
zone->free_pages,
@@ -2171,7 +2170,6 @@ static int zoneinfo_show(struct seq_file
zone->nr_active,
zone->nr_inactive,
zone->pages_scanned,
- zone->nr_scan_active, zone->nr_scan_inactive,
zone->spanned_pages,
zone->present_pages);
seq_printf(m,

2005-12-30 22:42:01

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 07/14] page-replace-move-isolate_lru_pages.patch


From: Peter Zijlstra <[email protected]>

Manipulation of the page lists is done exclusively in page_replace.c.

Signed-off-by: Peter Zijlstra <[email protected]>

--- linux-2.6-git.orig/include/linux/mm_page_replace.h 2005-12-10 23:41:17.000000000 +0100
+++ linux-2.6-git/include/linux/mm_page_replace.h 2005-12-11 11:27:39.000000000 +0100
@@ -43,4 +43,6 @@
}

+int isolate_lru_pages(int, struct list_head *, struct list_head *, int *);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_PAGE_REPLACE_H */
--- linux-2.6-git.orig/mm/page_replace.c 2005-12-10 23:41:17.000000000 +0100
+++ linux-2.6-git/mm/page_replace.c 2005-12-11 11:27:39.000000000 +0100
@@ -9,3 +9,52 @@ void __page_replace_insert(struct zone *
else
add_page_to_inactive_list(zone, page);
}
+
+/*
+ * zone->lru_lock is heavily contended. Some of the functions that
+ * shrink the lists perform better by taking out a batch of pages
+ * and working on them outside the LRU lock.
+ *
+ * For pagecache intensive workloads, this function is the hottest
+ * spot in the kernel (apart from copy_*_user functions).
+ *
+ * Appropriate locks must be held before calling this function.
+ *
+ * @nr_to_scan: The number of pages to look through on the list.
+ * @src: The LRU list to pull pages off.
+ * @dst: The temp list to put pages on to.
+ * @scanned: The number of pages that were scanned.
+ *
+ * returns how many pages were moved onto *@dst.
+ */
+int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+ struct list_head *dst, int *scanned)
+{
+ int nr_taken = 0;
+ struct page *page;
+ int scan = 0;
+
+ while (scan++ < nr_to_scan && !list_empty(src)) {
+ page = lru_to_page(src);
+ prefetchw_prev_lru_page(page, src, flags);
+
+ if (!TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (get_page_testone(page)) {
+ /*
+ * It is being freed elsewhere
+ */
+ __put_page(page);
+ SetPageLRU(page);
+ list_add(&page->lru, src);
+ continue;
+ } else {
+ list_add(&page->lru, dst);
+ nr_taken++;
+ }
+ }
+
+ *scanned = scan;
+ return nr_taken;
+}
--- linux-2.6-git.orig/mm/vmscan.c 2005-12-10 23:41:17.000000000 +0100
+++ linux-2.6-git/mm/vmscan.c 2005-12-11 11:27:39.000000000 +0100
@@ -568,55 +568,6 @@
}

/*
- * zone->lru_lock is heavily contended. Some of the functions that
- * shrink the lists perform better by taking out a batch of pages
- * and working on them outside the LRU lock.
- *
- * For pagecache intensive workloads, this function is the hottest
- * spot in the kernel (apart from copy_*_user functions).
- *
- * Appropriate locks must be held before calling this function.
- *
- * @nr_to_scan: The number of pages to look through on the list.
- * @src: The LRU list to pull pages off.
- * @dst: The temp list to put pages on to.
- * @scanned: The number of pages that were scanned.
- *
- * returns how many pages were moved onto *@dst.
- */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
-{
- int nr_taken = 0;
- struct page *page;
- int scan = 0;
-
- while (scan++ < nr_to_scan && !list_empty(src)) {
- page = lru_to_page(src);
- prefetchw_prev_lru_page(page, src, flags);
-
- if (!TestClearPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (get_page_testone(page)) {
- /*
- * It is being freed elsewhere
- */
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, src);
- continue;
- } else {
- list_add(&page->lru, dst);
- nr_taken++;
- }
- }
-
- *scanned = scan;
- return nr_taken;
-}
-
-/*
* shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
*/
static void shrink_cache(struct zone *zone, struct scan_control *sc)

2005-12-30 22:46:55

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 5/9] clockpro-ignore_token.patch


From: Peter Zijlstra <[email protected]>

Re-introduce the ignore_token argument to page_referenced(); hand hot
rotation will make use of this feature.
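
The effect at the call sites elsewhere in this series: the normal pageout path
keeps honouring the swap token, while the hand-hot rotation passes
ignore_token=1 so that a token holder's pages are not reported as referenced
merely because of the token. Roughly:

	/* try_pageout(): honour the swap token as before */
	referenced = page_referenced(page, 1, 0);

	/* rotate_hot(): ignore the token to gauge the page's real hotness */
	if (!page_referenced(page, 0, 1))
		ClearPageActive(page);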

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/rmap.h | 4 ++--
mm/page_replace.c | 2 +-
mm/rmap.c | 26 ++++++++++++++++----------
mm/vmscan.c | 2 +-
4 files changed, 20 insertions(+), 14 deletions(-)

Index: linux-2.6-git-2/include/linux/rmap.h
===================================================================
--- linux-2.6-git-2.orig/include/linux/rmap.h
+++ linux-2.6-git-2/include/linux/rmap.h
@@ -89,7 +89,7 @@ static inline void page_dup_rmap(struct
/*
* Called from mm/vmscan.c to handle paging out
*/
-int page_referenced(struct page *, int is_locked);
+int page_referenced(struct page *, int is_locked, int ignore_token);
int try_to_unmap(struct page *);

/*
@@ -109,7 +109,7 @@ unsigned long page_address_in_vma(struct
#define anon_vma_prepare(vma) (0)
#define anon_vma_link(vma) do {} while (0)

-#define page_referenced(page,l) TestClearPageReferenced(page)
+#define page_referenced(page,l,i) TestClearPageReferenced(page)
#define try_to_unmap(page) SWAP_FAIL

#endif /* CONFIG_MMU */
Index: linux-2.6-git-2/mm/page_replace.c
===================================================================
--- linux-2.6-git-2.orig/mm/page_replace.c
+++ linux-2.6-git-2/mm/page_replace.c
@@ -232,7 +232,7 @@ static void refill_inactive_zone(struct
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0)) {
+ page_referenced(page, 0, 0)) {
list_add(&page->lru, &l_active);
continue;
}
Index: linux-2.6-git-2/mm/rmap.c
===================================================================
--- linux-2.6-git-2.orig/mm/rmap.c
+++ linux-2.6-git-2/mm/rmap.c
@@ -290,7 +290,7 @@ pte_t *page_check_address(struct page *p
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
static int page_referenced_one(struct page *page,
- struct vm_area_struct *vma, unsigned int *mapcount)
+ struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -311,7 +311,7 @@ static int page_referenced_one(struct pa

/* Pretend the page is referenced if the task has the
swap token and is in the middle of a page fault. */
- if (mm != current->mm && has_swap_token(mm) &&
+ if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
rwsem_is_locked(&mm->mmap_sem))
referenced++;

@@ -321,7 +321,7 @@ out:
return referenced;
}

-static int page_referenced_anon(struct page *page)
+static int page_referenced_anon(struct page *page, int ignore_token)
{
unsigned int mapcount;
struct anon_vma *anon_vma;
@@ -334,7 +334,8 @@ static int page_referenced_anon(struct p

mapcount = page_mapcount(page);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- referenced += page_referenced_one(page, vma, &mapcount);
+ referenced += page_referenced_one(page, vma, &mapcount,
+ ignore_token);
if (!mapcount)
break;
}
@@ -353,7 +354,7 @@ static int page_referenced_anon(struct p
*
* This function is only called from page_referenced for object-based pages.
*/
-static int page_referenced_file(struct page *page)
+static int page_referenced_file(struct page *page, int ignore_token)
{
unsigned int mapcount;
struct address_space *mapping = page->mapping;
@@ -391,7 +392,8 @@ static int page_referenced_file(struct p
referenced++;
break;
}
- referenced += page_referenced_one(page, vma, &mapcount);
+ referenced += page_referenced_one(page, vma, &mapcount,
+ ignore_token);
if (!mapcount)
break;
}
@@ -408,10 +410,13 @@ static int page_referenced_file(struct p
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
-int page_referenced(struct page *page, int is_locked)
+int page_referenced(struct page *page, int is_locked, int ignore_token)
{
int referenced = 0;

+ if (!swap_token_default_timeout)
+ ignore_token = 1;
+
if (page_test_and_clear_young(page))
referenced++;

@@ -420,14 +425,15 @@ int page_referenced(struct page *page, i

if (page_mapped(page) && page->mapping) {
if (PageAnon(page))
- referenced += page_referenced_anon(page);
+ referenced += page_referenced_anon(page, ignore_token);
else if (is_locked)
- referenced += page_referenced_file(page);
+ referenced += page_referenced_file(page, ignore_token);
else if (TestSetPageLocked(page))
referenced++;
else {
if (page->mapping)
- referenced += page_referenced_file(page);
+ referenced += page_referenced_file(page,
+ ignore_token);
unlock_page(page);
}
}
Index: linux-2.6-git-2/mm/vmscan.c
===================================================================
--- linux-2.6-git-2.orig/mm/vmscan.c
+++ linux-2.6-git-2/mm/vmscan.c
@@ -352,7 +352,7 @@ static try_pageout_t try_pageout(struct
if (PageWriteback(page))
goto keep_locked;

- referenced = page_referenced(page, 1);
+ referenced = page_referenced(page, 1, 0);

if (referenced)
goto activate_locked;

2005-12-30 22:48:05

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 09/14] page-replace-reinsert.patch


From: Peter Zijlstra <[email protected]>

page-replace interface function:
page_replace_reinsert()

This function will reinsert those candidate pages that were not
freed by try_pageout().
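
Roughly, and with details elided, shrink_cache() then reads as below: the
replacement policy hands out candidate pages, vmscan tries to page them out,
and whatever could not be freed is handed back to the policy (a sketch, not
the literal post-patch code):

	static void shrink_cache(struct zone *zone, struct scan_control *sc)
	{
		LIST_HEAD(page_list);
		int nr_freed;

		lru_add_drain();
		page_replace_candidates(zone, sc->nr_to_scan, &page_list);
		nr_freed = shrink_list(&page_list, sc);	/* try_pageout() each page */
		sc->nr_to_reclaim -= nr_freed;
		page_replace_reinsert(zone, &page_list);	/* put back unfreeable pages */
	}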

Signed-off-by: Peter Zijlstra <[email protected]>

include/linux/mm_page_replace.h | 1 +
mm/page_replace.c | 29 +++++++++++++++++++++++++++++
mm/vmscan.c | 25 +------------------------
3 files changed, 31 insertions(+), 24 deletions(-)

Index: linux-2.6-git/include/linux/mm_page_replace.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_page_replace.h
+++ linux-2.6-git/include/linux/mm_page_replace.h
@@ -43,6 +43,7 @@ static inline void page_replace_activate
{
SetPageActive(page);
}
+void page_replace_reinsert(struct zone *, struct list_head *);

int isolate_lru_pages(int, struct list_head *, struct list_head *, int *);

Index: linux-2.6-git/mm/page_replace.c
===================================================================
--- linux-2.6-git.orig/mm/page_replace.c
+++ linux-2.6-git/mm/page_replace.c
@@ -1,6 +1,7 @@
#include <linux/mm_page_replace.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
+#include <linux/pagevec.h>


void __page_replace_insert(struct zone *zone, struct page *page)
@@ -78,3 +79,31 @@ void page_replace_candidates(struct zone
mod_page_state_zone(zone, pgscan_direct, nr_scan);
}

+/*
+ * Put back any unfreeable pages.
+ */
+void page_replace_reinsert(struct zone *zone, struct list_head *page_list)
+{
+ struct pagevec pvec;
+
+ pagevec_init(&pvec, 1);
+ spin_lock_irq(&zone->lru_lock);
+ while (!list_empty(page_list)) {
+ struct page *page = lru_to_page(page_list);
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ list_del(&page->lru);
+ if (PageActive(page))
+ add_page_to_active_list(zone, page);
+ else
+ add_page_to_inactive_list(zone, page);
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+}
+
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -573,8 +573,6 @@ static int shrink_list(struct list_head
static void shrink_cache(struct zone *zone, struct scan_control *sc)
{
LIST_HEAD(page_list);
- struct pagevec pvec;
- struct page *page;
int nr_freed;

lru_add_drain();
@@ -589,28 +587,7 @@ static void shrink_cache(struct zone *zo
mod_page_state_zone(zone, pgsteal, nr_freed);
sc->nr_to_reclaim -= nr_freed;

- /*
- * Put back any unfreeable pages.
- */
- pagevec_init(&pvec, 1);
- spin_lock_irq(&zone->lru_lock);
- while (!list_empty(&page_list)) {
- page = lru_to_page(&page_list);
- if (TestSetPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (PageActive(page))
- add_page_to_active_list(zone, page);
- else
- add_page_to_inactive_list(zone, page);
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
+ page_replace_reinsert(zone, &page_list);
}

/*

2005-12-31 01:14:22

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 1/9] clockpro-nonresident.patch

On Fri, Dec 30, 2005 at 11:42:44PM +0100, Peter Zijlstra wrote:
>
> From: Peter Zijlstra <[email protected]>
>
> Originally started by Rik van Riel, I heavily modified the code
> to suit my needs.
>
> The nonresident code approximates a clock but sacrifices precision in order
> to accomplish faster lookups.
>
> The actual datastructure is a hash of small clocks, so that, assuming an
> equal distribution by the hash function, each clock has comparable order.
>
> TODO:
> - remove the ARC requirements.
>
> Signed-off-by: Peter Zijlstra <[email protected]>

<snip>

> + *
> + *
> + * Modified to work with ARC like algorithms who:
> + * - need to balance two FIFOs; |b1| + |b2| = c,
> + *
> + * The bucket contains four single linked cyclic lists (CLOCKS) and each
> + * clock has a tail hand. By selecting a victim clock upon insertion it
> + * is possible to balance them.
> + *
> + * The first two lists are used for B1/B2 and a third for a free slot list.
> + * The fourth list is unused.
> + *
> + * The slot looks like this:
> + * struct slot_t {
> + * u32 cookie : 24; // LSB
> + * u32 index : 6;
> + * u32 listid : 2;
> + * };

8 and 16 bit accesses are slower than 32 bit on i386 (Arjan pointed this out some time ago).

It might be faster to load a full word and shape it as necessary; I'll see if I can do
something instead of just talking. ;)
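
For what it's worth, a standalone sketch of the "load a full word and mask"
approach, assuming the usual LSB-first bitfield layout of the quoted struct
(cookie in bits 0-23, index in bits 24-29, listid in bits 30-31); the helpers
echo the GET_INDEX()/SET_INDEX() style macros in the patch but are re-derived
here for illustration, not copied from it:

#include <stdint.h>
#include <stdio.h>

#define COOKIE_MASK	0x00ffffffu		/* bits  0..23 */
#define INDEX_SHIFT	24
#define INDEX_MASK	(0x3fu << INDEX_SHIFT)	/* bits 24..29 */
#define LISTID_SHIFT	30
#define LISTID_MASK	(0x3u << LISTID_SHIFT)	/* bits 30..31 */

static inline uint32_t get_listid(uint32_t slot)
{
	return (slot & LISTID_MASK) >> LISTID_SHIFT;
}

static inline uint32_t get_index(uint32_t slot)
{
	return (slot & INDEX_MASK) >> INDEX_SHIFT;
}

static inline void set_index(uint32_t *slot, uint32_t idx)
{
	*slot = (*slot & ~INDEX_MASK) | ((idx << INDEX_SHIFT) & INDEX_MASK);
}

int main(void)
{
	uint32_t slot = 0x00abcdefu;	/* cookie only */

	set_index(&slot, 5);
	printf("listid=%u index=%u cookie=%06x\n",
	       get_listid(slot), get_index(slot), slot & COOKIE_MASK);
	return 0;
}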

> +/*
> + * For interactive workloads, we remember about as many non-resident pages
> + * as we have actual memory pages. For server workloads with large inter-
> + * reference distances we could benefit from remembering more.
> + */

This comment is bogus. Interactive or server loads have nothing to do
with the inter-reference distance. On the contrary, interactive loads
have a higher chance of containing large inter-reference distances, and
many common server loads have strong locality.

<snip>

> +++ linux-2.6-git/include/linux/swap.h
> @@ -152,6 +152,31 @@ extern void out_of_memory(gfp_t gfp_mask
> /* linux/mm/memory.c */
> extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
>
> +/* linux/mm/nonresident.c */
> +#define NR_b1 0
> +#define NR_b2 1
> +#define NR_free 2
> +#define NR_lost 3

What is the meaning of "NR_lost"?

> +
> +#define NR_listid 3
> +#define NR_found 0x80000000

2005-12-31 01:15:24

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 14/14] page-replace-kswapd-incmin.patch

On Fri, Dec 30, 2005 at 11:42:34PM +0100, Peter Zijlstra wrote:
>
> From: Nick Piggin <[email protected]>
>
> Explicitly teach kswapd about the incremental min logic instead of just scanning
> all zones under the first low zone. This should keep more even pressure applied
> on the zones.
>
> The new shrink_zone() logic exposes the very worst side of the current
> balance_pgdat() function. Without this patch reclaim is limited to ZONE_DMA.

Can you please describe the issue with overprotection of the DMA zone that you experienced?

I'll see if I can reproduce it with Nick's standalone patch on top of vanilla. What
load was that?

2005-12-31 01:16:03

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch


Hi Peter,

_Nice_ work!

IMHO you're going in the right direction, abstracting page replacement
policy away from page reclaim.

I think the final objective should be to abstract it away completely,
making it possible to select between different policies, allowing
further experimentation and implementations such as energy-efficient
algorithms.

How hard do you think it would be to enhance your patches to allow for
compile-time selectable policies?

For instance, moving "page reclaim scanner"-specific information into
its own container:

@@ -140,12 +140,13 @@ struct zone {
/* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
- struct list_head active_list;
- struct list_head inactive_list;
- unsigned long nr_scan_active;
- unsigned long nr_active;
- unsigned long nr_inactive;
+ spinlock_t lru_lock;
+ struct list_head list_hand[2];
+ unsigned long nr_resident;
+ unsigned long nr_cold;
+ unsigned long nr_cold_target;
+ unsigned long nr_nonresident_scale;
+

Such as "struct reclaim_policy_data" or a better name.

About CLOCK-Pro itself, I think that a small document with a short
introduction would be very useful, explaining that it uses inter-reference
distance instead of recency as the page replacement criterion, and why
this criterion is fundamentally more appropriate for a large set of
common access patterns: in short, a summary of the CLOCK-Pro paper.

On Fri, Dec 30, 2005 at 11:43:34PM +0100, Peter Zijlstra wrote:
>
> From: Peter Zijlstra <[email protected]>
>
> The flesh of the clockpro implementation.
>
> The paper: http://www.cs.wm.edu/hpcs/WWW/HTML/publications/abs05-3.html
> describes the aglorithm aproximated. It described a clock which has three
typo
> hands, hand cold, hot and test. The table below describes the actions
> each hand performs.
>
> res | hot | tst | ref || Hcold | Hhot | Htst || Flt
> ----+-----+-----+-----++-------+------+------++-----
> 1 | 1 | 0 | 1 ||=1101 | 1100 |=1101 ||
> 1 | 1 | 0 | 0 ||=1100 | 1000 |=1100 ||
> ----+-----+-----+-----++-------+------+------++-----
> 1 | 0 | 1 | 1 || 1100 | 1001 | 1001 ||
> 1 | 0 | 1 | 0 ||X0010 | 1000 | 1000 ||
> 1 | 0 | 0 | 1 || 1010 |=1001 |=1001 ||
> 1 | 0 | 0 | 0 ||X0000 |=1000 |=1000 ||
> ----+-----+-----+-----++-------+------+------++-----
> ----+-----+-----+-----++-------+------+------++-----
> 0 | 0 | 1 | 1 || | | || 1100
> 0 | 0 | 1 | 0 ||=0010 |X0000 |X0000 ||
> 0 | 0 | 0 | 1 || | | || 1010
>
> The approximation made is the removal of the nonresident pages from the one
> clock. The conceptual model is two clocks superimposed, one containing the
> resident and one containing the nonresident pages.
>
> Implementation wise I use something based on Rik van Riel's nonresident code
> which actually aproximates a clock with reduced order.

I'm curious about hash collisions; I would like to know more details about
the hash distribution under different loads.

It would be nice to measure the rate of updates on each hash bucket and
confirm that they are approximately even.

> The resident clock with two hands is implemented using two lists which are to
> be seen as laid head to tail to form the clock. When one hand laps the other
> the lists are swapped.

How does that differ from the original CLOCK-Pro algorithm, why, and what are
the expected outcomes? Please make it easier for others to understand why
and when the hands swap.

> Each page has 3 state bits:
>
> hot -> PageHot()
> test -> PageTest()
> ref -> page_referenced()
>
> (PG_active will be renamed to PG_hot in a following patch, since the semantics
> changed also change the name in order to avoid confusion))
>
> The HandCold rotation is driven by page reclaim needs. HandCold in turn
> drives HandHot, for every page HandCold promotes to hot HandHot needs to
> degrade one hot page to cold.

Why do you use only two clock hands and not three (HandHot, HandCold and HandTest)
as in the original paper?

> Changing the cold page target number also has influence on the HandHot
> rotation speed, when incremented the actual number of cold pages will be
> less than the desired number and hence we need to degrade some extra hot
> pages. When decreased, the actual number of cold pages is too large, so
> we would need to inhibit the degradation of hot pages.
>
> The cold page target count is maintained in zone->nr_cold_target; it is
> incremented when a page is referenced in its test period and decremented
> when a page's test period expires.
>
> The nonresident CLOCK is coupled to HandHot and is rotated so that when
> all resident zone CLOCKs have made one revolution, it too has made one
> whole revolution.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
>
> include/linux/mm_page_replace.h | 40 +-
> include/linux/mmzone.h | 13
> mm/Makefile | 2
> mm/clockpro.c | 557 ++++++++++++++++++++++++++++++++++++++++
> mm/page_alloc.c | 20 -
> mm/vmscan.c | 15 -
> 6 files changed, 603 insertions(+), 44 deletions(-)
>
> Index: linux-2.6-git/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/mmzone.h
> +++ linux-2.6-git/include/linux/mmzone.h
> @@ -140,12 +140,13 @@ struct zone {
> ZONE_PADDING(_pad1_)
>
> /* Fields commonly accessed by the page reclaim scanner */
> - spinlock_t lru_lock;
> - struct list_head active_list;
> - struct list_head inactive_list;
> - unsigned long nr_scan_active;
> - unsigned long nr_active;
> - unsigned long nr_inactive;
> + spinlock_t lru_lock;
> + struct list_head list_hand[2];
> + unsigned long nr_resident;
> + unsigned long nr_cold;
> + unsigned long nr_cold_target;
> + unsigned long nr_nonresident_scale;
> +
> unsigned long pages_scanned; /* since last reclaim */
> int all_unreclaimable; /* All pages pinned */
>
> Index: linux-2.6-git/mm/clockpro.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6-git/mm/clockpro.c
> @@ -0,0 +1,557 @@
> +/*
> + * mm/clockpro.c
> + *
> + * Written by Peter Zijlstra <[email protected]>
> + * Released under the GPLv2, see the file COPYING for details.
> + *
> + * res | h/c | tst | ref || Hcold | Hhot | Htst || Flt
> + * ----+-----+-----+-----++-------+------+------++-----
> + * 1 | 1 | 0 | 1 ||=1101 | 1100 |=1101 ||
> + * 1 | 1 | 0 | 0 ||=1100 | 1000 |=1100 ||
> + * ----+-----+-----+-----++-------+------+------++-----
> + * 1 | 0 | 1 | 1 || 1100 | 1001 | 1001 ||
> + * 1 | 0 | 1 | 0 ||X0010 | 1000 | 1000 ||
> + * 1 | 0 | 0 | 1 || 1010 |=1001 |=1001 ||
> + * 1 | 0 | 0 | 0 ||X0000 |=1000 |=1000 ||
> + * ----+-----+-----+-----++-------+------+------++-----
> + * ----+-----+-----+-----++-------+------+------++-----
> + * 0 | 0 | 1 | 1 || | | || 1100
> + * 0 | 0 | 1 | 0 ||=0010 |X0000 |X0000 ||
> + * 0 | 0 | 0 | 1 || | | || 1010

What does this mean? Can you make it easier for ignorant people like
myself to understand?

> + *
> + * h/c -> PageHot()
> + * tst -> PageTest()
> + * ref -> page_referenced()
> + *
> + * The HandCold rotation is driven by page reclaim needs. HandCold in turn
> + * drives HandHot, for every page HandCold promotes to hot HandHot needs to
> + * degrade one hot page to cold.
> + *
> + * Changing the cold page target number also has influence on the HandHot
> + * rotation speed, when incremented the actual number of cold pages will be
> + * less than the desired number and hence we need to degrade some extra hot
> + * pages. When decreased, the actual number of cold pages is too large, so
> + * we would need to inhibit the degradation of hot pages.
> + *
> + * The cold page target count is maintained in zone->nr_cold_target; it is
> + * incremented when a page is referenced in its test period and decremented
> + * when a page's test period expires.
> + *
> + * The nonresident CLOCK is coupled to HandHot and is rotated so that when
> + * all resident zone CLOCKs have made one revolution, it too has made one
> + * whole revolution (see __nonres_term()).
> + *
> + * All functions that are prefixed with '__' assume that zone->lru_lock is taken.
> + */
> +
> +#include <linux/mm_page_replace.h>
> +#include <linux/rmap.h>
> +#include <linux/buffer_head.h>
> +#include <linux/pagevec.h>
> +#include <linux/bootmem.h>
> +#include <linux/init.h>
> +
> +#include <asm/div64.h>
> +
> +/*
> + * From 0 .. 100. Higher means more swappy.
> + */
> +int vm_swappiness = 100;
> +static long total_memory;
> +
> +static int __init page_replace_init(void)
> +{
> + total_memory = nr_free_pagecache_pages();
> + return 0;
> +}
> +
> +module_init(page_replace_init)
> +
> +/* Called to initialize the clockpro parameters */
> +void __init page_replace_init_zone(struct zone *zone)
> +{
> + INIT_LIST_HEAD(&zone->list_hand[0]);
> + INIT_LIST_HEAD(&zone->list_hand[1]);
> + zone->nr_resident = 0;
> + zone->nr_cold = 0;
> + zone->nr_cold_target = zone->pages_high;
> + zone->nr_nonresident_scale = 0;
> +}
> +
> +/*
> + * Increase the cold pages target; limit it to the total number of resident
> + * pages present in the current zone.
> + *
> + * @zone: current zone
> + * @dct: intended increase
> + */
> +static void __cold_target_inc(struct zone *zone, unsigned long dct)
> +{
> + if (zone->nr_cold_target < zone->nr_resident - dct)
> + zone->nr_cold_target += dct;
> + else
> + zone->nr_cold_target = zone->nr_resident;
> +}
> +
> +/*
> + * Decrease the cold pages target; limit it to the high watermark in order
> + * to always have some pages available for quick reclaim.
> + *
> + * @zone: current zone
> + * @dct: intended decrease
> + */
> +static void __cold_target_dec(struct zone *zone, unsigned long dct)
> +{
> + if (zone->nr_cold_target > zone->pages_high + dct)
> + zone->nr_cold_target -= dct;
> + else
> + zone->nr_cold_target = zone->pages_high;
> +}
> +
> +static void swap_lists(struct zone *zone)
> +{
> + LIST_HEAD(tmp);
> + list_splice_init(&zone->list_hand[0], &tmp);
> + list_splice_init(&zone->list_hand[1], &zone->list_hand[0]);
> + list_splice(&tmp, &zone->list_hand[1]);
> +}
> +
> +static inline
> +void __select_list_hand(struct zone *zone, struct list_head *list)
> +{
> + if (list_empty(list))
> + swap_lists(zone);
> +}
> +
> +/*
> + * Insert page into @zones clock and update adaptive parameters.
> + *
> + * Several page flags are used for insertion hints:
> + * PG_active - insert as an active page
> + * PG_test - use the use-once logic
> + *
> + * For now we will ignore the active hint; the use once logic is
> + * explained below.
> + *
> + * @zone: target zone.
> + * @page: new page.
> + */
> +void __page_replace_insert(struct zone *zone, struct page *page)
> +{
> + unsigned int rflags;
> +
> + rflags = nonresident_get(page_mapping(page), page_index(page));
> +
> + /* ignore the PG_active hint */
> + ClearPageActive(page);
> +
> + /* abuse the PG_test flag for pagecache use-once */
> + if (!TestClearPageTest(page)) {
> + /*
> + * Insert (hot) when found in the nonresident list, otherwise
> + * insert as (cold,test). Insert at the head of the Hhot list,
> + * ie. right behind Hcold.
> + */
> + if (rflags & NR_found) {
> + SetPageActive(page);
> + __cold_target_inc(zone, 1);
> + } else {
> + SetPageTest(page);
> + ++zone->nr_cold;
> + }
> + ++zone->nr_resident;
> + __select_list_hand(zone, &zone->list_hand[hand_hot]);
> + list_add(&page->lru, &zone->list_hand[hand_hot]);
> + } else {
> + /*
> + * Pagecache insert; we want to avoid activation on the first
> + * reference (which we know will come); use-once logic.
> + *
> + * This is accomplished by inserting the page one state lower
> + * than usual so the activation that does come ups it to the
> + * normal insert state. Also we insert right behind Hhot so
> + * 1) Hhot cannot interfere; and 2) we lose the first reference
> + * quicker.
> + *
> + * Insert (cold,test)/(cold) so the following activation will
> + * elevate the state to (hot)/(cold,test). (NOTE: the activation
> + * will take care of the cold target increment).
> + */
> + BUG_ON(PageTest(page));
> +
> + if (rflags & NR_found) {
> + SetPageTest(page);
> + }
> + ++zone->nr_cold;
> + ++zone->nr_resident;
> + __select_list_hand(zone, &zone->list_hand[hand_cold]);
> + list_add(&page->lru, &zone->list_hand[hand_cold]);
> + }
> +
> + BUG_ON(!PageLRU(page));
> +}
> +
> +/*
> + * zone->lru_lock is heavily contended. Some of the functions that
> + * shrink the lists perform better by taking out a batch of pages
> + * and working on them outside the LRU lock.
> + *
> + * For pagecache intensive workloads, this function is the hottest
> + * spot in the kernel (apart from copy_*_user functions).
> + *
> + * @nr_to_scan: The number of pages to look through on the list.
> + * @src: The LRU list to pull pages off.
> + * @dst: The temp list to put pages on to.
> + * @scanned: The number of pages that were scanned.
> + *
> + * returns how many pages were moved onto *@dst.
> + */
> +static int isolate_lru_pages(struct zone * zone, int nr_to_scan,
> + struct list_head *src, struct list_head *dst, int *scanned)
> +{
> + int nr_taken = 0;
> + struct page *page;
> + int scan = 0;
> +
> + spin_lock_irq(&zone->lru_lock);
> + __select_list_hand(zone, src);
> + while (scan++ < nr_to_scan && !list_empty(src)) {
> + page = lru_to_page(src);
> + prefetchw_prev_lru_page(page, src, flags);
> +
> + if (!TestClearPageLRU(page))
> + BUG();
> + list_del(&page->lru);
> + if (get_page_testone(page)) {
> + /*
> + * It is being freed elsewhere
> + */
> + __put_page(page);
> + SetPageLRU(page);
> + list_add(&page->lru, src);
> + continue;
> + } else {
> + list_add(&page->lru, dst);
> + nr_taken++;
> + if (!PageActive(page))
> + --zone->nr_cold;
> + }
> + }
> + zone->nr_resident -= nr_taken;
> + zone->pages_scanned += scan;
> + spin_unlock_irq(&zone->lru_lock);
> +
> + *scanned = scan;
> + return nr_taken;
> +}
> +
> +/*
> + * Add page to a release pagevec, temp. drop zone lock to release pagevec if full.
> + * Set PG_lru, update zone->nr_cold and zone->nr_resident.
> + *
> + * @zone: @pages zone.
> + * @page: page to be released.
> + * @pvec: pagevec to collect pages in.
> + */
> +static void __page_release(struct zone *zone, struct page *page,
> + struct pagevec *pvec)
> +{
> + if (TestSetPageLRU(page))
> + BUG();
> + if (!PageActive(page))
> + ++zone->nr_cold;
> + ++zone->nr_resident;
> +
> + if (!pagevec_add(pvec, page)) {
> + spin_unlock_irq(&zone->lru_lock);
> + if (buffer_heads_over_limit)
> + pagevec_strip(pvec);
> + __pagevec_release(pvec);
> + spin_lock_irq(&zone->lru_lock);
> + }
> +}
> +
> +/*
> + * Try to reclaim a specified number of pages.
> + *
> + * Reclaim cadidates have:
> + * - PG_lru cleared
> + * - 1 extra ref
> + *
> + * NOTE: hot pages are also returned but will be spit back by try_pageout()
> + * this to preserve CLOCK order.
> + *
> + * @zone: target zone to reclaim pages from.
> + * @nr_to_scan: nr of pages to try for reclaim.
> + *
> + * returns candidate list.
> + */
> +void page_replace_candidates(struct zone *zone, int nr_to_scan, struct list_head *page_list)
> +{
> + int nr_scan;
> +
> + isolate_lru_pages(zone, nr_to_scan,
> + &zone->list_hand[hand_cold],
> + page_list, &nr_scan);
> +
> + if (current_is_kswapd())
> + mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
> + else
> + mod_page_state_zone(zone, pgscan_direct, nr_scan);
> +}
> +
> +/*
> + * Activate a cold page:
> + * cold, !test -> cold, test
> + * cold, test -> hot
> + *
> + * @page: page to activate
> + */
> +void page_replace_activate(struct page *page)
> +{
> + int hot, test;
> +
> + hot = PageActive(page);
> + test = PageTest(page);
> +
> + if (hot) {
> + BUG_ON(test);
> + } else {
> + if (test) {
> + SetPageActive(page);
> + /*
> + * Leave PG_test set for new hot pages in order to
> + * recognise then in reinsert() and do accounting.
> + */
> + } else {
> + SetPageTest(page);
> + }
> + }
> +}
> +
> +static int reclaim_mapped(struct zone *);
> +static void rotate_hot(struct zone *, int, int, struct pagevec *);
> +
> +/*
> + * Reinsert those candidate pages that were not freed by try_pageout().
> + * Account pages that were promoted to hot by page_replace_activate().
> + * Rotate hand hot to balance the new hot and lost cold pages vs.
> + * the cold pages target.
> + *
> + * Candidate pages have:
> + * - PG_lru cleared
> + * - 1 extra ref
> + * undo that.
> + *
> + * @zone: zone we're working on.
> + * @page_list: the left over pages.
> + */
> +void page_replace_reinsert(struct zone *zone, struct list_head *page_list)
> +{
> + struct pagevec pvec;
> + unsigned long dct = 0;
> +
> + pagevec_init(&pvec, 1);
> + spin_lock_irq(&zone->lru_lock);
> + __select_list_hand(zone, &zone->list_hand[hand_hot]);
> + while (!list_empty(page_list)) {
> + struct page *page = lru_to_page(page_list);
> + prefetchw_prev_lru_page(page, page_list, flags);
> +
> + if (PageActive(page) && PageTest(page)) {
> + ClearPageTest(page);
> + ++dct;
> + }
> +
> + list_move(&page->lru, &zone->list_hand[hand_hot]);
> + __page_release(zone, page, &pvec);
> + }
> + __cold_target_inc(zone, dct);
> + spin_unlock_irq(&zone->lru_lock);
> +
> + /*
> + * Limit the hot hand to a full revolution.
> + */
> + if (zone->nr_cold < zone->nr_cold_target) {
> + int i, nr = zone->nr_resident / SWAP_CLUSTER_MAX;
> + int rm = reclaim_mapped(zone);
> + for (i = 0; zone->nr_cold < zone->nr_cold_target && i < nr; ++i)
> + rotate_hot(zone, SWAP_CLUSTER_MAX, rm, &pvec);
> + }
> +
> + pagevec_release(&pvec);
> +}
> +
> +/*
> + * Puts cold pages that have their test bit set on the non-resident lists.
> + *
> + * @zone: dead pages zone.
> + * @page: dead page.
> + */
> +void page_replace_remember(struct zone *zone, struct page *page)
> +{
> + if (TestClearPageTest(page)) {
> + int list = nonresident_put(page_mapping(page),
> + page_index(page), NR_b1, NR_b1);
> + if (list != NR_free)
> + __cold_target_dec(zone, 1);
> + }
> +}
> +
> +static unsigned long estimate_pageable_memory(void)
> +{
> +#if 0
> + static unsigned long next_check;
> + static unsigned long total = 0;
> +
> + if (!total || time_after(jiffies, next_check)) {
> + struct zone *z;
> + total = 0;
> + for_each_zone(z)
> + total += z->nr_resident;
> + next_check = jiffies + HZ/10;
> + }
> +
> + // gave 0 first time, SIGFPE in kernel sucks
> + // hence the !total
> +#else
> + unsigned long total = 0;
> + struct zone *z;
> + for_each_zone(z)
> + total += z->nr_resident;
> +#endif
> + return total;
> +}
> +
> +static int reclaim_mapped(struct zone *zone)
> +{
> + long mapped_ratio;
> + long distress;
> + long swap_tendency;
> +
> + /*
> + * `distress' is a measure of how much trouble we're having reclaiming
> + * pages. 0 -> no problems. 100 -> great trouble.
> + */
> + distress = 100 >> zone->prev_priority;
> +
> + /*
> + * The point of this algorithm is to decide when to start reclaiming
> + * mapped memory instead of just pagecache. Work out how much memory
> + * is mapped.
> + */
> + mapped_ratio = (read_page_state(nr_mapped) * 100) / total_memory;
> +
> + /*
> + * Now decide how much we really want to unmap some pages. The mapped
> + * ratio is downgraded - just because there's a lot of mapped memory
> + * doesn't necessarily mean that page reclaim isn't succeeding.
> + *
> + * The distress ratio is important - we don't want to start going oom.
> + *
> + * A 100% value of vm_swappiness overrides this algorithm altogether.
> + */
> + swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
> +
> + /*
> + * Now use this metric to decide whether to start moving mapped memory
> + * onto the inactive list.
> + */
> + if (swap_tendency >= 100)
> + return 1;
> +
> + return 0;
> +}
> +
> +/*
> + * Rotate the non-resident hand; scale the rotation speed so that when all
> + * hot hands

all hot hands?

> have made one full revolution the non-resident hand will have
> + * too.
> + *
> + * @zone: current zone
> + * @dh: number of pages the hot hand has moved
> + */
> +static void __nonres_term(struct zone *zone, unsigned long dh)
> +{
> + unsigned long long cycles;
> + /*
> + * |b1| Rhot |B| Rhot
> + * Rtest = ----------- ~ ----------
> + * |r1| |R|
> + *
> + * NOTE depends on |B|, hence include the nonresident_del patch
> + */
> + cycles = zone->nr_nonresident_scale + (unsigned long long)dh * nonresident_estimate();
> + zone->nr_nonresident_scale = do_div(cycles, estimate_pageable_memory() + 1UL);
> + __get_cpu_var(nonres_cycle) += (u32)cycles;
> + __cold_target_dec(zone, cycles);
> +}
> +
> +/*
> + * Rotate hand hot;
> + *
> + * @zone: current zone
> + * @nr_to_scan: batch quanta
> + * @reclaim_mapped: whether to demote mapped pages too
> + * @pvec: release pagevec
> + */
> +static void rotate_hot(struct zone *zone, int nr_to_scan, int reclaim_mapped,
> + struct pagevec *pvec)
> +{
> + LIST_HEAD(l_hold);
> + LIST_HEAD(l_tmp);
> + unsigned long dh = 0, dct = 0;
> + int pgscanned;
> + int pgdeactivate = 0;
> + int nr_taken;
> +
> + nr_taken = isolate_lru_pages(zone, nr_to_scan,
> + &zone->list_hand[hand_hot],
> + &l_hold, &pgscanned);
> +
> + mod_page_state_zone(zone, pgrefill, pgscanned);
> +
> + while (!list_empty(&l_hold)) {
> + struct page *page = lru_to_page(&l_hold);
> + prefetchw_prev_lru_page(page, &l_hold, flags);
> +
> + if (PageActive(page)) {
> + BUG_ON(PageTest(page));
> +
> + /*
> + * Ignore the swap token; this is not actual reclaim
> + * and it will give a better reflection of the actual
> + * hotness of pages.
> + *
> + * XXX do something with this reclaim_mapped stuff.
> + */
> + if (/*(((reclaim_mapped && mapped) || !mapped) ||
> + (total_swap_pages == 0 && PageAnon(page))) && */
> + !page_referenced(page, 0, 1)) {
> + ClearPageActive(page);
> + ++pgdeactivate;
> + }
> +
> + ++dh;
> + } else {
> + if (TestClearPageTest(page))
> + ++dct;
> + }
> + list_move(&page->lru, &l_tmp);
> +
> + cond_resched();
> + }
> +
> + spin_lock_irq(&zone->lru_lock);
> + while (!list_empty(&l_tmp)) {
> + struct page *page = lru_to_page(&l_tmp);
> + prefetchw_prev_lru_page(page, &l_tmp, flags);
> + list_move(&page->lru, &zone->list_hand[hand_cold]);
> + __page_release(zone, page, pvec);
> + }
> + __nonres_term(zone, nr_taken);
> + __cold_target_dec(zone, dct);
> + spin_unlock_irq(&zone->lru_lock);
> +
> + mod_page_state(pgdeactivate, pgdeactivate);
> +}
> Index: linux-2.6-git/include/linux/mm_page_replace.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/mm_page_replace.h
> +++ linux-2.6-git/include/linux/mm_page_replace.h
> @@ -7,6 +7,7 @@
> #include <linux/mm.h>
> #include <linux/list.h>
> #include <linux/page-flags.h>
> +#include <linux/swap.h>
>
> #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
>
> @@ -38,44 +39,41 @@
> #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
> #endif
>
> +enum {
> + hand_hot = 0,
> + hand_cold = 1
> +};
> +
> void __init page_replace_init_zone(struct zone *);
> void __page_replace_insert(struct zone *, struct page *);
> void page_replace_candidates(struct zone *, int, struct list_head *);
> -
> -static inline
> -void page_replace_activate(struct page *page)
> -{
> - SetPageActive(page);
> -}
> -
> +void page_replace_activate(struct page *);
> void page_replace_reinsert(struct zone *, struct list_head *);
> +void page_replace_remember(struct zone *, struct page *);
>
> +
> +/*
> + * Make page available for direct reclaim.
> + *
> + * @zone: page's zone.
> + * @page: page.
> + */
> static inline
> void __page_replace_rotate_reclaimable(struct zone *zone, struct page *page)
> {
> if (PageLRU(page) && !PageActive(page)) {
> - list_move_tail(&page->lru, &zone->inactive_list);
> + list_move_tail(&page->lru, &zone->list_hand[hand_cold]);
> inc_page_state(pgrotated);
> }
> }
>
> static inline void
> -add_page_to_active_list(struct zone *zone, struct page *page)
> -{
> - list_add(&page->lru, &zone->active_list);
> - zone->nr_active++;
> -}
> -
> -static inline void
> del_page_from_lru(struct zone *zone, struct page *page)
> {
> list_del(&page->lru);
> - if (PageActive(page)) {
> - ClearPageActive(page);
> - zone->nr_active--;
> - } else {
> - zone->nr_inactive--;
> - }
> + --zone->nr_resident;
> + if (!TestClearPageActive(page))
> + --zone->nr_cold;
> }
>
> #endif /* __KERNEL__ */
> Index: linux-2.6-git/mm/vmscan.c
> ===================================================================
> --- linux-2.6-git.orig/mm/vmscan.c
> +++ linux-2.6-git/mm/vmscan.c
> @@ -339,10 +339,11 @@ static try_pageout_t try_pageout(struct
> int may_enter_fs;
> int referenced;
>
> - if (TestSetPageLocked(page))
> + if (PageActive(page))
> goto keep;
>
> - BUG_ON(PageActive(page));
> + if (TestSetPageLocked(page))
> + goto keep;
>
> sc->nr_scanned++;
> /* Double the slab pressure for mapped and swapcache pages */
> @@ -467,6 +468,7 @@ static try_pageout_t try_pageout(struct
> #ifdef CONFIG_SWAP
> if (PageSwapCache(page)) {
> swp_entry_t swap = { .val = page_private(page) };
> + page_replace_remember(page_zone(page), page);
> __delete_from_swap_cache(page);
> write_unlock_irq(&mapping->tree_lock);
> swap_free(swap);
> @@ -475,6 +477,7 @@ static try_pageout_t try_pageout(struct
> }
> #endif /* CONFIG_SWAP */
>
> + page_replace_remember(page_zone(page), page);
> __remove_from_page_cache(page);
> write_unlock_irq(&mapping->tree_lock);
> __put_page(page);
> @@ -572,7 +575,7 @@ shrink_zone(struct zone *zone, struct sc
>
> atomic_inc(&zone->reclaim_in_progress);
>
> - nr_inactive = (zone->nr_inactive >> sc->priority) + SWAP_CLUSTER_MAX;
> + nr_inactive = (zone->nr_resident >> sc->priority) + SWAP_CLUSTER_MAX;
> nr_inactive &= ~(SWAP_CLUSTER_MAX - 1);
>
> sc->nr_to_scan = SWAP_CLUSTER_MAX;
> @@ -667,7 +670,7 @@ int try_to_free_pages(struct zone **zone
> continue;
>
> zone->temp_priority = DEF_PRIORITY;
> - lru_pages += zone->nr_active + zone->nr_inactive;
> + lru_pages += zone->nr_resident;
> }
>
> for (priority = DEF_PRIORITY; priority >= 0; priority--) {
> @@ -811,14 +814,14 @@ loop_again:
> zone->temp_priority = priority;
> if (zone->prev_priority > priority)
> zone->prev_priority = priority;
> - lru_pages += zone->nr_active + zone->nr_inactive;
> + lru_pages += zone->nr_resident;
>
> atomic_inc(&zone->reclaim_in_progress);
> shrink_zone(zone, &sc);
> atomic_dec(&zone->reclaim_in_progress);
>
> if (zone->pages_scanned >=
> - (zone->nr_active + zone->nr_inactive) * 4)
> + (zone->nr_resident) * 4)
> zone->all_unreclaimable = 1;
> }
> reclaim_state->reclaimed_slab = 0;
> Index: linux-2.6-git/mm/Makefile
> ===================================================================
> --- linux-2.6-git.orig/mm/Makefile
> +++ linux-2.6-git/mm/Makefile
> @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o
> obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
> page_alloc.o page-writeback.o pdflush.o \
> readahead.o slab.o swap.o truncate.o vmscan.o \
> - prio_tree.o page_replace.o $(mmu-y)
> + prio_tree.o clockpro.o $(mmu-y)
>
> obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o \
> nonresident.o
> Index: linux-2.6-git/mm/page_alloc.c
> ===================================================================
> --- linux-2.6-git.orig/mm/page_alloc.c
> +++ linux-2.6-git/mm/page_alloc.c
> @@ -1263,8 +1263,8 @@ void __get_zone_counts(unsigned long *ac
> *inactive = 0;
> *free = 0;
> for (i = 0; i < MAX_NR_ZONES; i++) {
> - *active += zones[i].nr_active;
> - *inactive += zones[i].nr_inactive;
> + *active += zones[i].nr_resident - zones[i].nr_cold;
> + *inactive += zones[i].nr_cold;
> *free += zones[i].free_pages;
> }
> }
> @@ -1387,8 +1387,8 @@ void show_free_areas(void)
> " min:%lukB"
> " low:%lukB"
> " high:%lukB"
> - " active:%lukB"
> - " inactive:%lukB"
> + " resident:%lukB"
> + " cold:%lukB"
> " present:%lukB"
> " pages_scanned:%lu"
> " all_unreclaimable? %s"
> @@ -1398,8 +1398,8 @@ void show_free_areas(void)
> K(zone->pages_min),
> K(zone->pages_low),
> K(zone->pages_high),
> - K(zone->nr_active),
> - K(zone->nr_inactive),
> + K(zone->nr_resident),
> + K(zone->nr_cold),
> K(zone->present_pages),
> zone->pages_scanned,
> (zone->all_unreclaimable ? "yes" : "no")
> @@ -2156,8 +2156,8 @@ static int zoneinfo_show(struct seq_file
> "\n min %lu"
> "\n low %lu"
> "\n high %lu"
> - "\n active %lu"
> - "\n inactive %lu"
> + "\n resident %lu"
> + "\n cold %lu"
> "\n scanned %lu"
> "\n spanned %lu"
> "\n present %lu",
> @@ -2165,8 +2165,8 @@ static int zoneinfo_show(struct seq_file
> zone->pages_min,
> zone->pages_low,
> zone->pages_high,
> - zone->nr_active,
> - zone->nr_inactive,
> + zone->nr_resident,
> + zone->nr_cold,
> zone->pages_scanned,
> zone->spanned_pages,
> zone->present_pages);
> -

2005-12-31 01:23:11

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Fri, 30 Dec 2005, Marcelo Tosatti wrote:

> I think that final objective should be to abstract it away completely,
> making it possible to select between different policies, allowing
> further experimentation and implementations such as energy efficient
> algorithms.

I'm not convinced. That might just make vmscan.c harder to read ;)

> About CLOCK-Pro itself, I think that a small document with a short
> introduction would be very useful...

http://linux-mm.org/AdvancedPageReplacement

> > The HandCold rotation is driven by page reclaim needs. HandCold in turn
> > drives HandHot, for every page HandCold promotes to hot HandHot needs to
> > degrade one hot page to cold.
>
> Why do you use only two clock hands and not three (HandHot, HandCold and
> HandTest) as in the original paper?

Because the non-resident pages cannot be in the clock.
This is both because of space overhead, and because the
non-resident list cannot be per zone.

I agree though, Peter's patch could use a lot more
documentation.

--
All Rights Reversed

2005-12-31 03:31:24

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

Hi Rik!

On Fri, Dec 30, 2005 at 08:22:12PM -0500, Rik van Riel wrote:
> On Fri, 30 Dec 2005, Marcelo Tosatti wrote:
>
> > I think that final objective should be to abstract it away completely,
> > making it possible to select between different policies, allowing
> > further experimentation and implementations such as energy efficient
> > algorithms.
>
> I'm not convinced. That might just make vmscan.c harder to read ;)

Are you serious or just joking? :)

Sure it might make it harder to read, but allowing selectable policies
is very interesting. Peter's patches go half-way into that direction.

Let's say, if CLOCK-Pro underperforms for a given workload (take into
account that it's simply optimizing reclaim for a subset of all existing
access patterns, ie. heuristics), it would be easier for people to
develop/use different policies.

> > About CLOCK-Pro itself, I think that a small document with a short
> > introduction would be very useful...
>
> http://linux-mm.org/AdvancedPageReplacement

I meant something more like Documentation/vm/clockpro.txt, for easier
reading of patch reviewers and community in general.

> > > The HandCold rotation is driven by page reclaim needs. HandCold in turn
> > > drives HandHot, for every page HandCold promotes to hot HandHot needs to
> > > degrade one hot page to cold.
> >
> > Why do you use only two clock hands and not three (HandHot, HandCold and
> > HandTest) as in the original paper?
>
> Because the non-resident pages cannot be in the clock.
> This is both because of space overhead, and because the
> non-resident list cannot be per zone.

I see - that is a fundamental change from the original CLOCK-Pro
algorithm, right?

Do you have a clear idea about the consequences of not having
non-resident pages in the clock?

> I agree though, Peter's patch could use a lot more
> documentation.

2005-12-31 05:25:48

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Sat, 31 Dec 2005, Marcelo Tosatti wrote:

> I meant something more like Documentation/vm/clockpro.txt, for easier
> reading of patch reviewers and community in general.

Agreed.

> > > Why do you use only two clock hands and not three (HandHot, HandCold and
> > > HandTest) as in the original paper?
> >
> > Because the non-resident pages cannot be in the clock.
> > This is both because of space overhead, and because the
> > non-resident list cannot be per zone.
>
> I see - that is a fundamental change from the original CLOCK-Pro
> algorithm, right?
>
> Do you have a clear idea about the consequences of not having
> non-resident pages in the clock?

The consequence is that we could falsely consider a non-resident
page to be active, or not to be active. However, this can only
happen if we let the scan rate in each of the memory zones get
way too much out of whack (which is bad regardless).

--
All Rights Reversed

2005-12-31 07:04:07

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 01/14] page-replace-single-batch-insert.patch

Hi Peter,

On Fri, Dec 30, 2005 at 11:40:24PM +0100, Peter Zijlstra wrote:
>
> From: Peter Zijlstra <[email protected]>
>
> page-replace interface function:
> __page_replace_insert()
>
> This function inserts a page into the page replace data structure.
>
> Unify the active and inactive per cpu page lists. For now provide insertion
> hints using the LRU specific page flags.

Unification of active and inactive per cpu page lists is a requirement
for CLOCK-Pro, right?

Would be nicer to have unchanged functionality from vanilla VM
(including the active/inactive per cpu lists).

Happy new year!

2005-12-31 09:43:55

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 01/14] page-replace-single-batch-insert.patch

On Sat, 2005-12-31 at 05:03 -0200, Marcelo Tosatti wrote:
> Hi Peter,
>
> On Fri, Dec 30, 2005 at 11:40:24PM +0100, Peter Zijlstra wrote:
> >
> > From: Peter Zijlstra <[email protected]>
> >
> > page-replace interface function:
> > __page_replace_insert()
> >
> > This function inserts a page into the page replace data structure.
> >
> > Unify the active and inactive per cpu page lists. For now provide insertion
> > hints using the LRU specific page flags.
>
> Unification of active and inactive per cpu page lists is a requirement
> for CLOCK-Pro, right?
>
> Would be nicer to have unchanged functionality from vanilla VM
> (including the active/inactive per cpu lists).

I guess I could pull all that pcp stuff into page_replace if that would
make you happy ;-)

> Happy new year!

Best wishes!

--
Peter Zijlstra <[email protected]>

2005-12-31 09:41:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 14/14] page-replace-kswapd-incmin.patch

On Fri, 2005-12-30 at 23:15 -0200, Marcelo Tosatti wrote:
> On Fri, Dec 30, 2005 at 11:42:34PM +0100, Peter Zijlstra wrote:
> >
> > From: Nick Piggin <[email protected]>
> >
> > Explicitly teach kswapd about the incremental min logic instead of just scanning
> > all zones under the first low zone. This should keep more even pressure applied
> > on the zones.
> >
> > The new shrink_zone() logic exposes the very worst side of the current
> > balance_pgdat() function. Without this patch reclaim is limited to ZONE_DMA.
>
> Can you please describe the issue with over protection of DMA zone you experienced?
>
> I'll see if I can reproduce it with Nick's standalone patch on top of vanilla, what
> load was that?

With the mdb bench the following behaviour was observed:
(mem=128M)

- PageCache would fill zone_normal
- PageCache would fill zone_dma
- reclaim starts
- initially things look right
- after a while zone_dma is reclaimed so fast that it frequently gets a
full eviction (nr_resident == 0).
- from this point onward zone_normal practically sits idle and zone_dma
goes wild with all the action.

--
Peter Zijlstra <[email protected]>

2005-12-31 09:55:14

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/9] clockpro-nonresident.patch

On Fri, 2005-12-30 at 23:13 -0200, Marcelo Tosatti wrote:
> On Fri, Dec 30, 2005 at 11:42:44PM +0100, Peter Zijlstra wrote:
> >
> > From: Peter Zijlstra <[email protected]>
> >
> > Originally started by Rik van Riel, I heavily modified the code
> > to suit my needs.
> >
> > The nonresident code approximates a clock but sacrifices precision in order
> > to accomplish faster lookups.
> >
> > The actual datastructure is a hash of small clocks, so that, assuming an
> > equal distribution by the hash function, each clock has comparable order.
> >
> > TODO:
> > - remove the ARC requirements.
> >
> > Signed-off-by: Peter Zijlstra <[email protected]>
>
> <snip>
>
> > + *
> > + *
> > + * Modified to work with ARC like algorithms who:
> > + * - need to balance two FIFOs; |b1| + |b2| = c,
> > + *
> > + * The bucket contains four single linked cyclic lists (CLOCKS) and each
> > + * clock has a tail hand. By selecting a victim clock upon insertion it
> > + * is possible to balance them.
> > + *
> > + * The first two lists are used for B1/B2 and a third for a free slot list.
> > + * The fourth list is unused.
> > + *
> > + * The slot looks like this:
> > + * struct slot_t {
> > + * u32 cookie : 24; // LSB
> > + * u32 index : 6;
> > + * u32 listid : 2;
> > + * };
>
> 8 and 16 bit accesses are slower than 32 bit on i386 (Arjan pointed this out sometime ago).
>
> Might be faster to load a full word and shape it as necessary, will see if I can do
> something instead of talking. ;)

everything is 32-bit except for the hands, but yes, this code needs to be
redone.
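
For illustration, a small userspace sketch of the full-word access Marcelo
suggests (this is not the actual nonresident.c code; the field widths come
from the struct slot_t comment quoted above and all helper names are made
up): the three fields are packed into, and extracted from, a single u32 with
shifts and masks, so every access is a 32-bit load.

#include <stdint.h>

#define SLOT_COOKIE_BITS	24
#define SLOT_INDEX_BITS		6
#define SLOT_LISTID_BITS	2

#define SLOT_COOKIE_MASK	((1u << SLOT_COOKIE_BITS) - 1)
#define SLOT_INDEX_MASK		((1u << SLOT_INDEX_BITS) - 1)
#define SLOT_LISTID_MASK	((1u << SLOT_LISTID_BITS) - 1)

/* pack cookie (LSB), index and listid into one 32-bit slot word */
static inline uint32_t slot_pack(uint32_t cookie, uint32_t index, uint32_t listid)
{
	return (cookie & SLOT_COOKIE_MASK) |
	       ((index & SLOT_INDEX_MASK) << SLOT_COOKIE_BITS) |
	       ((listid & SLOT_LISTID_MASK) << (SLOT_COOKIE_BITS + SLOT_INDEX_BITS));
}

static inline uint32_t slot_cookie(uint32_t slot)
{
	return slot & SLOT_COOKIE_MASK;
}

static inline uint32_t slot_index(uint32_t slot)
{
	return (slot >> SLOT_COOKIE_BITS) & SLOT_INDEX_MASK;
}

static inline uint32_t slot_listid(uint32_t slot)
{
	return (slot >> (SLOT_COOKIE_BITS + SLOT_INDEX_BITS)) & SLOT_LISTID_MASK;
}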

> > +/*
> > + * For interactive workloads, we remember about as many non-resident pages
> > + * as we have actual memory pages. For server workloads with large inter-
> > + * reference distances we could benefit from remembering more.
> > + */
>
> This comment is bogus. Interactive or server loads have nothing to do
> with the inter reference distance. To the contrary, interactive loads
> have a higher chance to contain large inter reference distances, and
> many common server loads have strong locality.
>
> <snip>

Happy to drop it, Rik?

> > +++ linux-2.6-git/include/linux/swap.h
> > @@ -152,6 +152,31 @@ extern void out_of_memory(gfp_t gfp_mask
> > /* linux/mm/memory.c */
> > extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
> >
> > +/* linux/mm/nonresident.c */
> > +#define NR_b1 0
> > +#define NR_b2 1
> > +#define NR_free 2
> > +#define NR_lost 3
>
> What is the meaning of "NR_lost" ?

It should have read NR_unused; it is the available fourth hand, which is
unused. I just put it there for completeness' sake and remember
struggling with the name while doing it; guess I should've taken that as
a hint.

--
Peter Zijlstra <[email protected]>

2005-12-31 10:49:09

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Fri, 2005-12-30 at 22:24 -0200, Marcelo Tosatti wrote:
> Hi Peter,
>
> _Nice_ work!

Thanks!

> IMHO you're going into the right direction, abstracting away page
> replacement policy from page reclaim.
>
> I think that final objective should be to abstract it away completely,
> making it possible to select between different policies, allowing
> further experimentation and implementations such as energy efficient
> algorithms.
>
> How hard do you think would it be to enhance your patches to allow for
> compile-time selectable policies?

Not that much more work; it would need abstracting all the usage of the
list counters (nr_active/nr_inactive vs. nr_resident/nr_cold).

> For instance, moving "page reclaim scanner" specific information into
> its own container:
>
> @@ -140,12 +140,13 @@ struct zone {
> /* Fields commonly accessed by the page reclaim scanner */
> - spinlock_t lru_lock;
> - struct list_head active_list;
> - struct list_head inactive_list;
> - unsigned long nr_scan_active;
> - unsigned long nr_active;
> - unsigned long nr_inactive;
> + spinlock_t lru_lock;
> + struct list_head list_hand[2];
> + unsigned long nr_resident;
> + unsigned long nr_cold;
> + unsigned long nr_cold_target;
> + unsigned long nr_nonresident_scale;
> +
>
> Such as "struct reclaim_policy_data" or a better name.

Yes, I have toyed with that idea, rik didn't like it and I didn't spend
any effort on it, but it could very well be done.

> About CLOCK-Pro itself, I think that a small document with a short
> introduction would be very useful... explaining that it uses inter
> reference distance instead of recency for the page replacement criteria,
> and why this criteria is fundamentally more appropriate for a large set
> of common access patterns aka "a resume of the CLOCK-Pro paper".

Ok, I shall give this Documentation/vm/clockpro.txt thing a try.


> > Implementation-wise I use something based on Rik van Riel's nonresident code
> > which actually approximates a clock with reduced order.
>
> I'm curious about hash collisions, would like to know more details about
> the hash distribution under different loads.
>
> Would be nice to measure the rate of updates on each hash bucket and
> confirm that they are approximate.

I have/had a patch that prints stats on each bucket, I did some stats a
few months back and the deviation in bucket usage was not very high,
which would indicate a rather good distribution.

Could revive that patch so you can have a go at it if you wish.

> > The resident clock with two hands is implemented using two lists which are to
> > be seen as laid head to tail to form the clock. When one hand laps the other
> > the lists are swapped.
>
> How does that differ from the original CLOCK-Pro algorithm, and why, and what are
> the expected outcomes? Please make it easier for others to understand why the hands
> swap, and when, and why.

The original clockpro algorithm has one clock with 3 hands. In order to
make it work with multiple resident zones, the non-resident pages have
to be isolated.

I did that by having two clocks, one resident with two hands (per zone)
and one non-resident with one hand (global), where the non-resident
clock should be viewed as an overlay on the resident one (imagine the
single zone case).

This loses some page-order information, i.e. the exact position of the
non-resident pages w.r.t. the resident pages; however, it is a good
approximation when the rotation speeds of the respective hands are tied
together such that when the resident hot hand has made a full revolution,
so too has the non-resident hand.
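
As a rough sketch of that coupling (illustrative names only, mirroring the
arithmetic of __nonres_term() in the clockpro patch): for every dh pages a
zone's hot hand moves, the non-resident hand advances dh * |B| / |R| entries,
where |B| is the number of non-resident pages tracked and |R| the number of
resident (pageable) pages, with the fractional part carried over so the two
hands stay in step across a full revolution.

#include <stdint.h>

/*
 * scale carries the fractional remainder between calls; returns how many
 * whole entries the non-resident hand should rotate for this batch.
 */
static unsigned long nonres_steps(uint64_t *scale, unsigned long dh,
				  unsigned long nonres_pages,
				  unsigned long resident_pages)
{
	uint64_t cycles = *scale + (uint64_t)dh * nonres_pages;

	*scale = cycles % (resident_pages + 1);	/* keep the fraction */
	return cycles / (resident_pages + 1);	/* whole steps to rotate */
}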

>
> > Each page has 3 state bits:
> >
> > hot -> PageHot()
> > test -> PageTest()
> > ref -> page_referenced()
> >
> > (PG_active will be renamed to PG_hot in a following patch, since the semantics
> > changed also change the name in order to avoid confusion))
> >
> > The HandCold rotation is driven by page reclaim needs. HandCold in turn
> > drives HandHot, for every page HandCold promotes to hot HandHot needs to
> > degrade one hot page to cold.
>
> Why do you use only two clock hands and not three (HandHot, HandCold and HandTest)
> as in the original paper?

As explained above, the multi-zone thing requires the non-resident pages
to be separated.

> > + * res | h/c | tst | ref || Hcold | Hhot | Htst || Flt
> > + * ----+-----+-----+-----++-------+------+------++-----
> > + * 1 | 1 | 0 | 1 ||=1101 | 1100 |=1101 ||
> > + * 1 | 1 | 0 | 0 ||=1100 | 1000 |=1100 ||
> > + * ----+-----+-----+-----++-------+------+------++-----
> > + * 1 | 0 | 1 | 1 || 1100 | 1001 | 1001 ||
> > + * 1 | 0 | 1 | 0 ||X0010 | 1000 | 1000 ||
> > + * 1 | 0 | 0 | 1 || 1010 |=1001 |=1001 ||
> > + * 1 | 0 | 0 | 0 ||X0000 |=1000 |=1000 ||
> > + * ----+-----+-----+-----++-------+------+------++-----
> > + * ----+-----+-----+-----++-------+------+------++-----
> > + * 0 | 0 | 1 | 1 || | | || 1100
> > + * 0 | 0 | 1 | 0 ||=0010 |X0000 |X0000 ||
> > + * 0 | 0 | 0 | 1 || | | || 1010
>
> What does this mean? Can you make it easier for ignorant people like
> myself to understand?

state table, it describes how (in the original paper) the three hands
modify the page state. Given the state in the first four columns, the
next three columns give a new state for each hand; hand cold, hot and
test. The last column describes the action of a pagefault.

Ex. given a resident cold page in its test period that is referenced
(1011):
- Hand cold will make it 1100, that is, a resident hot page;
- Hand hot will make it 1001, that is, a resident cold page with a
reference; and
- Hand test will also make it 1001.

(The prefixes '=' and 'X' are used to indicate: not changed, and remove
from list - that can be either move from resident->non-resident or
remove altogether).

> > +/*
> > + * Rotate the non-resident hand; scale the rotation speed so that when all
> > + * hot hands
>
> all hot hands?

As explained, each zone has a hot and cold hand vs. the one non-resident
hand.

> > have made one full revolution the non-resident hand will have
> > + * too.
> > + *

--
Peter Zijlstra <[email protected]>

2005-12-31 10:57:40

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Sat, 2005-12-31 at 00:24 -0500, Rik van Riel wrote:
> > > > Why do you use only two clock hands and not three (HandHot, HandCold and
> > > > HandTest) as in the original paper?
> > >
> > > Because the non-resident pages cannot be in the clock.
> > > This is both because of space overhead, and because the
> > > non-resident list cannot be per zone.
> >
> > I see - that is a fundamental change from the original CLOCK-Pro
> > algorithm, right?
> >
> > Do you have a clear idea about the consequences of not having
> > non-resident pages in the clock?
>
> The consequence is that we could falsely consider a non-resident
> page to be active, or not to be active. However, this can only
> happen if we let the scan rate in each of the memory zones get
> way too much out of whack (which is bad regardless).

Yes, the uncertainty of position causes a time uncertainty wrt.
terminating the test period (heisenberg anyone?). So individual pages
can be terminated either too soon or too late; however, statistics make
it come out even in the end.

--
Peter Zijlstra <[email protected]>

2005-12-31 11:29:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

Forgot one in the previous mail.

On Fri, 2005-12-30 at 22:24 -0200, Marcelo Tosatti wrote:
> Please make it easier for others to understand why the hands
> swap, and when, and why.

It's not the hands that swap, it's the lists. The hands will lap each
other, like the minute hand laps the hour hand every ~65 minutes.

Let me try some ascii art.

====
^---<>---v
====

'=' a page
'^---<' hand cold
'>---v' hand hot

now let hand cold move 4 pages:


^---<>---v
========

ie. hand hot and hand cold have the same position.
now if we want to move hand cold one more position this happens:

=======
^---<>---v
=

see the swap?
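
For the record, a minimal kernel-style sketch of that swap (hypothetical
names, not the patch's actual swap_lists()): each hand scans the list in
front of it, and when that list runs empty the hand has caught up with the
other one, so the two lists simply exchange roles.

#include <linux/list.h>

enum { hand_hot = 0, hand_cold = 1 };

struct clock_zone {
	struct list_head list_hand[2];
};

/* exchange the contents of the two lists; the hands keep their meaning */
static void swap_hand_lists(struct clock_zone *zone)
{
	LIST_HEAD(tmp);

	list_splice_init(&zone->list_hand[hand_hot], &tmp);
	list_splice_init(&zone->list_hand[hand_cold],
			 &zone->list_hand[hand_hot]);
	list_splice(&tmp, &zone->list_hand[hand_cold]);
}

/* called before a hand starts scanning @list; a lap triggers the swap */
static void select_list_hand(struct clock_zone *zone, struct list_head *list)
{
	if (list_empty(list))
		swap_hand_lists(zone);
}
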
--
Peter Zijlstra <[email protected]>

2005-12-31 14:46:00

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH 01/14] page-replace-single-batch-insert.patch

On Sat, 31 Dec 2005, Marcelo Tosatti wrote:

> Unification of active and inactive per cpu page lists is a requirement
> for CLOCK-Pro, right?

You can approximate the functionality through use of scan
rates. Not quite as accurate as a unified clock, though.

--
All Rights Reversed

2005-12-31 14:53:37

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH 1/9] clockpro-nonresident.patch

On Sat, 31 Dec 2005, Peter Zijlstra wrote:

> > > +/*
> > > + * For interactive workloads, we remember about as many non-resident pages
> > > + * as we have actual memory pages. For server workloads with large inter-
> > > + * reference distances we could benefit from remembering more.
> > > + */
> >
> > This comment is bogus. Interactive or server loads have nothing to do
> > with the inter reference distance. To the contrary, interactive loads
> > have a higher chance to contain large inter reference distances, and
> > many common server loads have strong locality.
> >
> > <snip>
>
> Happy to drop it, Rik?

Sorry, but the comment is accurate.

For interactive workloads you want to forget interreference
distances between two updatedbs, even if mozilla didn't get
used all weekend.

OTOH, on NFS servers, or other systems with large interreference
distances, you may _need_ to remember a larger set of non-resident
pages in order to find the pages that are the hottest.

In those workloads, the shortest inter-reference distance might
still be larger than the size of memory...

--
All Rights Reversed

2005-12-31 18:59:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 10/9] clockpro-document.patch

By popular request,
I'll finish it some time next year.

Best wishes all.

--- /dev/null 2003-12-29 19:37:00.000000000 +0100
+++ linux-2.6-git/Documentation/vm/clockpro.txt 2005-12-31 19:55:45.000000000 +0100
@@ -0,0 +1,97 @@
+This document describes the page replacement algorithm as implemented in the
+Linux kernel. It is based on CLOCK-Pro, found here:
+ http://www.cs.wm.edu/hpcs/WWW/HTML/publications/abs05-3.html
+
+ Base Algorithm Summary
+
+The algorithm is based on reuse distance, as opposed to the recency familiar
+from LRU. The reuse distance is the number of pages referenced between the
+current and previous page reference.
+
+It categorizes pages with a small reuse distance as hot and those with a large
+reuse distance as cold. The threshold between hot and cold is the test period,
+that is, if a page is referenced during its test period its reuse distance is
+small, ie. it becomes hot. The test period is the largest reuse distance of a
+hot page, which in turn depends on the number of resident cold pages.
+
+The number of resident cold pages is an adaptive target which is incremented
+when a page is referenced in its test period and decremented when a test
+period expires.
+
+Reclaim looks for unreferenced cold pages, for cold pages that are still in
+their test period the metadata is kept until the test period expires.
+
+In order to be able to compare reuse distances all pages are kept on one CLOCK;
+however, the management of the page state requires more than one hand.
+CLOCK-Pro has three. The following table gives the actions of each hand:
+
+ res | hot | tst | ref || Hcold | Hhot | Htst || Flt
+ ----+-----+-----+-----++--------+--------+--------++------
+ 1 | 1 | 0 | 1 || = 1101 | 1100 | = 1101 ||
+ 1 | 1 | 0 | 0 || = 1100 | 1000 | = 1100 ||
+ ----+-----+-----+-----++--------+--------+--------++------
+ 1 | 0 | 1 | 1 || 1100 | 1001 | 1001 ||
+ 1 | 0 | 1 | 0 || N 0010 | 1000 | 1000 ||
+ 1 | 0 | 0 | 1 || 1010 | = 1001 | = 1001 ||
+ 1 | 0 | 0 | 0 || X 0000 | = 1000 | = 1000 ||
+ ====+=====+=====+=====++========+========+========++======
+ 0 | 0 | 1 | 1 || | | || 1100
+ 0 | 0 | 1 | 0 || = 0010 | X 0000 | X 0000 ||
+ 0 | 0 | 0 | 1 || | | || 1010
+
+Where the first four columns give the page state and the next three columns
+give the new state when the respective hand moves along. The prefixes '=', 'N'
+and 'X' are used to indicate: state unchanged, page tracked as non-resident
+and remove page. The last column gives the state on page fault.
+
+The hand dynamics are as follows: reclaim rotates the cold hand and takes
+unreferenced cold pages. When, during this rotation, the actual number of cold
+pages drops below the target number of cold pages, the hot hand is rotated.
+
+The hot hand demotes unreferenced hot pages to cold, and terminates the test
+period of pages it passes by. If, however, the total number of pages tracked
+rises above twice the total available resident pages the test hand is rotated.
+
+The test hand, like the hot hand, terminates the test period of any page it
+passes by. Remember that terminating the test period of a non-resident cold
+page removes it altogether, thus limiting the total pages tracked.
+
+
+ Implementation Notes
+
+Since pages reclaimed in one zone can end up being faulted back in another
+zone it is incorrect to have per-zone non-resident page tracking. Hence the
+resident and non-resident page tracking needs to be separated.
+
+To accomplish this, take two CLOCKs: a two-handed one for resident pages
+and a single-handed one for non-resident pages.
+The resident CLOCK's hands will reflect hand cold and the resident part of hand
+hot, the non-resident CLOCK's hand will reflect the non-resident part of hand
+hot. Hence the rotation speeds of the resident hand hot and non-resident hand
+are coupled so that when one has made a full revolution so will have the
+other.
+
+The functionality of hand test is accomplished by simply limiting the number
+of entries on the non-resident clock to the number of pages on the resident
+clock. When a new entry is added to an already full non-resident clock the
+oldest entry will be removed; i.e. its test period is terminated.
+
+This uncoupling of non-resident and resident pages has the effect that the
+exact position of the non-resident pages relative to the resident pages is
+lost. This uncertainty propagates to the duration of the test period, some
+pages will be terminated too soon, others too late. However, this is a bounded
+error with an average of 0.
+
+This scheme can then be extended to multiple zones by scaling the rotation
+speed coupling between the resident hot hand and the non-resident hand to the
+zone size. That is, when all resident hot hands have made one full revolution
+so too will the non-resident hand.
+
+Demand paging introduces yet another problem: when a page is faulted into
+memory it effectively doesn't matter what the referenced bit is set to, since
+the page will be used as soon as we finish the fault anyway. Hence the first
+check will always activate the page even though it has only had a single use;
+the classic use-once problem. To tackle this, pages can be inserted one state
+lower than normal and behind hand hot instead of behind hand cold, so that
+hand hot cannot interfere with the lowered page state and the first
+reference is lost more quickly.


2006-01-01 02:58:41

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Fri, Dec 30, 2005 at 11:43:34PM +0100, Peter Zijlstra wrote:
>
> From: Peter Zijlstra <[email protected]>

Peter,

I tried your "scan-shared.c" proggy which loops over 140M of a file
using mmap (on a 128MB box). The number of loops was configured to "5".

The amount of major/minor pagefaults was exactly the same between
vanilla and clockpro; isn't the clockpro algorithm supposed to be
superior to LRU in such "sequential scan of MEMSIZE+1" cases?

Oh well, to be honest, I still haven't understood what makes CLOCK-Pro
use inter-reference distance instead of recency, given that it's a simple
CLOCK using reference bits (but with three clocks instead of one).

But thats probably just my ignorance, need to study more.


2006-01-01 02:58:40

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 1/9] clockpro-nonresident.patch

On Sat, Dec 31, 2005 at 09:53:11AM -0500, Rik van Riel wrote:
> On Sat, 31 Dec 2005, Peter Zijlstra wrote:
>
> > > > +/*
> > > > + * For interactive workloads, we remember about as many non-resident pages
> > > > + * as we have actual memory pages. For server workloads with large inter-
> > > > + * reference distances we could benefit from remembering more.
> > > > + */
> > >
> > > This comment is bogus. Interactive or server loads have nothing to do
> > > with the inter reference distance. To the contrary, interactive loads
> > > have a higher chance to contain large inter reference distances, and
> > > many common server loads have strong locality.
> > >
> > > <snip>
> >
> > Happy to drop it, Rik?
>
> Sorry, but the comment is accurate.
>
> For interactive workloads you want to forget interreference
> distances between two updatedbs, even if mozilla didn't get
> used all weekend.
>
> OTOH, on NFS servers, or other systems with large interreference
> distances, you may _need_ to remember a larger set of non-resident
> pages in order to find the pages that are the hottest.
>
> In those workloads, the shortest inter-reference distance might
> still be larger than the size of memory...

Sure, for the few cases you describe here the comment is valid.

Happy new year!

2006-01-01 02:59:08

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 01/14] page-replace-single-batch-insert.patch

On Sat, Dec 31, 2005 at 09:44:07AM -0500, Rik van Riel wrote:
> On Sat, 31 Dec 2005, Marcelo Tosatti wrote:
>
> > Unification of active and inactive per cpu page lists is a requirement
> > for CLOCK-Pro, right?
>
> You can approximate the functionality through use of scan
> rates. Not quite as accurate as a unified clock, though.

Rik, I don't understand what you mean.

My point is that the page-replacement-policy abstraction patches affect
the original behaviour, and they shouldn't. See the post from Peter about
abstracting the per-cpu lists.

We're talking about different things.

2006-01-01 02:59:06

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Sat, Dec 31, 2005 at 11:48:37AM +0100, Peter Zijlstra wrote:
> On Fri, 2005-12-30 at 22:24 -0200, Marcelo Tosatti wrote:
> > Hi Peter,
> >
> > _Nice_ work!
>
> Thanks!
>
> > IMHO you're going into the right direction, abstracting away page
> > replacement policy from page reclaim.
> >
> > I think that final objective should be to abstract it away completely,
> > making it possible to select between different policies, allowing
> > further experimentation and implementations such as energy efficient
> > algorithms.
> >
> > How hard do you think would it be to enhance your patches to allow for
> > compile-time selectable policies?
>
> Not that much more work; it would need abstracting all the usage of the
> list counters (nr_active/nr_inactive vs. nr_resident/nr_cold).

That would be very interesting.

> > For instance, moving "page reclaim scanner" specific information into
> > its own container:
> >
> > @@ -140,12 +140,13 @@ struct zone {
> > /* Fields commonly accessed by the page reclaim scanner */
> > - spinlock_t lru_lock;
> > - struct list_head active_list;
> > - struct list_head inactive_list;
> > - unsigned long nr_scan_active;
> > - unsigned long nr_active;
> > - unsigned long nr_inactive;
> > + spinlock_t lru_lock;
> > + struct list_head list_hand[2];
> > + unsigned long nr_resident;
> > + unsigned long nr_cold;
> > + unsigned long nr_cold_target;
> > + unsigned long nr_nonresident_scale;
> > +
> >
> > Such as "struct reclaim_policy_data" or a better name.
>
> Yes, I have toyed with that idea, rik didn't like it and I didn't spend
> any effort on it, but it could very well be done.

I'll come up with a proposal for review on top of your work.

> > About CLOCK-Pro itself, I think that a small document with a short
> > introduction would be very useful... explaining that it uses inter
> > reference distance instead of recency for the page replacement criteria,
> > and why this criteria is fundamentally more appropriate for a large set
> > of common access patterns aka "a resume of the CLOCK-Pro paper".
>
> Ok, I shall give this Documentation/vm/clockpro.txt thing a try.
>
>
> > > Implementation-wise I use something based on Rik van Riel's nonresident code
> > > which actually approximates a clock with reduced order.
> >
> > I'm curious about hash collisions, would like to know more details about
> > the hash distribution under different loads.
> >
> > Would be nice to measure the rate of updates on each hash bucket and
> > confirm that they are approximate.
>
> I have/had a patch that prints stats on each bucket, I did some stats a
> few months back and the deviation in bucket usage was not very high,
> which would indicate a rather good distribution.
>
> Could revive that patch so you can have a go at it if you wish.

Please, will give it a try.

> > > The resident clock with two hands is implemented using two lists which are to
> > > be seen as laid head to tail to form the clock. When one hand laps the other
> > > the lists are swapped.
> >
> > How does that differ from the original CLOCK-Pro algorithm, and why, and what are
> > the expected outcomes? Please make it easier for others to understand why the hands
> > swap, and when, and why.
>
> The original clockpro algorithm has one clock with 3 hands. In order to
> make it work with multiple resident zones, the non-resident pages have
> to be isolated.
>
> I did that by having two clocks, one resident with two hands (per zone)
> and one non-resident with one hand (global), where the non-resident
> clock should be viewed as an overlay on the resident one (imagine the
> single zone case).
>
> This loses some page-order information, i.e. the exact position of the
> non-resident pages w.r.t. the resident pages; however, it is a good
> approximation when the rotation speeds of the respective hands are tied
> together such that when the resident hot hand has made a full revolution,
> so too has the non-resident hand.

> > > Each page has 3 state bits:
> > >
> > > hot -> PageHot()
> > > test -> PageTest()
> > > ref -> page_referenced()
> > >
> > > (PG_active will be renamed to PG_hot in a following patch, since the semantics
> > > changed also change the name in order to avoid confusion))
> > >
> > > The HandCold rotation is driven by page reclaim needs. HandCold in turn
> > > drives HandHot, for every page HandCold promotes to hot HandHot needs to
> > > degrade one hot page to cold.
> >
> > Why do you use only two clock hands and not three (HandHot, HandCold and HandTest)
> > as in the original paper?
>
> As explained above, the multi-zone thing requires the non-resident pages
> to be separated.
>
> > > + * res | h/c | tst | ref || Hcold | Hhot | Htst || Flt
> > > + * ----+-----+-----+-----++-------+------+------++-----
> > > + * 1 | 1 | 0 | 1 ||=1101 | 1100 |=1101 ||
> > > + * 1 | 1 | 0 | 0 ||=1100 | 1000 |=1100 ||
> > > + * ----+-----+-----+-----++-------+------+------++-----
> > > + * 1 | 0 | 1 | 1 || 1100 | 1001 | 1001 ||
> > > + * 1 | 0 | 1 | 0 ||X0010 | 1000 | 1000 ||
> > > + * 1 | 0 | 0 | 1 || 1010 |=1001 |=1001 ||
> > > + * 1 | 0 | 0 | 0 ||X0000 |=1000 |=1000 ||
> > > + * ----+-----+-----+-----++-------+------+------++-----
> > > + * ----+-----+-----+-----++-------+------+------++-----
> > > + * 0 | 0 | 1 | 1 || | | || 1100
> > > + * 0 | 0 | 1 | 0 ||=0010 |X0000 |X0000 ||
> > > + * 0 | 0 | 0 | 1 || | | || 1010
> >
> > What does this mean? Can you make it easier for ignorant people like
> > myself to understand?
>
> state table, it describes how (in the original paper) the three hands
> modify the page state. Given the state in the first four columns, the
> next three columns give a new state for each hand; hand cold, hot and
> test. The last column describes the action of a pagefault.
>
> Ex. given a resident cold page in its test period that is referenced
> (1011):
> - Hand cold will make it 1100, that is, a resident hot page;
> - Hand hot will make it 1001, that is, a resident cold page with a
> reference; and
> - Hand test will also make it 1001.
>
> (The prefixes '=' and 'X' are used to indicate: not changed, and remove
> from list - that can be either move from resident->non-resident or
> remove altogether).

I see - can you add this info to the patch?

2006-01-01 10:38:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Sat, 2005-12-31 at 20:40 -0200, Marcelo Tosatti wrote:
> On Fri, Dec 30, 2005 at 11:43:34PM +0100, Peter Zijlstra wrote:
> >
> > From: Peter Zijlstra <[email protected]>
>
> Peter,
>
> I tried your "scan-shared.c" proggy which loops over 140M of a file
> using mmap (on a 128MB box). The number of loops was configured to "5".
>
> The amount of major/minor pagefaults was exactly the same between
> vanilla and clockpro; isn't the clockpro algorithm supposed to be
> superior to LRU in such "sequential scan of MEMSIZE+1" cases?

yes it should, hmm, have to look at that then.

What should happen is that nr_cold_target should drop to the bare
minimum, which effectively pins all hot pages and only rotates the few
cold pages.
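
For reference, a minimal sketch of that adaptation (made-up names and assumed
clamp values, not the patch's cold-target helpers): the target grows when a
page is referenced within its test period and shrinks when a test period
expires, so in a pure cyclic scan it collapses to its lower bound and the hot
pages stay pinned.

struct cold_target {
	unsigned long nr_cold_target;	/* desired number of resident cold pages */
	unsigned long nr_resident;	/* resident pages in the zone */
	unsigned long min;		/* assumed lower bound, e.g. SWAP_CLUSTER_MAX */
};

/* a page was referenced during its test period: ask for more cold pages */
static void cold_target_inc(struct cold_target *ct, unsigned long nr)
{
	ct->nr_cold_target += nr;
	if (ct->nr_cold_target > ct->nr_resident)
		ct->nr_cold_target = ct->nr_resident;
}

/* a test period expired: ask for fewer cold pages, but keep a minimum */
static void cold_target_dec(struct cold_target *ct, unsigned long nr)
{
	if (ct->nr_cold_target > ct->min + nr)
		ct->nr_cold_target -= nr;
	else
		ct->nr_cold_target = ct->min;
}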

> Oh well, to be honest, I still haven't understood what makes CLOCK-Pro
> use inter-reference distance instead of recency, given that it's a simple
> CLOCK using reference bits (but with three clocks instead of one).
>
> But thats probably just my ignorance, need to study more.

The reuse distance is in PG_test. Please see the clockpro-documentation
patch, which should explain this. If its still not clear after that let
me know, I'll be more verbose then.

--
Peter Zijlstra <[email protected]>

2006-01-03 14:22:04

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Sun, Jan 01, 2006 at 11:37:34AM +0100, Peter Zijlstra wrote:
> On Sat, 2005-12-31 at 20:40 -0200, Marcelo Tosatti wrote:
> > On Fri, Dec 30, 2005 at 11:43:34PM +0100, Peter Zijlstra wrote:
> > >
> > > From: Peter Zijlstra <[email protected]>
> >
> > Peter,
> >
> > I tried your "scan-shared.c" proggy which loops over 140M of a file
> > using mmap (on a 128MB box). The number of loops was configured to "5".
> >
> > The amount of major/minor pagefaults was exactly the same between
> > vanilla and clockpro; isn't the clockpro algorithm supposed to be
> > superior to LRU in such "sequential scan of MEMSIZE+1" cases?
>
> yes it should, hmm, have to look at that then.
>
> What should happen is that nr_cold_target should drop to the bare
> minimum, which effectively pins all hot pages and only rotates the few
> cold pages.

I screwed up the tests. Here are the real numbers.

Test: scan 140MB file sequentially, 5 times.
Env: 128Mb machine

CLOCK-Pro: 0:49.98elapsed 18%CPU
7358maj+95308min

vanilla:
1:28.05elapsed 11%CPU
12950maj+166374min

Kicking some large arses!

2006-01-03 19:32:06

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Sat, 31 Dec 2005, Marcelo Tosatti wrote:

> > > > + * res | h/c | tst | ref || Hcold | Hhot | Htst || Flt
> > > > + * ----+-----+-----+-----++-------+------+------++-----
> > > > + * 1 | 1 | 0 | 1 ||=1101 | 1100 |=1101 ||
> > > > + * 1 | 1 | 0 | 0 ||=1100 | 1000 |=1100 ||
> > > > + * ----+-----+-----+-----++-------+------+------++-----
> > > > + * 1 | 0 | 1 | 1 || 1100 | 1001 | 1001 ||
> > > > + * 1 | 0 | 1 | 0 ||X0010 | 1000 | 1000 ||
> > > > + * 1 | 0 | 0 | 1 || 1010 |=1001 |=1001 ||
> > > > + * 1 | 0 | 0 | 0 ||X0000 |=1000 |=1000 ||
> > > > + * ----+-----+-----+-----++-------+------+------++-----
> > > > + * ----+-----+-----+-----++-------+------+------++-----
> > > > + * 0 | 0 | 1 | 1 || | | || 1100
> > > > + * 0 | 0 | 1 | 0 ||=0010 |X0000 |X0000 ||
> > > > + * 0 | 0 | 0 | 1 || | | || 1010
> > state table, it describes how (in the original paper) the three hands
> > modify the page state. Given the state in the first four columns, the
> > next three columns give a new state for each hand; hand cold, hot and
> > test. The last column describes the action of a pagefault.
> >
> > Ex. given a resident cold page in its test period that is referenced
> > (1011):
> > - Hand cold will make it 1100, that is, a resident hot page;
> > - Hand hot will make it 1001, that is, a resident cold page with a
> > reference; and
> > - Hand test will also make it 1001.
> >
> > (The prefixes '=' and 'X' are used to indicate: not changed, and remove
> > from list - that can be either move from resident->non-resident or
> > remove altogether).
>
> I see - can you add this info to the patch?

Hmm.. This looks as if it would be better to manage the page state as
a bitmap rather than individual bits. Could we put this state in
an integer variable and do an array lookup to get to the next state?
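
Something along these lines, perhaps (purely an illustration of the idea, not
the patch's code): the four state bits are kept in one integer and each hand's
action becomes a table lookup. The table below just transcribes the hand
columns of the state table from the documentation patch; the page-fault column
and all locking are left out.

#include <stdio.h>

#define S(res, hot, tst, ref)	(((res) << 3) | ((hot) << 2) | ((tst) << 1) | (ref))

#define F_NONRES	0x10	/* page becomes a non-resident entry */
#define F_REMOVE	0x20	/* page (or entry) is dropped altogether */

enum hand { HAND_COLD, HAND_HOT, HAND_TEST };

/* indexed by [hand][current state]; states not listed are not touched */
static const unsigned char next_state[3][16] = {
	[HAND_COLD] = {
		[S(1,1,0,1)] = S(1,1,0,1),	[S(1,1,0,0)] = S(1,1,0,0),
		[S(1,0,1,1)] = S(1,1,0,0),	[S(1,0,1,0)] = S(0,0,1,0) | F_NONRES,
		[S(1,0,0,1)] = S(1,0,1,0),	[S(1,0,0,0)] = S(0,0,0,0) | F_REMOVE,
		[S(0,0,1,0)] = S(0,0,1,0),
	},
	[HAND_HOT] = {
		[S(1,1,0,1)] = S(1,1,0,0),	[S(1,1,0,0)] = S(1,0,0,0),
		[S(1,0,1,1)] = S(1,0,0,1),	[S(1,0,1,0)] = S(1,0,0,0),
		[S(1,0,0,1)] = S(1,0,0,1),	[S(1,0,0,0)] = S(1,0,0,0),
		[S(0,0,1,0)] = S(0,0,0,0) | F_REMOVE,
	},
	[HAND_TEST] = {
		[S(1,1,0,1)] = S(1,1,0,1),	[S(1,1,0,0)] = S(1,1,0,0),
		[S(1,0,1,1)] = S(1,0,0,1),	[S(1,0,1,0)] = S(1,0,0,0),
		[S(1,0,0,1)] = S(1,0,0,1),	[S(1,0,0,0)] = S(1,0,0,0),
		[S(0,0,1,0)] = S(0,0,0,0) | F_REMOVE,
	},
};

int main(void)
{
	/* Peter's example: resident cold page in its test period, referenced */
	unsigned int s = next_state[HAND_COLD][S(1,0,1,1)];

	printf("hand cold turns 1011 into %u%u%u%u\n",
	       (s >> 3) & 1, (s >> 2) & 1, (s >> 1) & 1, s & 1);
	return 0;
}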

2006-01-05 09:47:25

by IWAMOTO Toshihiro

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

At Sat, 31 Dec 2005 12:29:06 +0100,
Peter Zijlstra wrote:
>
> Forgot one in the previous mail.
>
> On Fri, 2005-12-30 at 22:24 -0200, Marcelo Tosatti wrote:
> > Please make it easier for others to understand why the hands
> > swap, and when, and why.
>
> Its not the hands that swap, its the lists. The hands will lap each
> other, like the minute hand will lap the hour hand every ~65 minutes.
>
> Let me try some ascii art.
>
> ====
> ^---<>---v
> ====
>
> '=' a page
> '^---<' hand cold
> '>---v' hand hot
>
> now let hand cold move 4 pages:
>
>
> ^---<>---v
> ========
>
> ie. hand hot and hand cold have the same position.
> now if we want to move hand cold one more position this happens:
>
> =======
> ^---<>---v
> =
>
> see the swap?

Is it okay to allow Hcold to lap Hhot?
In my understanding of CLOCK-Pro, such lapping causes sudden increase
in the distance between Hhot and Hcold. As that distance is an
important parameter of page aging/replacement decisions, I'm afraid
that such lapping would result in incorrect page aging and bad
performance.

I guess the alternative is to advance Hhot together with Hcold, but
I'm not sure this is correct, either.

Please enlighten me.

--
IWAMOTO Toshihiro

2006-01-05 13:33:23

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

On Thu, 5 Jan 2006, IWAMOTO Toshihiro wrote:

> Is it okay to allow Hcold to lap Hhot?

I think it should be fine for Hcold to overtake Hhot, or
the other way around.

> In my understanding of CLOCK-Pro, such lapping causes sudden increase
> in the distance between Hhot and Hcold. As that distance is an
> important parameter of page aging/replacement decisions, I'm afraid
> that such lapping would result in incorrect page aging and bad
> performance.

Hcold only manipulates cold pages, Hhot only manipulates hot
pages and the test bit on cold pages. Having one hand overtake
the other should not disturb things at all, since they both do
something different.

--
All Rights Reversed

2006-01-06 09:01:36

by IWAMOTO Toshihiro

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

At Thu, 5 Jan 2006 08:32:19 -0500 (EST),
Rik van Riel wrote:
>
> On Thu, 5 Jan 2006, IWAMOTO Toshihiro wrote:

> > In my understanding of CLOCK-Pro, such lapping causes sudden increase
> > in the distance between Hhot and Hcold. As that distance is an
> > important parameter of page aging/replacement decisions, I'm afraid
> > that such lapping would result in incorrect page aging and bad
> > performance.
>
> Hcold only manipulates cold pages, Hhot only manipulates hot
> pages and the test bit on cold pages. Having one hand overtake
> the other should not disturb things at all, since they both do
> something different.

I don't think so. Hhot turns unreferenced hot pages into cold ones,
and those are freed if they aren't referenced before Hcold passes.
So, the distance between those hands is a sort of "expiry timer" of
such pages.
The distance also affects aging of newly inserted pages.

--
IWAMOTO Toshihiro

2006-01-24 06:30:13

by IWAMOTO Toshihiro

[permalink] [raw]
Subject: Re: [PATCH 6/9] clockpro-clockpro.patch

At Fri, 06 Jan 2006 18:01:35 +0900,
IWAMOTO Toshihiro wrote:
>
> At Thu, 5 Jan 2006 08:32:19 -0500 (EST),
> Rik van Riel wrote:
> >
> > On Thu, 5 Jan 2006, IWAMOTO Toshihiro wrote:
>
> > > In my understanding of CLOCK-Pro, such lapping causes sudden increase
> > > in the distance between Hhot and Hcold. As that distance is an
> > > important parameter of page aging/replacement decisions, I'm afraid
> > > that such lapping would result in incorrect page aging and bad
> > > performance.
> >
> > Hcold only manipulates cold pages, Hhot only manipulates hot
> > pages and the test bit on cold pages. Having one hand overtake
> > the other should not disturb things at all, since they both do
> > something different.
>
> I don't think so. Hhot turns unreferenced hot pages into cold ones,
> and those are freed if they aren't referenced before Hcold passes.
> So, the distance between those hands is a sort of "expiry timer" of
> such pages.
> The distance also affects aging of newly inserted pages.

I've added the following code to count lappings.
I also measured speeds of hands from pgrefill* and pgscan* in
/proc/vmstat.

While executing

$ while true; do cksum zero; done

, where zero is a 1100MB file and the amount of the system RAM is 1GB,
Hcold was almost twice as fast as Hhot in ZONE_DMA32 (the system was
x86_64), and Hcold was steadily overtaking Hhot.
(Interestingly, in ZONE_DMA, Hhot was faster than Hcold. I think this
can be ignored for now.)

I thought this situation meant that page access frequencies could not be
correctly compared, leading to suboptimal performance, but I couldn't
prove that. However, I've managed to create an example workload where
clockpro performs worse. I'm not sure if the example is related to
this hand problem. I'll describe it in the next mail.


diff -urp linux-2.6.15-rc5-clockpro-orig/mm/clockpro.c linux-2.6.15-rc5-clockpro-20051231/mm/clockpro.c
--- linux-2.6.15-rc5-clockpro-orig/mm/clockpro.c 2006-01-24 15:05:01.000000000 +0900
+++ linux-2.6.15-rc5-clockpro-20051231/mm/clockpro.c 2006-01-13 16:36:16.000000000 +0900
@@ -119,8 +119,13 @@ static void swap_lists(struct zone *zone
static inline
void __select_list_hand(struct zone *zone, struct list_head *list)
{
- if (list_empty(list))
+ if (list_empty(list)) {
swap_lists(zone);
+ if (list == &zone->list_hand[hand_hot])
+ zone->handswapcnt++;
+ else
+ zone->handswapcnt--;
+ }
}

/*
@@ -589,6 +594,7 @@ static int stats_show(struct seq_file *m
seq_printf(m, " zone->nr_cold: %lu\n", zone->nr_cold);
seq_printf(m, " zone->nr_cold_target: %lu\n", zone->nr_cold_target);
seq_printf(m, " zone->nr_nonresident_scale: %lu\n", zone->nr_nonresident_scale);
+ seq_printf(m, " zone->handswapcnt: %ld\n", zone->handswapcnt);
seq_printf(m, " zone->present_pages: %lu\n", zone->present_pages);
seq_printf(m, " zone->free_pages: %lu\n", zone->free_pages);
seq_printf(m, " zone->pages_min: %lu\n", zone->pages_min);
diff -urp linux-2.6.15-rc5-clockpro-orig/include/linux/mmzone.h linux-2.6.15-rc5-clockpro-20051231/include/linux/mmzone.h
--- linux-2.6.15-rc5-clockpro-orig/include/linux/mmzone.h 2006-01-24 15:05:01.000000000 +0900
+++ linux-2.6.15-rc5-clockpro-20051231/include/linux/mmzone.h 2006-01-13 16:33:32.000000000 +0900
@@ -146,6 +146,7 @@ struct zone {
unsigned long nr_cold;
unsigned long nr_cold_target;
unsigned long nr_nonresident_scale;
+ long handswapcnt;

unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
diff -urp linux-2.6.15-rc5-clockpro-orig/include/linux/page-flags.h linux-2.6.15-rc5-clockpro-20051231/include/linux/page-flags.h
--- linux-2.6.15-rc5-clockpro-orig/include/linux/page-flags.h 2006-01-24 15:05:01.000000000 +0900
+++ linux-2.6.15-rc5-clockpro-20051231/include/linux/page-flags.h 2006-01-16 18:17:19.000000000 +0900
@@ -155,7 +155,7 @@ extern void __mod_page_state(unsigned lo
#define mod_page_state_zone(zone, member, delta) \
do { \
unsigned offset; \
- if (is_highmem(zone)) \
+ if (is_highmem(zone) || zone == zone->zone_pgdat->node_zones + ZONE_DMA32) \
offset = offsetof(struct page_state, member##_high); \
else if (is_normal(zone)) \
offset = offsetof(struct page_state, member##_normal); \


--
IWAMOTO Toshihiro