Date: Tue, 18 Oct 2005 19:10:46 +0800
From: WU Fengguang
To: Andrew Morton
Cc: joern@wohnheim.fh-wedel.de, linux-kernel@vger.kernel.org, mingo@elte.hu
Subject: Re: [PATCH] Adaptive read-ahead v4
Message-ID: <20051018111046.GA4123@mail.ustc.edu.cn>
In-Reply-To: <20051017203137.05350739.akpm@osdl.org>
References: <20051015174731.GA5851@mail.ustc.edu.cn> <20051017173034.GA6558@wohnheim.fh-wedel.de> <20051018025806.GA3963@mail.ustc.edu.cn> <20051017203137.05350739.akpm@osdl.org>

On Mon, Oct 17, 2005 at 08:31:37PM -0700, Andrew Morton wrote:
> All that stuff shouldn't remain there long-term.
>
> > It would be better to create a stand-alone struct and a dedicated
> > entry in /proc/. The data would be a nice hint for administrators
> > who care about performance.
>
> You should treat such information as a development aid, really. People
> are very unlikely to look at it in real life.
>

ok. btw, I have a slightly newer version against the latest -mm tree.
It fixes a silly bug in save_chunk() and corrects a misbehavior in
try_read_backward(). More cleanups are on the way.

Regards,
Wu Fengguang

diff -rup linux-2.6.14-rc4-mm1-orig/drivers/block/loop.c linux-2.6.14-rc4-mm1/drivers/block/loop.c
--- linux-2.6.14-rc4-mm1-orig/drivers/block/loop.c	2005-08-29 07:41:01.000000000 +0800
+++ linux-2.6.14-rc4-mm1/drivers/block/loop.c	2005-10-17 11:42:30.000000000 +0800
@@ -768,6 +768,11 @@ static int loop_set_fd(struct loop_devic
 	mapping = file->f_mapping;
 	inode = mapping->host;

+	/*
+	 * The upper layer should already do proper read-ahead,
+	 * unlimited read-ahead here only ruins the cache hit rate.
+ */ + file->f_ra.ra_pages = 32 >> (PAGE_CACHE_SHIFT - 10); if (!(file->f_mode & FMODE_WRITE)) lo_flags |= LO_FLAGS_READ_ONLY; diff -rup linux-2.6.14-rc4-mm1-orig/include/linux/fs.h linux-2.6.14-rc4-mm1/include/linux/fs.h --- linux-2.6.14-rc4-mm1-orig/include/linux/fs.h 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/include/linux/fs.h 2005-10-17 11:42:30.000000000 +0800 @@ -562,17 +562,36 @@ struct file_ra_state { unsigned long start; /* Current window */ unsigned long size; unsigned long flags; /* ra flags RA_FLAG_xxx*/ - unsigned long cache_hit; /* cache hit count*/ + uint64_t cache_hit; /* cache hit count*/ unsigned long prev_page; /* Cache last read() position */ unsigned long ahead_start; /* Ahead window */ unsigned long ahead_size; unsigned long ra_pages; /* Maximum readahead window */ unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ + + unsigned long la_index; + unsigned long ra_index; + unsigned long lookahead_index; + unsigned long readahead_index; + unsigned long nr_page_aging; }; #define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */ #define RA_FLAG_INCACHE 0x02 /* file is already in cache */ +#define RA_CLASS_SHIFT 3 +#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1) +enum file_ra_class { /* the same order must be kept in page_state */ + RA_CLASS_NEWFILE = 1, + RA_CLASS_STATE, + RA_CLASS_CONTEXT, + RA_CLASS_CONTEXT_ACCELERATED, + RA_CLASS_BACKWARD, + /* RA_CLASS_AROUND, */ + RA_CLASS_RANDOM, + RA_CLASS_END, +}; + struct file { /* * fu_list becomes invalid after file_free is called and queued via diff -rup linux-2.6.14-rc4-mm1-orig/include/linux/mm.h linux-2.6.14-rc4-mm1/include/linux/mm.h --- linux-2.6.14-rc4-mm1-orig/include/linux/mm.h 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/include/linux/mm.h 2005-10-17 11:42:30.000000000 +0800 @@ -897,7 +897,7 @@ extern int filemap_populate(struct vm_ar int write_one_page(struct page *page, int wait); /* readahead.c */ -#define VM_MAX_READAHEAD 128 /* kbytes */ +#define VM_MAX_READAHEAD 1024 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ #define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before * turning readahead off */ @@ -914,6 +914,14 @@ unsigned long page_cache_readahead(stru void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra, pgoff_t offset); unsigned long max_sane_readahead(unsigned long nr); +unsigned long +page_cache_readahead_adaptive(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *prev_page, struct page *page, + unsigned long first_index, + unsigned long index, unsigned long last_index); +void fastcall ra_access(struct file_ra_state *ra, struct page *page); +int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list); /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff -rup linux-2.6.14-rc4-mm1-orig/include/linux/mm_inline.h linux-2.6.14-rc4-mm1/include/linux/mm_inline.h --- linux-2.6.14-rc4-mm1-orig/include/linux/mm_inline.h 2005-08-29 07:41:01.000000000 +0800 +++ linux-2.6.14-rc4-mm1/include/linux/mm_inline.h 2005-10-17 11:42:30.000000000 +0800 @@ -11,6 +11,7 @@ add_page_to_inactive_list(struct zone *z { list_add(&page->lru, &zone->inactive_list); zone->nr_inactive++; + zone->nr_page_aging++; } static inline void diff -rup linux-2.6.14-rc4-mm1-orig/include/linux/mmzone.h linux-2.6.14-rc4-mm1/include/linux/mmzone.h --- 
linux-2.6.14-rc4-mm1-orig/include/linux/mmzone.h 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/include/linux/mmzone.h 2005-10-17 11:42:30.000000000 +0800 @@ -160,6 +160,20 @@ struct zone { unsigned long pages_scanned; /* since last reclaim */ int all_unreclaimable; /* All pages pinned */ + /* Fields for balanced page aging: + * nr_page_aging - The accumulated number of activities that may + * cause page aging, that is, make some pages closer + * to the tail of inactive_list. + * aging_milestone - A snapshot of nr_page_aging every time a full + * inactive_list of pages become aged. + * page_age - A normalized value showing the percent of pages + * have been aged. It is compared between zones to + * balance the rate of page aging. + */ + unsigned long nr_page_aging; + unsigned long aging_milestone; + unsigned long page_age; + /* * Does the allocator try to reclaim pages from the zone as soon * as it fails a watermark_ok() in __alloc_pages? @@ -343,6 +357,43 @@ static inline void memory_present(int ni unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #endif +#ifdef CONFIG_HIGHMEM64G +#define PAGE_AGE_SHIFT 8 +#elif BITS_PER_LONG == 32 +#define PAGE_AGE_SHIFT 12 +#elif BITS_PER_LONG == 64 +#define PAGE_AGE_SHIFT 20 +#else +#error unknown BITS_PER_LONG +#endif +#define PAGE_AGE_MASK ((1 << PAGE_AGE_SHIFT) - 1) + +/* + * The percent of pages in inactive_list that have been scanned / aged. + * It's not really ##%, but a high resolution normalized value. + */ +static inline void update_page_age(struct zone *z) +{ + if (z->nr_page_aging - z->aging_milestone > z->nr_inactive) + z->aging_milestone += z->nr_inactive; + + z->page_age = ((z->nr_page_aging - z->aging_milestone) + << PAGE_AGE_SHIFT) / (1 + z->nr_inactive); +} + +/* + * The simplified code is: + * return (a->page_age > b->page_age); + * The complexity deals with the wrap-around problem. + * Two page ages not close enough should also be ignored: + * they are out of sync and the comparison may be nonsense. + */ +static inline int pages_more_aged(struct zone *a, struct zone *b) +{ + return ((b->page_age - a->page_age) & PAGE_AGE_MASK) > + PAGE_AGE_MASK - (1 << (PAGE_AGE_SHIFT - 2)); +} + /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ diff -rup linux-2.6.14-rc4-mm1-orig/include/linux/page-flags.h linux-2.6.14-rc4-mm1/include/linux/page-flags.h --- linux-2.6.14-rc4-mm1-orig/include/linux/page-flags.h 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/include/linux/page-flags.h 2005-10-17 11:42:30.000000000 +0800 @@ -76,6 +76,8 @@ #define PG_reclaim 17 /* To be reclaimed asap */ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_uncached 19 /* Page has been mapped as uncached */ +#define PG_activate 20 /* delayed activate */ +#define PG_readahead 21 /* check readahead when reading this page */ /* * Global page accounting. One instance per CPU. 
Only unsigned longs are @@ -132,6 +134,63 @@ struct page_state { unsigned long pgrotated; /* pages rotated to tail of the LRU */ unsigned long nr_bounce; /* pages for bounce buffers */ + + unsigned long cache_miss; /* read cache misses */ + unsigned long readrandom; /* random reads */ + unsigned long pgreadrandom; /* random read pages */ + unsigned long readahead_rescue; /* read-aheads rescued*/ + unsigned long pgreadahead_rescue; + unsigned long readahead_end; /* read-aheads passed EOF */ + + unsigned long readahead; /* read-aheads issued */ + unsigned long readahead_return; /* look-ahead marks returned */ + unsigned long readahead_eof; /* read-aheads stop at EOF */ + unsigned long pgreadahead; /* read-ahead pages issued */ + unsigned long pgreadahead_hit; /* read-ahead pages accessed */ + unsigned long pgreadahead_eof; + + unsigned long ra_newfile; /* read-ahead on start of file */ + unsigned long ra_newfile_return; + unsigned long ra_newfile_eof; + unsigned long pgra_newfile; + unsigned long pgra_newfile_hit; + unsigned long pgra_newfile_eof; + + unsigned long ra_state; /* state based read-ahead */ + unsigned long ra_state_return; + unsigned long ra_state_eof; + unsigned long pgra_state; + unsigned long pgra_state_hit; + unsigned long pgra_state_eof; + + unsigned long ra_context; /* context based read-ahead */ + unsigned long ra_context_return; + unsigned long ra_context_eof; + unsigned long pgra_context; + unsigned long pgra_context_hit; + unsigned long pgra_context_eof; + + unsigned long ra_contexta; /* accelerated context based read-ahead */ + unsigned long ra_contexta_return; + unsigned long ra_contexta_eof; + unsigned long pgra_contexta; + unsigned long pgra_contexta_hit; + unsigned long pgra_contexta_eof; + + unsigned long ra_backward; /* prefetch pages for backward reading */ + unsigned long ra_backward_return; + unsigned long ra_backward_eof; + unsigned long pgra_backward; + unsigned long pgra_backward_hit; + unsigned long pgra_backward_eof; + + unsigned long ra_random; /* read-ahead on seek-and-read-pages */ + unsigned long ra_random_return; + unsigned long ra_random_eof; + unsigned long pgra_random; + unsigned long pgra_random_hit; + unsigned long pgra_random_eof; + }; extern void get_page_state(struct page_state *ret); @@ -308,6 +367,18 @@ extern void __mod_page_state(unsigned lo #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define PageActivate(page) test_bit(PG_activate, &(page)->flags) +#define SetPageActivate(page) set_bit(PG_activate, &(page)->flags) +#define ClearPageActivate(page) clear_bit(PG_activate, &(page)->flags) +#define TestClearPageActivate(page) test_and_clear_bit(PG_activate, &(page)->flags) +#define TestSetPageActivate(page) test_and_set_bit(PG_activate, &(page)->flags) + +#define PageReadahead(page) test_bit(PG_readahead, &(page)->flags) +#define SetPageReadahead(page) set_bit(PG_readahead, &(page)->flags) +#define ClearPageReadahead(page) clear_bit(PG_readahead, &(page)->flags) +#define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags) +#define TestSetPageReadahead(page) test_and_set_bit(PG_readahead, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); diff -rup linux-2.6.14-rc4-mm1-orig/include/linux/radix-tree.h linux-2.6.14-rc4-mm1/include/linux/radix-tree.h --- linux-2.6.14-rc4-mm1-orig/include/linux/radix-tree.h 2005-10-17 11:40:45.000000000 +0800 +++ 
linux-2.6.14-rc4-mm1/include/linux/radix-tree.h 2005-10-17 19:16:31.000000000 +0800 @@ -22,12 +22,39 @@ #include #include +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT 6 +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif +#define RADIX_TREE_TAGS 2 + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int count; + void *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; +}; + struct radix_tree_root { unsigned int height; unsigned int gfp_mask; struct radix_tree_node *rnode; }; +/* + * Support access patterns with strong locality. + */ +struct radix_tree_cache { + unsigned long first_index; + struct radix_tree_node *tree_node; +}; + #define RADIX_TREE_INIT(mask) { \ .height = 0, \ .gfp_mask = (mask), \ @@ -45,8 +72,8 @@ do { \ } while (0) int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); -void *radix_tree_lookup(struct radix_tree_root *, unsigned long); -void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void *radix_tree_lookup_node(struct radix_tree_root *, unsigned long, + unsigned int); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, @@ -69,4 +96,86 @@ static inline void radix_tree_preload_en preempt_enable(); } +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. + */ +static inline void *radix_tree_lookup(struct radix_tree_root *root, + unsigned long index) +{ + return radix_tree_lookup_node(root, index, 0); +} + +/** + * radix_tree_lookup_slot - lookup a slot in a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the slot corresponding to the position @index in the radix tree + * @root. This is useful for update-if-exists operations. + */ +static inline void **radix_tree_lookup_slot(struct radix_tree_root *root, + unsigned long index) +{ + struct radix_tree_node *node; + + node = radix_tree_lookup_node(root, index, 1); + return node->slots + (index & RADIX_TREE_MAP_MASK); +} + +/** + * radix_tree_cache_lookup - perform cached lookup + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. + * @cache stores the last accessed bottom level tree node by this + * function, and is always checked first before searching in the tree. + * It can improve speed for access patterns with strong locality. 
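To show how the look-aside cache is expected to pay off, here is a small
user-space model (illustration only, not part of the patch; leaf_lookup(),
the array-backed "tree" and the NULL check standing in for the 0x77
sentinel are all made up for the sketch):

/* user-space illustration, not part of the patch */
#include <stddef.h>
#include <stdio.h>

#define MAP_SHIFT       6
#define MAP_SIZE        (1UL << MAP_SHIFT)
#define MAP_MASK        (MAP_SIZE - 1)
#define NR_LEAVES       16

/* a "leaf" stands in for a bottom level radix tree node */
struct leaf {
        void *slots[MAP_SIZE];
};

static struct leaf leaves[NR_LEAVES];   /* toy tree: just a flat array */
static unsigned long tree_walks;

/* stand-in for the full tree walk, i.e. radix_tree_lookup_node(root, index, 1) */
static struct leaf *leaf_lookup(unsigned long index)
{
        tree_walks++;
        if (index >= NR_LEAVES * MAP_SIZE)
                return NULL;
        return &leaves[index >> MAP_SHIFT];
}

struct lookaside {
        unsigned long first_index;      /* first index covered by the cached leaf */
        struct leaf *leaf;
};

static void *cached_lookup(struct lookaside *cache, unsigned long index)
{
        struct leaf *leaf;

        /* hit: the wanted slot lives in the leaf found by the last miss */
        if (cache->leaf && (index & ~MAP_MASK) == cache->first_index)
                return cache->leaf->slots[index & MAP_MASK];

        leaf = leaf_lookup(index);
        if (!leaf)
                return NULL;

        cache->leaf = leaf;
        cache->first_index = index & ~MAP_MASK;
        return leaf->slots[index & MAP_MASK];
}

int main(void)
{
        struct lookaside cache = { 0, NULL };
        unsigned long i;

        for (i = 0; i < NR_LEAVES * MAP_SIZE; i++)
                cached_lookup(&cache, i);

        /* sequential access: only one full walk per 64 lookups */
        printf("%lu lookups, %lu tree walks\n", NR_LEAVES * MAP_SIZE, tree_walks);
        return 0;
}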
+ */ +static inline void *radix_tree_cache_lookup(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index) +{ + struct radix_tree_node *node; + + if ((index & ~RADIX_TREE_MAP_MASK) == cache->first_index) + return cache->tree_node->slots[index & RADIX_TREE_MAP_MASK]; + + node = radix_tree_lookup_node(root, index, 1); + if (!node) + return 0; + + cache->tree_node = node; + cache->first_index = (index & ~RADIX_TREE_MAP_MASK); + return node->slots[index & RADIX_TREE_MAP_MASK]; +} + +static inline void radix_tree_cache_init(struct radix_tree_cache *cache) +{ + cache->first_index = 0x77; +} + +static inline int radix_tree_cache_size(struct radix_tree_cache *cache) +{ + return RADIX_TREE_MAP_SIZE; +} + +static inline int radix_tree_cache_count(struct radix_tree_cache *cache) +{ + if (cache->first_index != 0x77) + return cache->tree_node->count; + else + return 0; +} + +static inline int radix_tree_cache_first_index(struct radix_tree_cache *cache) +{ + return cache->first_index; +} + #endif /* _LINUX_RADIX_TREE_H */ diff -rup linux-2.6.14-rc4-mm1-orig/include/linux/sysctl.h linux-2.6.14-rc4-mm1/include/linux/sysctl.h --- linux-2.6.14-rc4-mm1-orig/include/linux/sysctl.h 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/include/linux/sysctl.h 2005-10-17 11:43:52.000000000 +0800 @@ -181,6 +181,7 @@ enum VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ VM_SWAP_PREFETCH=29, /* int: amount to swap prefetch */ + VM_READAHEAD_RATIO=30, /* percent of read-ahead size to thrashing-threshold */ }; diff -rup linux-2.6.14-rc4-mm1-orig/kernel/sysctl.c linux-2.6.14-rc4-mm1/kernel/sysctl.c --- linux-2.6.14-rc4-mm1-orig/kernel/sysctl.c 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/kernel/sysctl.c 2005-10-17 11:45:47.000000000 +0800 @@ -67,6 +67,7 @@ extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; +extern int readahead_ratio; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -861,6 +862,16 @@ static ctl_table vm_table[] = { }, #endif #endif + { + .ctl_name = VM_READAHEAD_RATIO, + .procname = "readahead_ratio", + .data = &readahead_ratio, + .maxlen = sizeof(readahead_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, { .ctl_name = 0 } }; diff -rup linux-2.6.14-rc4-mm1-orig/lib/radix-tree.c linux-2.6.14-rc4-mm1/lib/radix-tree.c --- linux-2.6.14-rc4-mm1-orig/lib/radix-tree.c 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/lib/radix-tree.c 2005-10-17 15:21:58.000000000 +0800 @@ -32,25 +32,6 @@ #include -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT 6 -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif -#define RADIX_TREE_TAGS 2 - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) - -struct radix_tree_node { - unsigned int count; - void *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; -}; - struct radix_tree_path { struct radix_tree_node *node; int offset; @@ -282,8 +263,21 @@ int radix_tree_insert(struct radix_tree_ } EXPORT_SYMBOL(radix_tree_insert); -static inline void **__lookup_slot(struct radix_tree_root *root, - unsigned long index) +/** + * 
radix_tree_lookup_node - low level lookup routine + * @root: radix tree root + * @index: index key + * @level: stop at that many levels from bottom + * + * Lookup the item at the position @index in the radix tree @root. + * The return value is: + * @level == 0: page at @index; + * @level == 1: the corresponding bottom level tree node; + * @level < height: (height - @level)th level tree node; + * @level >= height: root node. + */ +void *radix_tree_lookup_node(struct radix_tree_root *root, + unsigned long index, unsigned int level) { unsigned int height, shift; struct radix_tree_node **slot; @@ -295,7 +289,7 @@ static inline void **__lookup_slot(struc shift = (height-1) * RADIX_TREE_MAP_SHIFT; slot = &root->rnode; - while (height > 0) { + while (height > level) { if (*slot == NULL) return NULL; @@ -306,38 +300,9 @@ static inline void **__lookup_slot(struc height--; } - return (void **)slot; -} - -/** - * radix_tree_lookup_slot - lookup a slot in a radix tree - * @root: radix tree root - * @index: index key - * - * Lookup the slot corresponding to the position @index in the radix tree - * @root. This is useful for update-if-exists operations. - */ -void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) -{ - return __lookup_slot(root, index); -} -EXPORT_SYMBOL(radix_tree_lookup_slot); - -/** - * radix_tree_lookup - perform lookup operation on a radix tree - * @root: radix tree root - * @index: index key - * - * Lookup the item at the position @index in the radix tree @root. - */ -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) -{ - void **slot; - - slot = __lookup_slot(root, index); - return slot != NULL ? *slot : NULL; + return *slot; } -EXPORT_SYMBOL(radix_tree_lookup); +EXPORT_SYMBOL(radix_tree_lookup_node); /** * radix_tree_tag_set - set a tag on a radix tree node diff -rup linux-2.6.14-rc4-mm1-orig/mm/filemap.c linux-2.6.14-rc4-mm1/mm/filemap.c --- linux-2.6.14-rc4-mm1-orig/mm/filemap.c 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/mm/filemap.c 2005-10-17 11:42:30.000000000 +0800 @@ -724,6 +724,8 @@ grab_cache_page_nowait(struct address_sp EXPORT_SYMBOL(grab_cache_page_nowait); +extern int readahead_ratio; + /* * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level @@ -751,10 +753,12 @@ void do_generic_mapping_read(struct addr unsigned long prev_index; loff_t isize; struct page *cached_page; + struct page *prev_page; int error; struct file_ra_state ra = *_ra; cached_page = NULL; + prev_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; next_index = index; prev_index = ra.prev_page; @@ -783,16 +787,35 @@ void do_generic_mapping_read(struct addr nr = nr - offset; cond_resched(); - if (index == next_index) + + if (readahead_ratio <= 9 && index == next_index) next_index = page_cache_readahead(mapping, &ra, filp, index, last_index - index); find_page: page = find_get_page(mapping, index); + if (readahead_ratio > 9) { + if (unlikely(page == NULL)) { + page_cache_readahead_adaptive(mapping, &ra, + filp, prev_page, NULL, + *ppos >> PAGE_CACHE_SHIFT, + index, last_index); + page = find_get_page(mapping, index); + } else if (PageReadahead(page)) { + page_cache_readahead_adaptive(mapping, &ra, + filp, prev_page, page, + *ppos >> PAGE_CACHE_SHIFT, + index, last_index); + } + } if (unlikely(page == NULL)) { - handle_ra_miss(mapping, &ra, index); + if (readahead_ratio <= 9) + handle_ra_miss(mapping, &ra, index); goto no_cached_page; } + if (prev_page) + 
page_cache_release(prev_page); + prev_page = page; if (!PageUptodate(page)) goto page_not_up_to_date; page_ok: @@ -808,8 +831,10 @@ page_ok: * When (part of) the same page is read multiple times * in succession, only mark it as accessed the first time. */ - if (prev_index != index) + if (prev_index != index) { + ra_access(&ra, page); mark_page_accessed(page); + } prev_index = index; /* @@ -827,7 +852,6 @@ page_ok: index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; - page_cache_release(page); if (ret == nr && desc->count) continue; goto out; @@ -839,7 +863,6 @@ page_not_up_to_date: /* Did it get unhashed before we got the lock? */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); continue; } @@ -864,7 +887,6 @@ readpage: * invalidate_inode_pages got it */ unlock_page(page); - page_cache_release(page); goto find_page; } unlock_page(page); @@ -885,7 +907,6 @@ readpage: isize = i_size_read(inode); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; if (unlikely(!isize || index > end_index)) { - page_cache_release(page); goto out; } @@ -894,7 +915,6 @@ readpage: if (index == end_index) { nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (nr <= offset) { - page_cache_release(page); goto out; } } @@ -904,7 +924,6 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; - page_cache_release(page); goto out; no_cached_page: @@ -929,15 +948,22 @@ no_cached_page: } page = cached_page; cached_page = NULL; + if (prev_page) + page_cache_release(prev_page); + prev_page = page; goto readpage; } out: *_ra = ra; + if (readahead_ratio > 9) + _ra->prev_page = prev_index; *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) page_cache_release(cached_page); + if (prev_page) + page_cache_release(prev_page); if (filp) file_accessed(filp); } @@ -1235,19 +1261,33 @@ retry_all: * * For sequential accesses, we use the generic readahead logic. */ - if (VM_SequentialReadHint(area)) + if (readahead_ratio <= 9 && VM_SequentialReadHint(area)) page_cache_readahead(mapping, ra, file, pgoff, 1); + /* * Do we have something in the page cache already? */ retry_find: page = find_get_page(mapping, pgoff); + if (VM_SequentialReadHint(area) && readahead_ratio > 9) { + if (!page) { + page_cache_readahead_adaptive(mapping, ra, + file, NULL, NULL, + pgoff, pgoff, pgoff + 1); + page = find_get_page(mapping, pgoff); + } else if (PageReadahead(page)) { + page_cache_readahead_adaptive(mapping, ra, + file, NULL, page, + pgoff, pgoff, pgoff + 1); + } + } if (!page) { unsigned long ra_pages; if (VM_SequentialReadHint(area)) { - handle_ra_miss(mapping, ra, pgoff); + if (readahead_ratio <= 9) + handle_ra_miss(mapping, ra, pgoff); goto no_cached_page; } ra->mmap_miss++; @@ -1272,6 +1312,13 @@ retry_find: if (ra_pages) { pgoff_t start = 0; + /* + * Max read-around should be much smaller than + * max read-ahead. + * How about adding a tunable parameter for this? + */ + if (ra_pages > 64) + ra_pages = 64; if (pgoff > ra_pages / 2) start = pgoff - ra_pages / 2; do_page_cache_readahead(mapping, file, start, ra_pages); @@ -1295,9 +1342,12 @@ success: /* * Found the page and have a reference on it. 
*/ + ra_access(ra, page); mark_page_accessed(page); if (type) *type = majmin; + if (readahead_ratio > 9) + ra->prev_page = page->index; return page; outside_data_content: diff -rup linux-2.6.14-rc4-mm1-orig/mm/page_alloc.c linux-2.6.14-rc4-mm1/mm/page_alloc.c --- linux-2.6.14-rc4-mm1-orig/mm/page_alloc.c 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/mm/page_alloc.c 2005-10-17 11:50:52.000000000 +0800 @@ -138,7 +138,7 @@ static void bad_page(const char *functio 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | - 1 << PG_slab | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved ); @@ -488,6 +488,7 @@ static void prep_new_page(struct page *p page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_activate | 1 << PG_readahead | 1 << PG_checked | 1 << PG_mappedtodisk); page->private = 0; set_page_refs(page, order); @@ -870,9 +871,15 @@ __alloc_pages(gfp_t gfp_mask, unsigned i struct task_struct *p = current; int i; int classzone_idx; + int do_reclaim; int do_retry; int can_try_harder; int did_some_progress; + unsigned long zones_mask; + int left_count; + int batch_size; + int batch_base; + int batch_idx; might_sleep_if(wait); @@ -892,13 +899,62 @@ __alloc_pages(gfp_t gfp_mask, unsigned i classzone_idx = zone_idx(zones[0]); -restart: /* * Go through the zonelist once, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - int do_reclaim = should_reclaim_zone(z, gfp_mask); +restart: + /* + * To fulfill three goals: + * - balanced page aging + * - locality + * - predefined zonelist priority + * + * The logic employs the following rules: + * 1. Zones are checked in predefined order in general. + * 2. Skip to the next zone if it has lower page_age. + * 3. Checkings are carried out in batch, all zones in a batch must be + * checked before entering the next batch. + * 4. All local zones in the zonelist forms the first batch. + */ + + /* TODO: Avoid this loop by putting the values into struct zonelist. + * The (more general) desired batch counts can also go there. + */ + for (batch_size = 0, i = 0; (z = zones[i]) != NULL; i++) { + if (z->zone_pgdat == zones[0]->zone_pgdat) + batch_size++; + } + BUG_ON(!batch_size); + + left_count = i - batch_size; + batch_base = 0; + batch_idx = 0; + zones_mask = 0; + + for (;;) { + if (zones_mask == (1 << batch_size) - 1) { + if (left_count <= 0) { + break; + } + batch_base += batch_size; + batch_size = min(left_count, (int)sizeof(zones_mask) * 8); + left_count -= batch_size; + batch_idx = 0; + zones_mask = 0; + } + + do { + i = batch_idx; + do { + if (++batch_idx >= batch_size) + batch_idx = 0; + } while (zones_mask & (1 << batch_idx)); + } while (pages_more_aged(zones[batch_base + i], + zones[batch_base + batch_idx])); + + zones_mask |= (1 << i); + z = zones[batch_base + i]; if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) continue; @@ -908,11 +964,12 @@ restart: * will try to reclaim pages and check the watermark a second * time before giving up and falling back to the next zone. 
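The zone selection above compares zones by page_age. A user-space check of
that fixed-point arithmetic (illustration only, not part of the patch;
toy_zone and the sample sizes are made up) may make the normalization and
the wrap-around test easier to follow:

/* user-space illustration, not part of the patch */
#include <stdio.h>

#define PAGE_AGE_SHIFT  12                      /* the 32bit !HIGHMEM64G case */
#define PAGE_AGE_MASK   ((1 << PAGE_AGE_SHIFT) - 1)

struct toy_zone {
        unsigned long nr_inactive;
        unsigned long nr_page_aging;
        unsigned long aging_milestone;
        unsigned long page_age;
};

/* same formula as update_page_age() in the patch */
static void toy_update_page_age(struct toy_zone *z)
{
        if (z->nr_page_aging - z->aging_milestone > z->nr_inactive)
                z->aging_milestone += z->nr_inactive;

        z->page_age = ((z->nr_page_aging - z->aging_milestone)
                        << PAGE_AGE_SHIFT) / (1 + z->nr_inactive);
}

/* same comparison as pages_more_aged() in the patch */
static int toy_pages_more_aged(struct toy_zone *a, struct toy_zone *b)
{
        return ((b->page_age - a->page_age) & PAGE_AGE_MASK) >
                PAGE_AGE_MASK - (1 << (PAGE_AGE_SHIFT - 2));
}

int main(void)
{
        /* small zone, a quarter of its inactive list aged since the milestone */
        struct toy_zone a = { .nr_inactive = 1000, .nr_page_aging = 250 };
        /* large zone, aged faster relative to its size */
        struct toy_zone b = { .nr_inactive = 4000, .nr_page_aging = 1500 };

        toy_update_page_age(&a);
        toy_update_page_age(&b);

        /* page_age is a fixed-point fraction of the inactive list walked:
         * a: (250 << 12) / 1001 = 1022, b: (1500 << 12) / 4001 = 1535,
         * so b is reported as more aged and the allocator prefers a */
        printf("a.page_age=%lu b.page_age=%lu b_more_aged=%d\n",
               a.page_age, b.page_age, toy_pages_more_aged(&b, &a));
        return 0;
}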
*/ + do_reclaim = should_reclaim_zone(z, gfp_mask); zone_reclaim_retry: if (!zone_watermark_ok(z, order, z->pages_low, classzone_idx, 0, 0)) { if (!do_reclaim) - continue; + goto try_harder; else { zone_reclaim(z, gfp_mask, order); /* Only try reclaim once */ @@ -924,20 +981,18 @@ zone_reclaim_retry: page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; - } - for (i = 0; (z = zones[i]) != NULL; i++) +try_harder: wakeup_kswapd(z, order); - /* - * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks to go deeper into reserves - * - * This is the last chance, in general, before the goto nopage. - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. - */ - for (i = 0; (z = zones[i]) != NULL; i++) { + /* + * Put stress on the zone. Let __GFP_HIGH and allocations + * coming from realtime tasks to go deeper into reserves. + * + * This is the last chance, in general, before the goto nopage. + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ if (!zone_watermark_ok(z, order, z->pages_min, classzone_idx, can_try_harder, gfp_mask & __GFP_HIGH)) @@ -1448,6 +1503,8 @@ void show_free_areas(void) " active:%lukB" " inactive:%lukB" " present:%lukB" + " aging:%lukB" + " age:%lu" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", @@ -1459,6 +1516,8 @@ void show_free_areas(void) K(zone->nr_active), K(zone->nr_inactive), K(zone->present_pages), + K(zone->nr_page_aging), + zone->page_age, zone->pages_scanned, (zone->all_unreclaimable ? "yes" : "no") ); @@ -2075,6 +2134,9 @@ static void __init free_area_init_core(s zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; + zone->nr_page_aging = 0; + zone->aging_milestone = 0; + zone->page_age = 0; atomic_set(&zone->reclaim_in_progress, 0); if (!size) continue; @@ -2223,6 +2285,8 @@ static int zoneinfo_show(struct seq_file "\n high %lu" "\n active %lu" "\n inactive %lu" + "\n aging %lu" + "\n age %lu" "\n scanned %lu (a: %lu i: %lu)" "\n spanned %lu" "\n present %lu", @@ -2232,6 +2296,8 @@ static int zoneinfo_show(struct seq_file zone->pages_high, zone->nr_active, zone->nr_inactive, + zone->nr_page_aging, + zone->page_age, zone->pages_scanned, zone->nr_scan_active, zone->nr_scan_inactive, zone->spanned_pages, @@ -2353,6 +2419,63 @@ static char *vmstat_text[] = { "pgrotated", "nr_bounce", + + "cache_miss", + "readrandom", + "pgreadrandom", + "readahead_rescue", + "pgreadahead_rescue", + "readahead_end", + + "readahead", + "readahead_return", + "readahead_eof", + "pgreadahead", + "pgreadahead_hit", + "pgreadahead_eof", + + "ra_newfile", + "ra_newfile_return", + "ra_newfile_eof", + "pgra_newfile", + "pgra_newfile_hit", + "pgra_newfile_eof", + + "ra_state", + "ra_state_return", + "ra_state_eof", + "pgra_state", + "pgra_state_hit", + "pgra_state_eof", + + "ra_context", + "ra_context_return", + "ra_context_eof", + "pgra_context", + "pgra_context_hit", + "pgra_context_eof", + + "ra_contexta", + "ra_contexta_return", + "ra_contexta_eof", + "pgra_contexta", + "pgra_contexta_hit", + "pgra_contexta_eof", + + "ra_backward", + "ra_backward_return", + "ra_backward_eof", + "pgra_backward", + "pgra_backward_hit", + "pgra_backward_eof", + + "ra_random", + "ra_random_return", + "ra_random_eof", + "pgra_random", + "pgra_random_hit", + "pgra_random_eof", + }; static void *vmstat_start(struct seq_file *m, loff_t *pos) diff -rup 
linux-2.6.14-rc4-mm1-orig/mm/readahead.c linux-2.6.14-rc4-mm1/mm/readahead.c --- linux-2.6.14-rc4-mm1-orig/mm/readahead.c 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/mm/readahead.c 2005-10-17 22:14:24.000000000 +0800 @@ -15,13 +15,65 @@ #include #include +#define DEBUG_READAHEAD + +#ifdef DEBUG_READAHEAD +#define dprintk(args...) \ + if (readahead_ratio & 1) printk(KERN_DEBUG args) +#define ddprintk(args...) \ + if ((readahead_ratio & 3) == 3) printk(KERN_DEBUG args) + +#define ra_account_page(ra, member, delta) do { \ + unsigned long opg = offsetof(struct page_state, pgreadahead) - \ + offsetof(struct page_state, readahead); \ + unsigned long o1 = offsetof(struct page_state, member); \ + unsigned long o2 = o1 + 2 * opg * ((ra)->flags & RA_CLASS_MASK);\ + BUG_ON(opg + o2 >= sizeof(struct page_state)); \ + __mod_page_state(o1, 1UL); \ + __mod_page_state(o2, 1UL); \ + __mod_page_state(opg + o1, (delta)); \ + __mod_page_state(opg + o2, (delta)); \ +} while (0) + +#define ra_account(member, class, delta) do { \ + unsigned long opg = offsetof(struct page_state, pgreadahead) - \ + offsetof(struct page_state, readahead); \ + unsigned long o1 = offsetof(struct page_state, member); \ + unsigned long o2 = o1 + 2 * opg * (class); \ + if ((class) >= RA_CLASS_END) \ + break; \ + BUG_ON(o2 >= sizeof(struct page_state)); \ + __mod_page_state(o1, (delta)); \ + __mod_page_state(o2, (delta)); \ +} while (0) + +#else +#undef inc_page_state +#undef mod_page_state +#define inc_page_state(a) do {} while(0) +#define mod_page_state(a, b) do {} while(0) +#define dprintk(args...) do {} while(0) +#define ddprintk(args...) do {} while(0) +#define ra_account(member, class, delta) do {} while(0) +#define ra_account_page(member, class, delta) do {} while(0) +#endif + +/* Set look-ahead size to 1/8 of the read-ahead size. */ +#define LOOKAHEAD_RATIO 8 + +/* Set read-ahead size to ##% of the thrashing-threshold. */ +int readahead_ratio = 0; +EXPORT_SYMBOL(readahead_ratio); + void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { } EXPORT_SYMBOL(default_unplug_io_fn); +#define MAX_RA_PAGES (VM_MAX_READAHEAD >> (PAGE_CACHE_SHIFT - 10)) +#define MIN_RA_PAGES (VM_MIN_READAHEAD >> (PAGE_CACHE_SHIFT - 10)) struct backing_dev_info default_backing_dev_info = { - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .ra_pages = MAX_RA_PAGES, .state = 0, .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, @@ -50,7 +102,7 @@ static inline unsigned long get_max_read static inline unsigned long get_min_readahead(struct file_ra_state *ra) { - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; + return MIN_RA_PAGES; } static inline void ra_off(struct file_ra_state *ra) @@ -255,10 +307,11 @@ out: */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, - unsigned long offset, unsigned long nr_to_read) + unsigned long offset, unsigned long nr_to_read, + unsigned long lookahead_size) { struct inode *inode = mapping->host; - struct page *page; + struct page *page = NULL; unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); int page_idx; @@ -268,7 +321,7 @@ __do_page_cache_readahead(struct address if (isize == 0) goto out; - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); /* * Preallocate as many pages as we will need. 
@@ -291,6 +344,9 @@ __do_page_cache_readahead(struct address break; page->index = page_offset; list_add(&page->lru, &page_pool); + if (readahead_ratio > 9 && + page_idx == nr_to_read - lookahead_size) + SetPageReadahead(page); ret++; } read_unlock_irq(&mapping->tree_lock); @@ -327,7 +383,7 @@ int force_page_cache_readahead(struct ad if (this_chunk > nr_to_read) this_chunk = nr_to_read; err = __do_page_cache_readahead(mapping, filp, - offset, this_chunk); + offset, this_chunk, 0); if (err < 0) { ret = err; break; @@ -374,7 +430,7 @@ int do_page_cache_readahead(struct addre if (bdi_read_congested(mapping->backing_dev_info)) return -1; - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); } /* @@ -394,7 +450,10 @@ blockable_page_cache_readahead(struct ad if (!block && bdi_read_congested(mapping->backing_dev_info)) return 0; - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); + + dprintk("blockable-readahead(ino=%lu, ra=%lu+%lu) = %d\n", + mapping->host->i_ino, offset, nr_to_read, actual); return check_ra_success(ra, nr_to_read, actual); } @@ -559,3 +618,1334 @@ unsigned long max_sane_readahead(unsigne __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id())); return min(nr, (inactive + free) / 2); } + +/* + * Adaptive read-ahead. + * + * Good read patterns are compact both in space and time. The read-ahead logic + * tries to grant larger read-ahead size to better readers under the constraint + * of system memory and load pressures. + * + * It employs two methods to estimate the max thrashing safe read-ahead size: + * 1. state based - the default one + * 2. context based - the fail safe one + * The integration of the dual methods has the merit of being agile and robust. + * It makes the overall design clean: special cases are handled in general by + * the stateless method, leaving the stateful one simple and fast. + * + * To improve throughput and decrease read delay, the logic 'looks ahead'. + * In every read-ahead chunk, it selects one page and tag it with PG_readahead. + * Later when the page with PG_readahead is to be read, the logic knows that + * it's time to carry out the next read-ahead chunk in advance. + * + * a read-ahead chunk + * +-----------------------------------------+ + * | # PG_readahead | + * +-----------------------------------------+ + * ^ When this page is read, we submit I/O for the next read-ahead. + * + * + * Here are some variable names used frequently: + * + * |<------- la_size ------>| + * +-----------------------------------------+ + * | # | + * +-----------------------------------------+ + * ra_index -->|<---------------- ra_size -------------->| + * + */ + +#define next_page(pg) (list_entry((pg)->lru.prev, struct page, lru)) +#define prev_page(pg) (list_entry((pg)->lru.next, struct page, lru)) + +/* + * The nature of read-ahead allows most tests to fail or even be wrong. + * Here we just do not bother to call get_page(), it's meaningless anyway. + */ +struct page *find_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + + read_lock_irq(&mapping->tree_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + read_unlock_irq(&mapping->tree_lock); + return page; +} + +/* + * Move pages in danger (of thrashing) to the head of inactive_list. + * Not expected to happen frequently. 
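To make the look-ahead pipelining described above concrete, here is a toy
user-space model (illustration only, not part of the patch; CHUNK,
LOOKAHEAD and the flat mark array are made up for the sketch). Reading the
tagged page triggers the next chunk before the reader runs out of pages:

/* user-space illustration, not part of the patch */
#include <stdbool.h>
#include <stdio.h>

#define CHUNK           16              /* read-ahead chunk size, in pages */
#define LOOKAHEAD       (CHUNK / 8)     /* look-ahead size inside a chunk */

static bool lookahead_mark[1024];       /* models PG_readahead, one flag per page */

/* submit I/O for pages [start, start+size) and tag the look-ahead page */
static void submit_chunk(unsigned long start, unsigned long size)
{
        printf("readahead %lu..%lu\n", start, start + size - 1);
        lookahead_mark[start + size - LOOKAHEAD] = true;
}

int main(void)
{
        unsigned long next = 0;
        unsigned long index;

        submit_chunk(next, CHUNK);      /* first chunk, on a cache miss */
        next += CHUNK;

        for (index = 0; index < 64; index++) {
                /* the reader consumes pages one by one; hitting the tagged
                 * page starts the next chunk early enough to hide I/O delay */
                if (lookahead_mark[index]) {
                        lookahead_mark[index] = false;
                        submit_chunk(next, CHUNK);
                        next += CHUNK;
                }
        }
        return 0;
}

The real logic additionally scales the chunk size per stream; the sketch
only shows the tag-and-check triggering mechanism.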
+ */ +static int rescue_pages(struct page *page, unsigned long nr_pages) +{ + unsigned long pgrescue; + unsigned long index; + struct address_space *mapping; + struct zone *zone; + + BUG_ON(!nr_pages || !page); + pgrescue = 0; + index = page->index; + mapping = page_mapping(page); + + dprintk("rescue_pages(ino=%lu, index=%lu nr=%lu)\n", + mapping->host->i_ino, index, nr_pages); + + for(;;) { + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + + if (!PageLRU(page)) + goto out_unlock; + + while (page_mapping(page) == mapping && page->index == index) { + struct page *the_page = page; + page = next_page(page); + if (!PageActive(the_page) && + !PageActivate(the_page) && + !PageLocked(the_page) && + page_count(the_page) == 1) { + list_move(&the_page->lru, &zone->inactive_list); + zone->nr_page_aging++; + pgrescue++; + } + index++; + if (!--nr_pages) + goto out_unlock; + } + + spin_unlock_irq(&zone->lru_lock); + + page = find_page(mapping, index); + if (!page) + goto out; + } +out_unlock: + spin_unlock_irq(&zone->lru_lock); +out: + inc_page_state(readahead_rescue); + mod_page_state(pgreadahead_rescue, pgrescue); + + return nr_pages ? index : 0; +} + +/* + * State based calculation of read-ahead request. + * + * This figure shows the meaning of file_ra_state members: + * + * chunk A chunk B + * +---------------------------+-------------------------------------------+ + * | # | # | + * +---------------------------+-------------------------------------------+ + * ^ ^ ^ ^ + * la_index ra_index lookahead_index readahead_index + */ + +/* + * The global effective length of the inactive_list(s). + */ +static unsigned long nr_free_inactive(void) +{ + unsigned int i; + unsigned long sum = 0; + struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += zones[i].nr_inactive + + zones[i].free_pages - zones[i].pages_low; + + return sum; +} + +/* + * The accumulated count of pages pushed into inactive_list(s). + */ +static unsigned long nr_page_aging(void) +{ + unsigned int i; + unsigned long sum = 0; + struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += zones[i].nr_page_aging; + + return sum; +} + +/* + * Set class of read-ahead + */ +static inline void set_ra_class(struct file_ra_state *ra, + enum file_ra_class ra_class) +{ + ra->flags <<= RA_CLASS_SHIFT; + ra->flags += ra_class; +} + +/* + * The 64bit cache_hit stores three accumulated value and one counter value. + * MSB LSB + * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000 + */ +static inline int ra_cache_hit(struct file_ra_state *ra, int nr) +{ + return (ra->cache_hit >> (nr * 16)) & 0xFFFF; +} + +static inline void ra_addup_cache_hit(struct file_ra_state *ra) +{ + int n; + + n = ra_cache_hit(ra, 0); + ra->cache_hit -= n; + n <<= 16; + ra->cache_hit += n; +} + +/* + * The read-ahead is deemed success if cache-hit-rate > 50%. + */ +static inline int ra_cache_hit_ok(struct file_ra_state *ra) +{ + return ra_cache_hit(ra, 0) * 2 > (ra->lookahead_index - ra->la_index); +} + +/* + * Check if @index falls in the ra request. + */ +static inline int ra_has_index(struct file_ra_state *ra, unsigned long index) +{ + if (index < ra->la_index || index >= ra->readahead_index) + return 0; + + if (index >= ra->ra_index) + return 1; + else + return -1; +} + +/* + * Prepare file_ra_state for a new read-ahead sequence. 
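A quick worked example (not part of the patch) of the cache_hit packing
above and of the <<16 shift done by ra_state_init() below; the hit count
of 5 is arbitrary:

/* user-space illustration, not part of the patch */
#include <stdint.h>
#include <stdio.h>

/* same as ra_cache_hit(): extract 16bit field nr */
static unsigned int hit(uint64_t cache_hit, int nr)
{
        return (cache_hit >> (nr * 16)) & 0xFFFF;
}

int main(void)
{
        uint64_t cache_hit = 0;
        unsigned int n;

        cache_hit += 5;                 /* 5 pages accessed in the current chunk */

        /* ra_addup_cache_hit(): fold the live counter into field 1 */
        n = hit(cache_hit, 0);
        cache_hit -= n;
        cache_hit += (uint64_t)n << 16;

        /* the <<16 in ra_state_init(): push history back one more slot and
         * restart the live counter at zero for the new sequence */
        cache_hit <<= 16;

        printf("hit0=%u hit1=%u hit2=%u hit3=%u\n",
               hit(cache_hit, 0), hit(cache_hit, 1),
               hit(cache_hit, 2), hit(cache_hit, 3));   /* prints 0 0 5 0 */
        return 0;
}

So field 0 always holds the live counter, and older history ages toward
the MSB end.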
+ */ +static inline void ra_state_init(struct file_ra_state *ra, + unsigned long la_index, unsigned long ra_index) +{ + ra_addup_cache_hit(ra); + ra->cache_hit <<= 16; + ra->lookahead_index = la_index; + ra->readahead_index = ra_index; +} + +/* + * Take down a new read-ahead request in file_ra_state. + */ +static inline void ra_state_update(struct file_ra_state *ra, + unsigned long ra_size, unsigned long la_size) +{ + ra_addup_cache_hit(ra); + ra->ra_index = ra->readahead_index; + ra->la_index = ra->lookahead_index; + ra->readahead_index += ra_size; + ra->lookahead_index = ra->readahead_index - la_size; + ra->nr_page_aging = nr_page_aging(); +} + +/* + * Adjust the read-ahead request in file_ra_state. + */ +static inline void ra_state_adjust(struct file_ra_state *ra, + unsigned long ra_size, unsigned long la_size) +{ + ra->readahead_index = ra->ra_index + ra_size; + ra->lookahead_index = ra->readahead_index - la_size; +} + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static int ra_dispatch(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + unsigned long eof_index; + unsigned long ra_size; + unsigned long la_size; + int actual; + enum file_ra_class ra_class; + static char *ra_class_name[] = { + "newfile", + "state", + "context", + "contexta", + "backward", + /* "around", */ + "random", + }; + + ra_class = (ra->flags & RA_CLASS_MASK); + BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END); + + eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1; + ra_size = ra->readahead_index - ra->ra_index; + la_size = ra->readahead_index - ra->lookahead_index; + + /* Snap to EOF. */ + if (unlikely(ra->ra_index >= eof_index)) { + inc_page_state(readahead_end); + return 0; + } + if (ra->readahead_index + ra_size / 2 > eof_index) { + if (ra_class == RA_CLASS_CONTEXT_ACCELERATED && + eof_index > ra->lookahead_index + 1) + la_size = eof_index - ra->lookahead_index; + else + la_size = 0; + ra_size = eof_index - ra->ra_index; + ra_state_adjust(ra, ra_size, la_size); + } + + actual = __do_page_cache_readahead(mapping, filp, + ra->ra_index, ra_size, la_size); + + if (!la_size && ra->readahead_index == eof_index) + ra_account_page(ra, readahead_eof, actual); + ra_account_page(ra, readahead, actual); + + dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n", + ra_class_name[ra_class - 1], + mapping->host->i_ino, ra->la_index, + ra->ra_index, ra_size, la_size, actual); + + return actual; +} + +/* + * Determine the request parameters from primitive values. + * + * It applies the following rules: + * - Substract ra_size by the old look-ahead to get real safe read-ahead; + * - Set new la_size according to the (still large) ra_size; + * - Apply upper limits; + * - Make sure stream_shift is not too small. + * (So that the next global_shift will not be too small.) + * + * Input: + * ra_size stores the estimated thrashing-threshold. + * la_size stores the look-ahead size of previous request. + */ +static inline int adjust_rala(unsigned long ra_max, + unsigned long *ra_size, unsigned long *la_size) +{ + unsigned long stream_shift = *la_size; + + if (*ra_size > *la_size) + *ra_size -= *la_size; + else + return 0; + + *la_size = *ra_size / LOOKAHEAD_RATIO; + + if (*ra_size > ra_max) + *ra_size = ra_max; + if (*la_size > *ra_size) + *la_size = *ra_size; + + stream_shift += (*ra_size - *la_size); + if (stream_shift < *ra_size / 4) + *la_size -= (*ra_size / 4 - stream_shift); + + return 1; +} + +/* + * The function estimates two values: + * 1. 
thrashing-threshold for the current stream + * It is returned to make the next read-ahead request. + * 2. the remained space for the current chunk + * It will be checked to ensure that the current chunk is safe. + * + * The computation will be pretty accurate under heavy load, and will change + * vastly with light load(small global_shift), so the grow speed of ra_size + * must be limited, and a moderate large stream_shift must be insured. + * + * This figure illustrates the formula: + * While the stream reads stream_shift pages inside the chunks, + * the chunks are shifted global_shift pages inside inactive_list. + * + * chunk A chunk B + * |<=============== global_shift ================| + * +-------------+ +-------------------+ | + * | # | | # | inactive_list | + * +-------------+ +-------------------+ head | + * |---->| |---------->| + * | | + * +-- stream_shift --+ + */ +static inline unsigned long compute_thrashing_threshold( + struct file_ra_state *ra, + unsigned long *remain) +{ + unsigned long global_size; + unsigned long global_shift; + unsigned long stream_shift; + unsigned long ra_size; + + global_size = nr_free_inactive(); + global_shift = nr_page_aging() - ra->nr_page_aging; + stream_shift = ra_cache_hit(ra, 0); + + ra_size = stream_shift * + global_size * readahead_ratio / 100 / global_shift; + + if (global_size > global_shift) + *remain = stream_shift * + (global_size - global_shift) / global_shift; + else + *remain = 0; + + ddprintk("compute_thrashing_threshold: " + "ra=%lu=%lu*%lu/%lu, remain %lu for %lu\n", + ra_size, stream_shift, global_size, global_shift, + *remain, ra->readahead_index - ra->lookahead_index); + + return ra_size; +} + +/* + * Main function for file_ra_state based read-ahead. + */ +static inline unsigned long +state_based_readahead(struct address_space *mapping, struct file *filp, + struct file_ra_state *ra, struct page *page, + unsigned long ra_max) +{ + unsigned long ra_old; + unsigned long ra_size; + unsigned long la_size; + unsigned long remain_space; + + la_size = ra->readahead_index - ra->lookahead_index; + ra_old = ra->readahead_index - ra->ra_index; + ra_size = compute_thrashing_threshold(ra, &remain_space); + + if (readahead_ratio < 80 && + remain_space <= la_size && la_size > 1) { + rescue_pages(page, la_size); + return 0; + } + + if (!adjust_rala(min(ra_max, 2 * ra_old + (ra_max - ra_old) / 16), + &ra_size, &la_size)) + return 0; + + set_ra_class(ra, RA_CLASS_STATE); + ra_state_update(ra, ra_size, la_size); + + return ra_dispatch(ra, mapping, filp); +} + + +/* + * Page cache context based estimation of read-ahead/look-ahead size/index. + * + * The logic first looks backward in the inactive_list to get an estimation of + * the thrashing-threshold, and then, if necessary, looks forward to determine + * the start point of next read-ahead. + * + * The estimation theory can be illustrated with figure: + * + * chunk A chunk B chunk C head + * + * l01 l11 l12 l21 l22 + *| |-->|-->| |------>|-->| |------>| + *| +-------+ +-----------+ +-------------+ | + *| | # | | # | | # | | + *| +-------+ +-----------+ +-------------+ | + *| |<==============|<===========================|<============================| + * L0 L1 L2 + * + * Let f(l) = L be a map from + * l: the number of pages read by the stream + * to + * L: the number of pages pushed into inactive_list in the mean time + * then + * f(l01) <= L0 + * f(l11 + l12) = L1 + * f(l21 + l22) = L2 + * ... + * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...) 
+ * <= Length(inactive_list) = f(thrashing-threshold) + * + * So the count of countinuous history pages left in the inactive_list is always + * a lower estimation of the true thrashing-threshold. + */ + +#if PG_activate < PG_referenced +#error unexpected page flags order +#endif + +#define PAGE_REFCNT_1 (1 << PG_referenced) +#define PAGE_REFCNT_2 (1 << PG_activate) +#define PAGE_REFCNT_3 ((1 << PG_activate) | (1 << PG_referenced)) +#define PAGE_REFCNT_MASK PAGE_REFCNT_3 +/* + * STATUS REFERENCE COUNT TYPE + * __ - not in inactive list + * __ 0 fresh + * _R PAGE_REFCNT_1 stale + * A_ PAGE_REFCNT_2 disturbed once + * AR PAGE_REFCNT_3 disturbed twice + */ +static inline unsigned long __page_refcnt(struct page *page) +{ + return page->flags & PAGE_REFCNT_MASK; +} + +static inline unsigned long page_refcnt(struct page *page) +{ + if (!page || PageActive(page)) + return 0; + + return __page_refcnt(page); +} + +static inline char page_refcnt_symbol(struct page *page) +{ + if (!page) + return 'X'; + if (PageActive(page)) + return 'A'; + switch (__page_refcnt(page)) { + case 0: + return '_'; + case PAGE_REFCNT_1: + return '-'; + case PAGE_REFCNT_2: + return '='; + case PAGE_REFCNT_3: + return '#'; + } + return '?'; +} + +/* + * Look back and count history pages to estimate thrashing-threshold. + * + * Strategies + * - Sequential read that extends from index 0 + * The counted value may well be far under the true threshold, so return + * it unmodified for further process in adjust_rala_accelerated(). + * - Sequential read with a large history count + * Check 3 evenly spread pages to be sure there is no hole or many + * not-yet-accessed pages. This prevents unnecessary IO, and allows some + * almost sequential patterns to survive. + * - Return equal or smaller count; but ensure a reasonable minimal value. + * + * Optimization + * - The count will normally be min(nr_lookback, offset), unless either memory + * or read speed is low, or it is still in grow up phase. + * - A rigid implementation would be a simple loop to scan page by page + * backward, though this may be unnecessary and inefficient, so the + * stepping backward/forward scheme is used. 
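Stepping back to the state based path for a moment, here is a worked
instance (not part of the patch) of the compute_thrashing_threshold()
formula above, with made-up numbers:

/* user-space illustration, not part of the patch */
#include <stdio.h>

int main(void)
{
        unsigned long global_size  = 100000;    /* inactive + free pages */
        unsigned long global_shift = 2000;      /* pages aged since the last request */
        unsigned long stream_shift = 64;        /* pages this stream read meanwhile */
        unsigned long readahead_ratio = 50;     /* use 50% of the estimated threshold */
        unsigned long ra_size;
        unsigned long remain;

        /* same formulas as compute_thrashing_threshold() */
        ra_size = stream_shift * global_size * readahead_ratio / 100 / global_shift;
        remain = stream_shift * (global_size - global_shift) / global_shift;

        /* ra_size = 64 * 100000 * 50 / 100 / 2000 = 1600 pages (6.4MB of 4KB pages)
         * remain  = 64 * (100000 - 2000) / 2000  = 3136 pages */
        printf("ra_size=%lu remain=%lu\n", ra_size, remain);
        return 0;
}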
+ * + * FIXME: it seems ugly :( + */ +static int count_sequential_pages(struct address_space *mapping, + int refcnt, unsigned long *remain, + unsigned long offset, + unsigned long ra_min, unsigned long ra_max) +{ + int step; + int count; + unsigned long index; + unsigned long nr_lookback; + struct page *page; + struct radix_tree_cache cache; + + *remain = 0; + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * + 100 / (readahead_ratio + 1); + if (nr_lookback > offset) + nr_lookback = offset; + if (nr_lookback > mapping->nrpages) + nr_lookback = mapping->nrpages; + + if (nr_lookback <= ra_min * 100 / (readahead_ratio + 1)) { + *remain = nr_lookback; + return ra_min; + } + + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + + /* check the far end first */ + index = offset - nr_lookback; + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, index); + if (page_refcnt(page) >= refcnt) { + step = 1 + nr_lookback / 3; + if(nr_lookback > ra_min * 8) { + count = 1; + goto check_more; + } else { + *remain = nr_lookback; + goto out_unlock; + } + } + + /* scan backward for non-present page */ + count = 0; /* just to make gcc happy */ + for(step = ra_min; step < nr_lookback; step *= 4) { + index = offset - step; + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + index); + if (!page) + goto check_more; + } + index = offset - nr_lookback; + page = NULL; + + /* scan forward and check some more pages */ +check_more: + for(;;) { + if (page && !*remain) + *remain = offset - index; + if (page_refcnt(page) < refcnt) { + count = 0; + step = (offset - index + 3) / 4; + } else if (++count >= 3 || step < ra_min) + break; + index += step; + if (index >= offset) + break; + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + index); + } +out_unlock: + read_unlock_irq(&mapping->tree_lock); + + count = 3 * step; + if (count > nr_lookback) + return nr_lookback; + + if (!*remain) + *remain = count; + + count = count * readahead_ratio / 100; + if (count < get_min_readahead(NULL)) + count = get_min_readahead(NULL); + + return count; +} + +/* + * Scan forward in inactive_list for the first non-present page. + * It takes advantage of the adjacency of pages in inactive_list. + */ +static unsigned long lru_scan_forward(struct page *page, int nr_pages) +{ + unsigned long index = page->index; + struct address_space *mapping = page_mapping(page); + struct zone *zone; + + for(;;) { + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + + if (!PageLRU(page)) + goto out; + + do { + index++; + if (!--nr_pages) + goto out; + page = next_page(page); + } while (page_mapping(page) == mapping && page->index == index); + + spin_unlock_irq(&zone->lru_lock); + + page = find_page(mapping, index); + if (!page) + return index; + } +out: + spin_unlock_irq(&zone->lru_lock); + return nr_pages ? index : 0; +} + +/* Directly calling lru_scan_forward() would be slow. + * This function tries to avoid unnecessary scans for the most common cases: + * - Slow reads should scan forward directly; + * - Fast reads should step backward first; + * - Aggressive reads may well have max allowed look-ahead size. 
+ */ +static unsigned long first_absent_page(struct address_space *mapping, + struct page *page, unsigned long index, + unsigned long ra_size, unsigned long ra_max) +{ + struct radix_tree_cache cache; + + if (ra_size < ra_max) + goto scan_forward; + + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + + if (ra_size < LOOKAHEAD_RATIO * ra_max) + goto scan_backward; + + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + index + ra_max); + if (page) { + read_unlock_irq(&mapping->tree_lock); + return 0; + } + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + index + ra_max - 1); + if (page) { + read_unlock_irq(&mapping->tree_lock); + return index + ra_max; + } + +scan_backward: + if (ra_size == index) + ra_size /= 4; + else + ra_size /= (LOOKAHEAD_RATIO * 2); + for(;; ra_size /= 2) { + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + index + ra_size); + if (page) + break; + if (!ra_size) + return index + 1; + } + read_unlock_irq(&mapping->tree_lock); + ra_size = ra_max; + +scan_forward: + return lru_scan_forward(page, ra_size + 1); +} + +/* + * Determine the request parameters for context based read-ahead that extends + * from start of file. + * + * The major weakness of stateless method is perhaps the slow grow up speed of + * ra_size. The logic tries to make up for this in the important case of + * sequential reads that extend from start of file. In this case, the ra_size + * is not choosed to make the whole next chunk safe(as in normal ones). Only + * half of which is safe. The added 'unsafe' half is the look-ahead part. It + * is expected to be safeguarded by rescue_pages() when the previous chunks are + * lost. + */ +static inline int adjust_rala_accelerated(unsigned long ra_max, + unsigned long *ra_size, unsigned long *la_size) +{ + if (*ra_size <= *la_size) + return 0; + + *la_size = (*ra_size - *la_size) * readahead_ratio / 100; + *ra_size = *la_size * 2; + + if (*ra_size > ra_max) + *ra_size = ra_max; + if (*la_size > *ra_size) + *la_size = *ra_size; + + return 1; +} + +/* + * Main function for page context based read-ahead. + */ +static inline int +try_context_based_readahead(struct address_space *mapping, + struct file_ra_state *ra, + struct page *prev_page, struct page *page, + unsigned long index, + unsigned long ra_min, unsigned long ra_max) +{ + unsigned long ra_index; + unsigned long ra_size; + unsigned long la_size; + unsigned long remain_pages; + unsigned long ret; + int refcnt; + + /* NFSv3 daemons may process adjecent requests in parallel, + * leading to many locally disordered, globally sequential reads. + * So do not require nearby history pages to be accessed, present is + * enough. + */ + if (!prev_page) + return 0; + + refcnt = page_refcnt(prev_page); + if (refcnt < PAGE_REFCNT_1) + refcnt = PAGE_REFCNT_1; + + ra_size = count_sequential_pages(mapping, refcnt, + &remain_pages, index, ra_min, ra_max); + + /* Where to start read-ahead? 
*/ + if (!page) + ra_index = index; + else { + ra_index = first_absent_page( + mapping, page, index, ra_size, ra_max); + if (unlikely(!ra_index)) + return -1; + } + + la_size = ra_index - index; + if (readahead_ratio < 80 && + remain_pages <= la_size && la_size > 1) { + rescue_pages(page, la_size); + return -1; + } + + if (ra_size == index) { + ret = adjust_rala_accelerated(ra_max, &ra_size, &la_size); + set_ra_class(ra, RA_CLASS_CONTEXT_ACCELERATED); + } else { + ret = adjust_rala(ra_max, &ra_size, &la_size); + set_ra_class(ra, RA_CLASS_CONTEXT); + } + if (unlikely(!ret)) + return -1; + + ra_state_init(ra, index, ra_index); + ra_state_update(ra, ra_size, la_size); + + return 1; +} + +/* + * Read-ahead on start of file. + * + * It is most important for small files. + * 1. Set a moderately large read-ahead size; + * 2. Issue the next read-ahead request as soon as possible. + * + * But be careful: there are some applications that dip into only the very head + * of a file. The most important thing is to prevent them from triggering the + * next (much larger) read-ahead request, which leads to lots of cache misses. + * Two pages should be enough for them; correct me if I'm wrong. + */ +static inline unsigned long +newfile_readahead(struct address_space *mapping, + struct file *filp, struct file_ra_state *ra, + unsigned long req_size, unsigned long ra_min) +{ + unsigned long ra_size; + unsigned long la_size; + + if (req_size > ra_min) + req_size = ra_min; + + ra_size = 4 * req_size; + la_size = 2 * req_size; + + set_ra_class(ra, RA_CLASS_NEWFILE); + ra_state_init(ra, 0, 0); + ra_state_update(ra, ra_size, la_size); + + return ra_dispatch(ra, mapping, filp); +} + +/* + * Backward prefetching. + * No look-ahead or thrashing-threshold estimation for the backward-stepping + * pattern: they should be unnecessary. + */ +static inline int +try_read_backward(struct file_ra_state *ra, + unsigned long first_index, unsigned long last_index, + unsigned long ra_size, unsigned long ra_max) +{ + if (ra_size > ra_max || last_index > ra->prev_page) + return 0; + + if (ra_has_index(ra, ra->prev_page)) { + if (last_index > ra->la_index) + return 0; + ra_size += 2 * ra_cache_hit(ra, 0); + last_index = ra->la_index; + } else { + ra_size = 8 * ra_size; + last_index = ra->prev_page; + } + + if (ra_size > ra_max) + ra_size = ra_max; + + if (last_index > first_index + ra_size) + return 0; + + first_index = last_index - ra_size; + + set_ra_class(ra, RA_CLASS_BACKWARD); + ra_state_init(ra, first_index, first_index); + ra_state_update(ra, ra_size, 0); + + return 1; +} + +/* + * If there was a previous sequential read, the read at the new position is + * likely to be sequential as well. + * Databases are known to have the seek-and-read-one-record pattern.
+ */ +static inline int +try_random_readahead(struct file_ra_state *ra, unsigned long index, + unsigned long ra_size, unsigned long ra_max) +{ + unsigned long hit0 = ra_cache_hit(ra, 0); + unsigned long hit1 = ra_cache_hit(ra, 1) + hit0; + unsigned long hit2 = ra_cache_hit(ra, 2); + unsigned long hit3 = ra_cache_hit(ra, 3); + + if (!ra_has_index(ra, ra->prev_page)) + return 0; + + if (index == ra->prev_page + 1) /* read after thrashing */ + ra_size = hit0; + else if (ra_size < hit1 && /* read after seeking */ + hit1 > hit2 / 2 && + hit2 > hit3 / 2 && + hit3 > hit1 / 2) + ra_size = min(ra_max, hit1); + else + return 0; + + set_ra_class(ra, RA_CLASS_RANDOM); + ra_state_init(ra, index, index); + ra_state_update(ra, ra_size, 0); + + return 1; +} + +/* + * ra_size is mainly determined by: + * 1. sequential-start: min(KB(16 + mem_mb/16), KB(64)) + * 2. sequential-max: min(KB(64 + mem_mb*64), KB(2048)) + * 3. sequential: (thrashing-threshold) * readahead_ratio / 100 + * + * Table of concrete numbers for 4KB page size: + * (inactive + free) (in MB): 4 8 16 32 64 128 256 512 1024 + * initial ra_size (in KB): 16 16 16 16 20 24 32 48 64 + * max ra_size (in KB): 320 576 1088 2048 2048 2048 2048 2048 2048 + */ +static inline void get_readahead_bounds(struct file_ra_state *ra, + unsigned long *ra_min, + unsigned long *ra_max) +{ + unsigned long mem_mb; + +#define KB(size) (((size) * 1024) / PAGE_CACHE_SIZE) + mem_mb = nr_free_inactive() * PAGE_CACHE_SIZE / 1024 / 1024; + *ra_max = min(min(KB(64 + mem_mb*64), KB(2048)), ra->ra_pages); + *ra_min = min(min(KB(VM_MIN_READAHEAD + mem_mb/16), KB(128)), *ra_max/2); +#undef KB +} + +/* + * This is the entry point of the adaptive read-ahead logic. + * + * It is only called on two conditions: + * 1. page == NULL + * A cache miss happened, it can be either a random read or a sequential one. + * 2. page != NULL + * There is a look-ahead mark(PG_readahead) from a previous sequential read. + * It's time to do some checking and submit the next read-ahead IO. + * + * That makes both methods happy, and lives in harmony with application managed + * read-aheads via fadvise() / madvise(). The cache hit problem is also + * eliminated naturally. + */ +unsigned long +page_cache_readahead_adaptive(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *prev_page, struct page *page, + unsigned long first_index, + unsigned long index, unsigned long last_index) +{ + unsigned long size; + unsigned long ra_min; + unsigned long ra_max; + int ret; + + if (page) { + if(!TestClearPageReadahead(page)) + return 0; + if (bdi_read_congested(mapping->backing_dev_info)) + return 0; + } + + if (page) + ra_account(readahead_return, ra->flags & RA_CLASS_MASK, 1); + else if (index) + inc_page_state(cache_miss); + + size = last_index - index; + get_readahead_bounds(ra, &ra_min, &ra_max); + + /* readahead disabled? */ + if (unlikely(!ra_min || !readahead_ratio)) { + size = max_sane_readahead(size); + goto readit; + } + + /* + * Start of file. + */ + if (index == 0) + return newfile_readahead(mapping, filp, ra, last_index, ra_min); + + /* + * State based sequential read-ahead. + */ + if ((readahead_ratio % 5) == 0 && + index == ra->lookahead_index && + (page || index == ra->readahead_index) && + (ra_cache_hit_ok(ra) || + last_index - first_index >= ra_max)) + return state_based_readahead(mapping, filp, ra, page, ra_max); + + /* + * Backward read-ahead. 
+ */ + if (try_read_backward(ra, first_index, last_index, size, ra_max)) + return ra_dispatch(ra, mapping, filp); + + /* + * Context based sequential read-ahead. + */ + if (!prev_page) + prev_page = find_page(mapping, index - 1); + ret = try_context_based_readahead(mapping, ra, prev_page, page, + index, ra_min, ra_max); + if (ret > 0) + return ra_dispatch(ra, mapping, filp); + if (ret < 0) + return 0; + + /* No action at look-ahead time? */ + if (page) + return 0; + + /* + * Random read that follows a sequential one. + */ + if (try_random_readahead(ra, index, size, ra_max)) + return ra_dispatch(ra, mapping, filp); + + /* + * Random read. + */ + if (size > ra_max) + size = ra_max; + +readit: + size = __do_page_cache_readahead(mapping, filp, index, size, 0); + + inc_page_state(readrandom); + mod_page_state(pgreadrandom, size); + + dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n", + mapping->host->i_ino, mapping->nrpages, + first_index, index, last_index, size); + + return size; +} + +/* + * Call me! + */ +void fastcall ra_access(struct file_ra_state *ra, struct page *page) +{ + if (page->flags & ((1 << PG_active) | + (1 << PG_activate) | + (1 << PG_referenced))) + return; + + if (!ra_has_index(ra, page->index)) + return; + + ra->cache_hit++; + + if (page->index >= ra->ra_index) + ra_account(pgreadahead_hit, ra->flags & RA_CLASS_MASK, 1); + else + ra_account(pgreadahead_hit, + (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK, 1); +} + +/* + * Detect and protect sequential read-ahead pages. + * + * The safety guarantee provided by this function is only needed in file servers + * with a big readahead_ratio set. + * + * This function is called when pages in @page_list are to be freed; + * it protects ra pages by moving them into @save_list. + * + * The goal is to save all and only the sequential pages that are to be + * accessed in the near future. Currently, the algorithm runs under suboptimal + * conditions, though the test results are acceptable. To make it run smoothly + * and fast, ra request boundaries must be preserved: + * - alloc pages of a chunk from one single zone + * - insert pages into lru at one time + * - make vmscan code aware of chunk boundaries + * + * The general idea is to classify pages of a file into random pages and groups + * of sequentially accessed pages. Random pages and leading segments of + * sequential pages are left over; the following sequential pages are saved. + * + * The rules employed must ensure: + * - no page is pinned in inactive_list. + * - no excessive pages are saved. + * + * rs/ra - read sequential/ahead + * chunk - a list of pages belonging to the same file + * rs/ra pages - a chunk of pages that were read/are to be read sequentially + * Detected by ascending index and (almost) non-descending + * reference count. rs pages have a greater reference count than + * the following ra pages. A page can be both an rs/ra page, which + * indicates there are two adjacent readers. + * live ra pages - ra pages that have reading in progress + * Detected by having leading rs pages (either in page_list or in + * inactive_list), or limited ra pages (may be in another zone, + * having just had their rs pages dropped). + * dead ra pages - ra pages that seem to have no imminent reader + * Note that they are not necessarily dead: either the cost of + * searching for the leading rs pages or the cost of keeping them + * in memory is large, so they are abandoned. + * Leading rs pages are detected and handled the same way.
+ * + * Live ra pages are saved, pure/leading rs pages and dead ra pages are left + * over and eligible for free. + * + * Read backward pattern support is possible, in which case the pages are + * better pushed into lru in reverse order. + * + * The rules apply to the following common cases: + * keep head back search chunk case + * Y ----____________|______________________ Normal + * ----------------|----__________________ Normal + * |----__________________ Normal + * ----------------|---------------------- Normal + * |---------------------- Normal + * y ________________|______________________ cache miss + * |______________________ cache miss + * y ________________|_______--------_______ two readers + * Y ----____________|_______--------_______ two readers + * |_______--------_______ two readers + * |----_____------_______ two readers + * ----------------|----_____------_______ two readers + * _______---------|---------------_______ two readers + * Y ----___---------|---------------_______ two readers + * ________________|---------------_______ two readers + * Y ----____________|---------------_______ two readers + * Y ====------------|----__________________ two readers + * N |====-----------_______ two readers + * N |###======------------- three readers + * Y: saved by leading rs pages + * y: saved by limited leading ra pages + * N: to be activated anyway + */ +static int save_chunk(struct page *head, struct page *live_head, + struct page *tail, struct list_head *save_list) +{ + struct page *page; + struct address_space *mapping; + struct radix_tree_cache cache; + int i; + unsigned long index; + unsigned long refcnt; + +#ifdef DEBUG_READAHEAD + static char static_buf[PAGE_SIZE]; + static char *zone_names[1 << ZONES_SHIFT] = { + "DMA", "DMA32", "Normal", "HighMem", "Z5", "Z6", "Z7" }; + char *pat = static_buf; + unsigned long pidx = PAGE_SIZE / 2; + + if ((readahead_ratio & 3) == 3) { + pat = (char *)get_zeroed_page(GFP_KERNEL); + if (!pat) + pat = static_buf; + } +#endif + + index = head->index; + refcnt = __page_refcnt(head); + mapping = head->mapping; + + /* The leading pages are going to be activated anyway? */ + if (refcnt > PAGE_REFCNT_1 || (!refcnt && mapping_mapped(mapping))) + goto skip_scan; + + /* Scan backward to see if leading pages should be saved. */ + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (i = 2 * MAX_RA_PAGES; i >= 0; i--) { + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + --index); +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3 && pidx) + pat[--pidx] = page_refcnt_symbol(page); +#endif + + /* Having limited leading ra pages is required now. It will + * be less important if ra request boundaries are preserved. + */ + if (!page) { + if (i > MAX_RA_PAGES && + index != head->index - 1 && + !__page_refcnt(head)) + live_head = head; + break; + } + + /* Avoid being pinned by active page. */ + if (unlikely(PageActive(page))) + break; + + /* A trick to speed things up, must be placed after the + * active page test. This check may be removed when chunk + * boundaries are preserved. 
+ */ + if (unlikely((index & 63) == 63) && + !__page_refcnt(head) && + i > MAX_RA_PAGES && + radix_tree_cache_count(&cache) < + index - radix_tree_cache_first_index(&cache)) { +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3 && pidx) + pat[--pidx] = '|'; +#endif + live_head = head; + break; + } + + if (__page_refcnt(page) > refcnt) { /* so they are live pages */ + live_head = head; + break; + } + refcnt = __page_refcnt(page); + } + read_unlock_irq(&mapping->tree_lock); +skip_scan: + +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3) { + for (i = 0; pidx < PAGE_SIZE / 2;) + pat[i++] = pat[pidx++]; + pat[i++] = '|'; + for (page = head; page != tail; page = next_page(page)) { + pidx = page->index; + pat[i++] = page_refcnt_symbol(page); + BUG_ON(PageAnon(page)); + BUG_ON(PageSwapCache(page)); + /* BUG_ON(page_mapped(page)); */ + if (i >= PAGE_SIZE - 1) + break; + } + pat[i] = 0; + pat[PAGE_SIZE - 1] = 0; + } +#endif + + /* Save the remaining pages. */ + i = 0; + if (live_head) { + /* TODO: use batched move to speed up the normal case */ + for (; live_head != tail;) { /* never dereference tail! */ + page = next_page(live_head); + if (!PageActivate(live_head)) { + i++; + list_move(&live_head->lru, save_list); + } + live_head = page; + } + } + + if (i) + inc_page_state(readahead_rescue); + +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3) { + ddprintk("save_chunk(ino=%lu, idx=%lu-%lu-%lu, %s@%s:%s)" + " = %d\n", + mapping->host->i_ino, + index, head->index, pidx, + mapping_mapped(mapping) ? "mmap" : "file", + zone_names[page_zonenum(head)], pat, i); + if (pat != static_buf) + free_page((unsigned long)pat); + } +#endif + + return i; +} + +int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list) +{ + struct address_space *mapping; + struct page *chunk_head; + struct page *live_head; + struct page *page; + unsigned long refcnt; + int ascend_count; + int ret = 0; + + page = list_to_page(page_list); + +next_chunk: + chunk_head = page; + live_head = NULL; + mapping = page->mapping; + ascend_count = 0; + +next_rs_page: + refcnt = __page_refcnt(page); + page = next_page(page); + + if (mapping != page->mapping || &page->lru == page_list) + goto save_chunk; + + if (refcnt <= __page_refcnt(page)) + goto next_rs_page; + + live_head = page; + +next_page: + if (refcnt < __page_refcnt(page)) + ascend_count++; + refcnt = __page_refcnt(page); + page = next_page(page); + + if (mapping != page->mapping || &page->lru == page_list) + goto save_chunk; + + goto next_page; + +save_chunk: + if (!PageAnon(chunk_head) && + !PageSwapCache(chunk_head) && + /* !page_mapped(chunk_head) && */ + ascend_count <= 3 && + (!refcnt || + prev_page(page)->index >= chunk_head->index + 8)) + ret += save_chunk(chunk_head, live_head, page, save_list); + + if (&page->lru != page_list) + goto next_chunk; + + if (ret) + mod_page_state(pgreadahead_rescue, ret); + + return ret; +} + diff -rup linux-2.6.14-rc4-mm1-orig/mm/swap.c linux-2.6.14-rc4-mm1/mm/swap.c --- linux-2.6.14-rc4-mm1-orig/mm/swap.c 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/mm/swap.c 2005-10-17 11:42:30.000000000 +0800 @@ -96,6 +96,8 @@ int rotate_reclaimable_page(struct page return 0; } +extern int readahead_ratio; + /* * FIXME: speed this up? 
*/ @@ -122,8 +124,13 @@ void fastcall activate_page(struct page */ void fastcall mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { - activate_page(page); + if (!PageActive(page) && !PageActivate(page) && + PageReferenced(page) && PageLRU(page)) { + if (readahead_ratio > 9 || (readahead_ratio & 1)) { + page_zone(page)->nr_page_aging++; + SetPageActivate(page); + } else + activate_page(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); @@ -299,6 +306,7 @@ void __pagevec_lru_add(struct pagevec *p if (zone) spin_unlock_irq(&zone->lru_lock); zone = pagezone; + update_page_age(zone); spin_lock_irq(&zone->lru_lock); } if (TestSetPageLRU(page)) diff -rup linux-2.6.14-rc4-mm1-orig/mm/vmscan.c linux-2.6.14-rc4-mm1/mm/vmscan.c --- linux-2.6.14-rc4-mm1-orig/mm/vmscan.c 2005-10-17 11:40:45.000000000 +0800 +++ linux-2.6.14-rc4-mm1/mm/vmscan.c 2005-10-17 11:42:30.000000000 +0800 @@ -370,6 +370,8 @@ static pageout_t pageout(struct page *pa return PAGE_CLEAN; } +extern int readahead_ratio; + /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ @@ -382,6 +384,9 @@ static int shrink_list(struct list_head cond_resched(); + if (readahead_ratio >= 80) + rescue_ra_pages(page_list, &ret_pages); + pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { struct address_space *mapping; @@ -407,6 +412,11 @@ static int shrink_list(struct list_head if (PageWriteback(page)) goto keep_locked; + if (PageActivate(page)) { + ClearPageActivate(page); + goto activate_locked; + } + referenced = page_referenced(page, 1, sc->priority <= 0); /* In active use or really unfreeable? Activate it. */ if (referenced && page_mapping_inuse(page)) @@ -779,6 +789,7 @@ refill_inactive_zone(struct zone *zone, pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_inactive += pgmoved; + zone->nr_page_aging += pgmoved; spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -788,6 +799,7 @@ refill_inactive_zone(struct zone *zone, spin_lock_irq(&zone->lru_lock); } } + zone->nr_page_aging += pgmoved; zone->nr_inactive += pgmoved; pgdeactivate += pgmoved; if (buffer_heads_over_limit) { @@ -870,6 +882,7 @@ shrink_zone(struct zone *zone, struct sc } } + update_page_age(zone); throttle_vm_writeout(); atomic_dec(&zone->reclaim_in_progress); @@ -1057,6 +1070,7 @@ loop_again: for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + int begin_zone = -1; unsigned long lru_pages = 0; all_zones_ok = 1; @@ -1069,6 +1083,8 @@ loop_again: for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; + update_page_age(zone); + if (zone->present_pages == 0) continue; @@ -1078,16 +1094,33 @@ loop_again: if (!zone_watermark_ok(zone, order, zone->pages_high, 0, 0, 0)) { - end_zone = i; - goto scan; + if (!end_zone) + begin_zone = end_zone = i; + else /* if (begin_zone == i + 1) */ + begin_zone = i; } } - goto out; + if (begin_zone < 0) + goto out; } else { + begin_zone = 0; end_zone = pgdat->nr_zones - 1; } -scan: - for (i = 0; i <= end_zone; i++) { + + /* + * Prepare enough free pages for zones with small page_age, + * they are going to be reclaimed in the page allocation. 
+ */ + while (end_zone < pgdat->nr_zones - 1 && + pages_more_aged(pgdat->node_zones + end_zone, + pgdat->node_zones + end_zone + 1)) + end_zone++; + while (begin_zone && + pages_more_aged(pgdat->node_zones + begin_zone, + pgdat->node_zones + begin_zone - 1)) + begin_zone--; + + for (i = begin_zone; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; lru_pages += zone->nr_active + zone->nr_inactive; @@ -1102,7 +1135,7 @@ scan: * pages behind kswapd's direction of progress, which would * cause too much scanning of the lower zones. */ - for (i = 0; i <= end_zone; i++) { + for (i = begin_zone; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
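P.S. As a quick sanity check of the ra_min/ra_max table in the get_readahead_bounds() comment, here is a small user-space sketch. It is not part of the patch; it only recomputes the two rows from the summary formulas (with VM_MIN_READAHEAD = 16 KB), working directly in KB for a 4KB page size. The in-kernel code works in pages via the KB() macro and additionally clamps ra_min to ra_max/2 and ra_max to ra->ra_pages, which this sketch ignores.

/*
 * Recompute the "initial ra_size" and "max ra_size" rows for
 * (inactive + free) memory from 4 MB to 1024 MB.
 */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long mem_mb;

	for (mem_mb = 4; mem_mb <= 1024; mem_mb *= 2) {
		/* sequential-start: min(16 + mem_mb/16, 64) KB */
		unsigned long ra_min_kb = min_ul(16 + mem_mb / 16, 64);
		/* sequential-max: min(64 + mem_mb*64, 2048) KB */
		unsigned long ra_max_kb = min_ul(64 + mem_mb * 64, 2048);

		printf("%4lu MB: initial ra_size = %3lu KB, max ra_size = %4lu KB\n",
		       mem_mb, ra_min_kb, ra_max_kb);
	}
	return 0;
}

Compiling and running this should print the same numbers as the 4..1024 MB columns of the table above.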