This patch changes readahead to move pages that are already in memory and
in the inactive list to the top of the list. This mirrors the behavior
of non-in-core pages. The position of pages already in the active list
remains unchanged.
The behavior without this patch (leaving in-core pages untouched) means
that pages already in core may be evicted before prefetched pages. Many
small read requests may be forced on the disk because of this behavior.
In particular, note that this behavior means that a
posix_fadvise(... POSIX_FADV_WILLNEED) system call on an in-core page has
no effect, even if that page is the next victim on the inactive list.
This change helps address the performance problems we encountered
while modifying SQLite and the GIMP to use large file prefetching.
Overall these prefetching techniques improved the runtime of large
benchmarks by 10-17x for these applications. More in the publication
_Reducing Seek Overhead with Application-Directed Prefetching_ in
USENIX ATC 2009 and at http://libprefetch.cs.ucla.edu/.
Signed-off-by: Chris Frost <[email protected]>
Signed-off-by: Steve VanDeBogart <[email protected]>
---
The sparse checker produces this warning, which I believe is OK, but
I do not know how to convince sparse of this:
mm/readahead.c:144:9: warning: context imbalance in 'retain_pages' - different lock contexts for basic block
mm/readahead.c | 58 +++++++++++++++++++++++++++++++++++++++++++++----------
1 files changed, 47 insertions(+), 11 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa23..4559563 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -10,6 +10,8 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -132,6 +134,33 @@ out:
return ret;
}
+static void retain_pages(struct pagevec *vec)
+{
+ struct zone *lockedzone = NULL;
+ struct zone *zone;
+ struct page *page;
+ int i;
+
+ for (i = 0; i < pagevec_count(vec); i++) {
+ page = vec->pages[i];
+ zone = page_zone(page);
+ if (zone != lockedzone) {
+ if (lockedzone != NULL)
+ spin_unlock_irq(&lockedzone->lru_lock);
+ lockedzone = zone;
+ spin_lock_irq(&lockedzone->lru_lock);
+ }
+ if (PageLRU(page) && !PageActive(page)) {
+ del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE);
+ add_page_to_lru_list(zone, page, LRU_INACTIVE_FILE);
+ }
+ page_cache_release(page);
+ }
+ if (lockedzone != NULL)
+ spin_unlock_irq(&lockedzone->lru_lock);
+ pagevec_reinit(vec);
+}
+
/*
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
@@ -147,6 +176,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
{
struct inode *inode = mapping->host;
struct page *page;
+ struct pagevec retain_vec;
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
@@ -157,6 +187,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
goto out;
end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+ pagevec_init(&retain_vec, 0);
/*
* Preallocate as many pages as we will need.
@@ -170,19 +201,24 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
rcu_read_unlock();
- if (page)
- continue;
-
- page = page_cache_alloc_cold(mapping);
- if (!page)
- break;
- page->index = page_offset;
- list_add(&page->lru, &page_pool);
- if (page_idx == nr_to_read - lookahead_size)
- SetPageReadahead(page);
- ret++;
+ if (page) {
+ page_cache_get(page);
+ if (!pagevec_add(&retain_vec, page))
+ retain_pages(&retain_vec);
+ } else {
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ break;
+ page->index = page_offset;
+ list_add(&page->lru, &page_pool);
+ if (page_idx == nr_to_read - lookahead_size)
+ SetPageReadahead(page);
+ ret++;
+ }
}
+ retain_pages(&retain_vec);
+
/*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
--
1.5.4.3
--
Chris Frost
http://www.frostnet.net/chris/
Hi Chris,
On Wed, Jan 20, 2010 at 01:55:36PM -0800, Chris Frost wrote:
> This patch changes readahead to move pages that are already in memory and
> in the inactive list to the top of the list. This mirrors the behavior
> of non-in-core pages. The position of pages already in the active list
> remains unchanged.
This is good in general.
> @@ -170,19 +201,24 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
> rcu_read_lock();
> page = radix_tree_lookup(&mapping->page_tree, page_offset);
> rcu_read_unlock();
> - if (page)
> - continue;
> -
> - page = page_cache_alloc_cold(mapping);
> - if (!page)
> - break;
> - page->index = page_offset;
> - list_add(&page->lru, &page_pool);
> - if (page_idx == nr_to_read - lookahead_size)
> - SetPageReadahead(page);
> - ret++;
> + if (page) {
> + page_cache_get(page);
This is racy - the page may have already been freed and possibly reused
by others in the meantime.
If you do page_cache_get() on a random page, it may trigger bad_page()
in the buddy page allocator, or the VM_BUG_ON() in put_page_testzero().
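Just to illustrate (a rough sketch of the usual pattern, not something the
patch needs to open-code): a lockless lookup has to pin the page with
page_cache_get_speculative() and then re-check the radix tree slot, which
is what find_get_page() already does for you (find_get_page() actually
retries instead of giving up):

	struct page *page;

	rcu_read_lock();
	page = radix_tree_lookup(&mapping->page_tree, page_offset);
	/* try a speculative reference; fails if the page is being freed */
	if (page && !page_cache_get_speculative(page))
		page = NULL;
	/* has the slot been reused for another page in the meantime? */
	if (page && page != radix_tree_lookup(&mapping->page_tree, page_offset)) {
		page_cache_release(page);
		page = NULL;
	}
	rcu_read_unlock();

	/* ...or simply: page = find_get_page(mapping, page_offset); */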
> + if (!pagevec_add(&retain_vec, page))
> + retain_pages(&retain_vec);
> + } else {
> + page = page_cache_alloc_cold(mapping);
> + if (!page)
> + break;
> + page->index = page_offset;
> + list_add(&page->lru, &page_pool);
> + if (page_idx == nr_to_read - lookahead_size)
> + SetPageReadahead(page);
> + ret++;
> + }
Years ago I wrote a similar function, which can be called for both
in-kernel-readahead (when it decides not to bring in new pages, but
only retain existing pages) and fadvise-readahead (where it wants to
read new pages as well as retain existing pages).
For better chance of code reuse, would you rebase the patch on it?
(You'll have to do some cleanups first.)
+/*
+ * Move pages in danger (of thrashing) to the head of inactive_list.
+ * Not expected to happen frequently.
+ */
+static unsigned long rescue_pages(struct address_space *mapping,
+ struct file_ra_state *ra,
+ pgoff_t index, unsigned long nr_pages)
+{
+ struct page *grabbed_page;
+ struct page *page;
+ struct zone *zone;
+ int pgrescue = 0;
+
+ dprintk("rescue_pages(ino=%lu, index=%lu, nr=%lu)\n",
+ mapping->host->i_ino, index, nr_pages);
+
+ for(; nr_pages;) {
+ grabbed_page = page = find_get_page(mapping, index);
+ if (!page) {
+ index++;
+ nr_pages--;
+ continue;
+ }
+
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+
+ if (!PageLRU(page)) {
+ index++;
+ nr_pages--;
+ goto next_unlock;
+ }
+
+ do {
+ struct page *the_page = page;
+ page = list_entry((page)->lru.prev, struct page, lru);
+ index++;
+ nr_pages--;
+ ClearPageReadahead(the_page);
+ if (!PageActive(the_page) &&
+ !PageLocked(the_page) &&
+ page_count(the_page) == 1) {
+ list_move(&the_page->lru, &zone->inactive_list);
+ pgrescue++;
+ }
+ } while (nr_pages &&
+ page_mapping(page) == mapping &&
+ page_index(page) == index);
+
+next_unlock:
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(grabbed_page);
+ cond_resched();
+ }
+
+ ra_account(ra, RA_EVENT_READAHEAD_RESCUE, pgrescue);
+ return pgrescue;
+}
Thanks,
Fengguang
On Thu, Jan 21, 2010 at 01:47:34PM +0800, Wu Fengguang wrote:
> On Wed, Jan 20, 2010 at 01:55:36PM -0800, Chris Frost wrote:
> > This patch changes readahead to move pages that are already in memory and
> > in the inactive list to the top of the list. This mirrors the behavior
> > of non-in-core pages. The position of pages already in the active list
> > remains unchanged.
>
> This is good in general.
Great!
> > @@ -170,19 +201,24 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
> > rcu_read_lock();
> > page = radix_tree_lookup(&mapping->page_tree, page_offset);
> > rcu_read_unlock();
> > - if (page)
> > - continue;
> > -
> > - page = page_cache_alloc_cold(mapping);
> > - if (!page)
> > - break;
> > - page->index = page_offset;
> > - list_add(&page->lru, &page_pool);
> > - if (page_idx == nr_to_read - lookahead_size)
> > - SetPageReadahead(page);
> > - ret++;
> > + if (page) {
> > + page_cache_get(page);
>
> This is racy - the page may have already been freed and possibly reused
> by others in the meantime.
>
> If you do page_cache_get() on a random page, it may trigger bad_page()
> in the buddy page allocator, or the VM_BUG_ON() in put_page_testzero().
Thanks for catching these.
>
> > + if (!pagevec_add(&retain_vec, page))
> > + retain_pages(&retain_vec);
> > + } else {
> > + page = page_cache_alloc_cold(mapping);
> > + if (!page)
> > + break;
> > + page->index = page_offset;
> > + list_add(&page->lru, &page_pool);
> > + if (page_idx == nr_to_read - lookahead_size)
> > + SetPageReadahead(page);
> > + ret++;
> > + }
>
> Years ago I wrote a similar function, which can be called for both
> in-kernel-readahead (when it decides not to bring in new pages, but
> only retain existing pages) and fadvise-readahead (where it wants to
> read new pages as well as retain existing pages).
>
> For better chance of code reuse, would you rebase the patch on it?
> (You'll have to do some cleanups first.)
This sounds good; thanks. I've rebased my change on the below.
Fwiw, performance is unchanged. A few questions below.
> +/*
> + * Move pages in danger (of thrashing) to the head of inactive_list.
> + * Not expected to happen frequently.
> + */
> +static unsigned long rescue_pages(struct address_space *mapping,
> + struct file_ra_state *ra,
> + pgoff_t index, unsigned long nr_pages)
> +{
> + struct page *grabbed_page;
> + struct page *page;
> + struct zone *zone;
> + int pgrescue = 0;
> +
> + dprintk("rescue_pages(ino=%lu, index=%lu, nr=%lu)\n",
> + mapping->host->i_ino, index, nr_pages);
> +
> + for(; nr_pages;) {
> + grabbed_page = page = find_get_page(mapping, index);
> + if (!page) {
> + index++;
> + nr_pages--;
> + continue;
> + }
> +
> + zone = page_zone(page);
> + spin_lock_irq(&zone->lru_lock);
> +
> + if (!PageLRU(page)) {
> + index++;
> + nr_pages--;
> + goto next_unlock;
> + }
> +
> + do {
> + struct page *the_page = page;
> + page = list_entry((page)->lru.prev, struct page, lru);
> + index++;
> + nr_pages--;
> + ClearPageReadahead(the_page);
> + if (!PageActive(the_page) &&
> + !PageLocked(the_page) &&
> + page_count(the_page) == 1) {
Why require the page count to be 1?
> + list_move(&the_page->lru, &zone->inactive_list);
The LRU list manipulation interface has changed since this patch.
I believe we should replace the list_move() call with:
del_page_from_lru_list(zone, the_page, LRU_INACTIVE_FILE);
add_page_to_lru_list(zone, the_page, LRU_INACTIVE_FILE);
This moves the page to the top of the list, but also notifies mem_cgroup.
It also, I believe needlessly, decrements and then increments the zone
state for each move.
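(For reference, the mm_inline.h helpers in question look roughly like this,
from memory, so double-check against your tree; this is where the
mem_cgroup notification and the zone state decrement/increment come from:)

	static inline void
	add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
	{
		list_add(&page->lru, &zone->lru[l].list);
		__inc_zone_state(zone, NR_LRU_BASE + l);
		mem_cgroup_add_lru_list(page, l);
	}

	static inline void
	del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
	{
		list_del(&page->lru);
		__dec_zone_state(zone, NR_LRU_BASE + l);
		mem_cgroup_del_lru_list(page, l);
	}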
> + pgrescue++;
> + }
> + } while (nr_pages &&
> + page_mapping(page) == mapping &&
> + page_index(page) == index);
Is it ok to not lock each page in this while loop? (Does the zone lock
protect all the reads and writes?)
Will the zone be the same for all pages seen inside a given run of this
while loop?
Do you think performance would be better if the code used a pagevec and
a call to find_get_pages_contig(), instead of the above find_get_page()
and this loop over the LRU list?
> +
> +next_unlock:
> + spin_unlock_irq(&zone->lru_lock);
> + page_cache_release(grabbed_page);
> + cond_resched();
> + }
> +
> + ra_account(ra, RA_EVENT_READAHEAD_RESCUE, pgrescue);
I don't see ra_account() or relevant fields in struct file_ra_state in
the current kernel. I'll drop the ra_account() call?
> + return pgrescue;
> +}
--
Chris Frost
http://www.frostnet.net/chris/
Hi Chris,
> > +/*
> > + * Move pages in danger (of thrashing) to the head of inactive_list.
> > + * Not expected to happen frequently.
> > + */
> > +static unsigned long rescue_pages(struct address_space *mapping,
> > + struct file_ra_state *ra,
> > + pgoff_t index, unsigned long nr_pages)
> > +{
> > + struct page *grabbed_page;
> > + struct page *page;
> > + struct zone *zone;
> > + int pgrescue = 0;
> > +
> > + dprintk("rescue_pages(ino=%lu, index=%lu, nr=%lu)\n",
> > + mapping->host->i_ino, index, nr_pages);
> > +
> > + for(; nr_pages;) {
> > + grabbed_page = page = find_get_page(mapping, index);
> > + if (!page) {
> > + index++;
> > + nr_pages--;
> > + continue;
> > + }
> > +
> > + zone = page_zone(page);
> > + spin_lock_irq(&zone->lru_lock);
> > +
> > + if (!PageLRU(page)) {
> > + index++;
> > + nr_pages--;
> > + goto next_unlock;
> > + }
> > +
> > + do {
> > + struct page *the_page = page;
> > + page = list_entry((page)->lru.prev, struct page, lru);
> > + index++;
> > + nr_pages--;
> > + ClearPageReadahead(the_page);
> > + if (!PageActive(the_page) &&
> > + !PageLocked(the_page) &&
> > + page_count(the_page) == 1) {
>
> Why require the page count to be 1?
Hmm, I think the PageLocked() and page_count() tests were meant to
skip pages being manipulated by someone else.
You can just remove them. In fact the page_count() == 1 test will exclude
the grabbed_page, so it must be removed. Thanks for the reminder!
>
> > + list_move(&the_page->lru, &zone->inactive_list);
>
> The LRU list manipulation interface has changed since this patch.
Yeah.
> I believe we should replace the list_move() call with:
> del_page_from_lru_list(zone, the_page, LRU_INACTIVE_FILE);
> add_page_to_lru_list(zone, the_page, LRU_INACTIVE_FILE);
> This moves the page to the top of the list, but also notifies mem_cgroup.
> It also, I believe needlessly, decrements and then increments the zone
> state for each move.
Why do you think mem_cgroup shall be notified here? As I understand
it, mem_cgroup should only care about page addition/removal.
> > + pgrescue++;
> > + }
> > + } while (nr_pages &&
> > + page_mapping(page) == mapping &&
> > + page_index(page) == index);
>
> Is it ok to not lock each page in this while loop? (Does the zone lock
> protect all the reads and writes?)
I believe yes. We are only changing page->lru, which is protected by
zone->lru_lock. Btw, why should we care about reads/writes?
> Will the zone be the same for all pages seen inside a given run of this
> while loop?
Sure. page->lru always links to other pages in the same zone.
> Do you think performance would be better if the code used a pagevec and
> a call to find_get_pages_contig(), instead of the above find_get_page()
> and this loop over the LRU list?
I'm not sure. It should not be a big problem either way.
We can consider it if the find_get_pages_contig() implementation would
be much simpler and cleaner :)
>
> > +
> > +next_unlock:
> > + spin_unlock_irq(&zone->lru_lock);
> > + page_cache_release(grabbed_page);
> > + cond_resched();
> > + }
> > +
> > + ra_account(ra, RA_EVENT_READAHEAD_RESCUE, pgrescue);
>
> I don't see ra_account() or relevant fields in struct file_ra_state in
> the current kernel. I'll drop the ra_account() call?
Yes, please. It's for an unmerged feature.
Thanks,
Fengguang
On Sat, 23 Jan 2010 18:22:22 +0800
Wu Fengguang <[email protected]> wrote:
> Hi Chris,
>
> > > +/*
> > > + * Move pages in danger (of thrashing) to the head of inactive_list.
> > > + * Not expected to happen frequently.
> > > + */
> > > +static unsigned long rescue_pages(struct address_space *mapping,
> > > + struct file_ra_state *ra,
> > > + pgoff_t index, unsigned long nr_pages)
> > > +{
> > > + struct page *grabbed_page;
> > > + struct page *page;
> > > + struct zone *zone;
> > > + int pgrescue = 0;
> > > +
> > > + dprintk("rescue_pages(ino=%lu, index=%lu, nr=%lu)\n",
> > > + mapping->host->i_ino, index, nr_pages);
> > > +
> > > + for(; nr_pages;) {
> > > + grabbed_page = page = find_get_page(mapping, index);
> > > + if (!page) {
> > > + index++;
> > > + nr_pages--;
> > > + continue;
> > > + }
> > > +
> > > + zone = page_zone(page);
> > > + spin_lock_irq(&zone->lru_lock);
> > > +
> > > + if (!PageLRU(page)) {
> > > + index++;
> > > + nr_pages--;
> > > + goto next_unlock;
> > > + }
> > > +
> > > + do {
> > > + struct page *the_page = page;
> > > + page = list_entry((page)->lru.prev, struct page, lru);
> > > + index++;
> > > + nr_pages--;
> > > + ClearPageReadahead(the_page);
> > > + if (!PageActive(the_page) &&
> > > + !PageLocked(the_page) &&
> > > + page_count(the_page) == 1) {
> >
> > Why require the page count to be 1?
>
> Hmm, I think the PageLocked() and page_count() tests were meant to
> skip pages being manipulated by someone else.
>
> You can just remove them. In fact the page_count() == 1 test will exclude
> the grabbed_page, so it must be removed. Thanks for the reminder!
>
> >
> > > + list_move(&the_page->lru, &zone->inactive_list);
> >
> > The LRU list manipulation interface has changed since this patch.
>
> Yeah.
>
> > I believe we should replace the list_move() call with:
> > del_page_from_lru_list(zone, the_page, LRU_INACTIVE_FILE);
> > add_page_to_lru_list(zone, the_page, LRU_INACTIVE_FILE);
> > This moves the page to the top of the list, but also notifies mem_cgroup.
> > It also, I believe needlessly, decrements and then increments the zone
> > state for each move.
>
> Why do you think mem_cgroup shall be notified here? As I understand
> it, mem_cgroup should only care about page addition/removal.
>
No. memcg maintains its LRU list in sync with the global LRU.
So, I think it's better to use the usual LRU handler calls as Chris does.
And... for maintenance, I like the following code rather than your direct code,
because you mention "Not expected to happen frequently."
void find_isolate_inactive_page(struct address_space *mapping, pgoff_t index, int len)
{
int i = 0;
struct list_head *list;
for (i = 0; i < len; i++) {
page = find_get_page(mapping, index + i);
if (!page)
continue;
zone = page_zone(page);
spin_lock_irq(&zone->lru_lock); /* you can optimize this if you want */
/* isolate_lru_page() doesn't handle the type of list, so call __isolate_lru_page */
if (__isolate_lru_page(page, ISOLATE_INACTIVE, 1))
continue;
spin_unlock_irq(&zone->lru_lock);
ClearPageReadahead(page);
putback_lru_page(page);
}
}
Please feel free to do as you want, but please take care of memcg's LRU management.
Thanks,
-Kame
On Sun, Jan 24, 2010 at 05:42:28PM -0700, KAMEZAWA Hiroyuki wrote:
> On Sat, 23 Jan 2010 18:22:22 +0800
> Wu Fengguang <[email protected]> wrote:
> > Why do you think mem_cgroup shall be notified here? As I understand
> > it, mem_cgroup should only care about page addition/removal.
> >
> No. memcg maintains its LRU list in sync with the global LRU.
> So, I think it's better to use the usual LRU handler calls as Chris does.
Ah right, thanks for the reminder!
> And... for maintenance, I like the following code rather than your direct code,
> because you mention "Not expected to happen frequently."
Yup, won't be frequent for in-kernel readahead and for _sane_ fadvise() users :)
> void find_isolate_inactive_page(struct address_space *mapping, pgoff_t index, int len)
> {
> int i = 0;
> struct list_head *list;
>
> for (i = 0; i < len; i++) {
> page = find_get_page(mapping, index + i);
> if (!page)
> continue;
> zone = page_zone(page);
> spin_lock_irq(&zone->lru_lock); /* you can optimize this if you want */
I don't care to optimize. Chris?
> /* isolate_lru_page() doesn't handle the type of list, so call __isolate_lru_page */
> if (__isolate_lru_page(page, ISOLATE_INACTIVE, 1))
__isolate_lru_page() doesn't actually take the page off the LRU, hence at
least the accounting will be wrong. I'll just use Chris's
del_page_from_lru_list()/add_page_to_lru_list() pair.
> continue;
> spin_unlock_irq(&zone->lru_lock);
> ClearPageReadahead(page);
> putback_lru_page(page);
> }
> }
>
> Please feel free to do as you want, but please take care of memcg's LRU management.
OK, thanks.
I updated the patch with your sign-offs pre-added.
Please ack or optimize.
Thanks,
Fengguang
---
readahead: retain inactive lru pages to be accessed soon
From: Chris Frost <[email protected]>
Make sure cached pages in the inactive list won't be evicted too soon,
by moving them back to the lru head when they are covered by
- in-kernel heuristic readahead
- posix_fadvise(POSIX_FADV_WILLNEED) hint from applications
Before this patch, pages already in core may be evicted before prefetched
pages. Many small read requests may be forced on the disk because of
this behavior.
In particular, posix_fadvise(... POSIX_FADV_WILLNEED) on an in-core page
has no effect, even if that page is the next victim on the inactive list.
This change helps address the performance problems we encountered
while modifying SQLite and the GIMP to use large file prefetching.
Overall these prefetching techniques improved the runtime of large
benchmarks by 10-17x for these applications. More in the publication
_Reducing Seek Overhead with Application-Directed Prefetching_ in
USENIX ATC 2009 and at http://libprefetch.cs.ucla.edu/.
Signed-off-by: Chris Frost <[email protected]>
Signed-off-by: Steve VanDeBogart <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Wu Fengguang <[email protected]>
---
mm/readahead.c | 39 +++++++++++++++++++++++++++++++++++++++
mm/vmscan.c | 1 -
2 files changed, 39 insertions(+), 1 deletion(-)
--- linux-mm.orig/mm/readahead.c 2010-01-25 09:17:31.000000000 +0800
+++ linux-mm/mm/readahead.c 2010-01-25 10:40:12.000000000 +0800
@@ -9,7 +9,9 @@
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/memcontrol.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -133,6 +135,35 @@ out:
}
/*
+ * The file range is expected to be accessed in near future. Move pages
+ * (possibly in inactive lru tail) to lru head, so that they are retained
+ * in memory for some reasonable time.
+ */
+static void retain_inactive_pages(struct address_space *mapping,
+ pgoff_t index, int len)
+{
+ int i;
+ struct page *page;
+ struct zone *zone;
+
+ for (i = 0; i < len; i++) {
+ page = find_get_page(mapping, index + i);
+ if (!page)
+ continue;
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (!PageActive(page) && !PageUnevictable(page)) {
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(zone, page, lru);
+ add_page_to_lru_list(zone, page, lru);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ put_page(page);
+ }
+}
+
+/*
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
@@ -184,6 +215,14 @@ __do_page_cache_readahead(struct address
}
/*
+ * Normally readahead will auto stop on cached segments, so we won't
+ * hit many cached pages. If it does happen, bring the inactive pages
+ * adjacent to the newly prefetched ones (if any).
+ */
+ if (ret < nr_to_read)
+ retain_inactive_pages(mapping, offset, page_idx);
+
+ /*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
--- linux-mm.orig/mm/vmscan.c 2010-01-25 08:58:40.000000000 +0800
+++ linux-mm/mm/vmscan.c 2010-01-25 09:17:27.000000000 +0800
@@ -2892,4 +2892,3 @@ void scan_unevictable_unregister_node(st
{
sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
}
-
I changed Wu's patch to add a PageLRU() guard that I believe is required
and optimized zone lock acquisition to only unlock and lock at zone changes.
This optimization seems to provide a 10-20% system time improvement for
some of my GIMP benchmarks and no improvement for other benchmarks.
I agree that the remove and add lru list entry code looks correct.
putback_lru_page() has to worry about a page's evictable status
changing, but I think this code does not because it holds the page
zone lock.
Wu removed the ClearPageReadahead(page) call on in-core pages that
Kamezawa's change added. This removal (i.e., not making this call) looks
ok to me.
Thanks Wu and Kamezawa.
What's next?
---
readahead: retain inactive lru pages to be accessed soon
From: Chris Frost <[email protected]>
Ensure that cached pages in the inactive list are not prematurely evicted;
move such pages to lru head when they are covered by
- in-kernel heuristic readahead
- a posix_fadvise(POSIX_FADV_WILLNEED) hint from an application
Before this patch, pages already in core may be evicted before the
pages covered by the same prefetch scan but that were not yet in core.
Many small read requests may be forced on the disk because of this behavior.
In particular, posix_fadvise(... POSIX_FADV_WILLNEED) on an in-core page
has no effect on the page's location in the LRU list, even if it is the
next victim on the inactive list.
This change helps address the performance problems we encountered
while modifying SQLite and the GIMP to use large file prefetching.
Overall these prefetching techniques improved the runtime of large
benchmarks by 10-17x for these applications. More in the publication
_Reducing Seek Overhead with Application-Directed Prefetching_ in
USENIX ATC 2009 and at http://libprefetch.cs.ucla.edu/.
Signed-off-by: Chris Frost <[email protected]>
Signed-off-by: Steve VanDeBogart <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Wu Fengguang <[email protected]>
---
readahead.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa23..c1d67ab 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -9,7 +9,9 @@
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/memcontrol.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -133,6 +135,43 @@ out:
}
/*
+ * The file range is expected to be accessed in near future. Move pages
+ * (possibly in inactive lru tail) to lru head, so that they are retained
+ * in memory for some reasonable time.
+ */
+static void retain_inactive_pages(struct address_space *mapping,
+ pgoff_t index, int len)
+{
+ int i;
+ struct page *page;
+ struct zone *zone;
+ struct zone *locked_zone = NULL;
+
+ for (i = 0; i < len; i++) {
+ page = find_get_page(mapping, index + i);
+ if (!page)
+ continue;
+ zone = page_zone(page);
+ if (zone != locked_zone) {
+ if (locked_zone)
+ spin_unlock_irq(&locked_zone->lru_lock);
+ locked_zone = zone;
+ spin_lock_irq(&locked_zone->lru_lock);
+ }
+ if (!PageActive(page) && !PageUnevictable(page) &&
+ PageLRU(page)) {
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(zone, page, lru);
+ add_page_to_lru_list(zone, page, lru);
+ }
+ put_page(page);
+ }
+ if (locked_zone)
+ spin_unlock_irq(&locked_zone->lru_lock);
+}
+
+/*
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
@@ -184,6 +223,14 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
}
/*
+ * Normally readahead will auto stop on cached segments, so we won't
+ * hit many cached pages. If it does happen, bring the inactive pages
+ * adjacent to the newly prefetched ones (if any).
+ */
+ if (ret < nr_to_read)
+ retain_inactive_pages(mapping, offset, page_idx);
+
+ /*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
--
Chris Frost
http://www.frostnet.net/chris/
On Mon, Jan 25, 2010 at 03:36:35PM -0700, Chris Frost wrote:
> I changed Wu's patch to add a PageLRU() guard that I believe is required
Good catch, Thanks!
> and optimized zone lock acquisition to only unlock and lock at zone changes.
> This optimization seems to provide a 10-20% system time improvement for
> some of my GIMP benchmarks and no improvement for other benchmarks.
OK.
> I agree that the remove and add lru list entry code looks correct.
> putback_lru_page() has to worry about a page's evictable status
> changing, but I think this code does not because it holds the page
> zone lock.
>
> Wu removed the ClearPageReadahead(page) call on in-core pages that
> Kamezawa's change added. This removal (i.e., not making this call) looks
> ok to me.
>
> Thanks Wu and Kamezawa.
>
>
> What's next?
I happen to be preparing a readahead series, will include this one :)
Thanks,
Fengguang
> ---
> readahead: retain inactive lru pages to be accessed soon
> From: Chris Frost <[email protected]>
>
> Ensure that cached pages in the inactive list are not prematurely evicted;
> move such pages to lru head when they are covered by
> - in-kernel heuristic readahead
> - a posix_fadvise(POSIX_FADV_WILLNEED) hint from an application
>
> Before this patch, pages already in core may be evicted before the
> pages covered by the same prefetch scan but that were not yet in core.
> Many small read requests may be forced on the disk because of this behavior.
>
> In particular, posix_fadvise(... POSIX_FADV_WILLNEED) on an in-core page
> has no effect on the page's location in the LRU list, even if it is the
> next victim on the inactive list.
>
> This change helps address the performance problems we encountered
> while modifying SQLite and the GIMP to use large file prefetching.
> Overall these prefetching techniques improved the runtime of large
> benchmarks by 10-17x for these applications. More in the publication
> _Reducing Seek Overhead with Application-Directed Prefetching_ in
> USENIX ATC 2009 and at http://libprefetch.cs.ucla.edu/.
>
> Signed-off-by: Chris Frost <[email protected]>
> Signed-off-by: Steve VanDeBogart <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> Signed-off-by: Wu Fengguang <[email protected]>
> ---
> readahead.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 47 insertions(+)
>
> diff --git a/mm/readahead.c b/mm/readahead.c
> index aa1aa23..c1d67ab 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -9,7 +9,9 @@
>
> #include <linux/kernel.h>
> #include <linux/fs.h>
> +#include <linux/memcontrol.h>
> #include <linux/mm.h>
> +#include <linux/mm_inline.h>
> #include <linux/module.h>
> #include <linux/blkdev.h>
> #include <linux/backing-dev.h>
> @@ -133,6 +135,43 @@ out:
> }
>
> /*
> + * The file range is expected to be accessed in near future. Move pages
> + * (possibly in inactive lru tail) to lru head, so that they are retained
> + * in memory for some reasonable time.
> + */
> +static void retain_inactive_pages(struct address_space *mapping,
> + pgoff_t index, int len)
> +{
> + int i;
> + struct page *page;
> + struct zone *zone;
> + struct zone *locked_zone = NULL;
> +
> + for (i = 0; i < len; i++) {
> + page = find_get_page(mapping, index + i);
> + if (!page)
> + continue;
> + zone = page_zone(page);
> + if (zone != locked_zone) {
> + if (locked_zone)
> + spin_unlock_irq(&locked_zone->lru_lock);
> + locked_zone = zone;
> + spin_lock_irq(&locked_zone->lru_lock);
> + }
> + if (!PageActive(page) && !PageUnevictable(page) &&
> + PageLRU(page)) {
> + int lru = page_lru_base_type(page);
> +
> + del_page_from_lru_list(zone, page, lru);
> + add_page_to_lru_list(zone, page, lru);
> + }
> + put_page(page);
> + }
> + if (locked_zone)
> + spin_unlock_irq(&locked_zone->lru_lock);
> +}
> +
> +/*
> * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
> * the pages first, then submits them all for I/O. This avoids the very bad
> * behaviour which would occur if page allocations are causing VM writeback.
> @@ -184,6 +223,14 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
> }
>
> /*
> + * Normally readahead will auto stop on cached segments, so we won't
> + * hit many cached pages. If it does happen, bring the inactive pages
> + * adjacent to the newly prefetched ones (if any).
> + */
> + if (ret < nr_to_read)
> + retain_inactive_pages(mapping, offset, page_idx);
> +
> + /*
> * Now start the IO. We ignore I/O errors - if the page is not
> * uptodate then the caller will launch readpage again, and
> * will then handle the error.
>
> --
> Chris Frost
> http://www.frostnet.net/chris/
On Mon, Jan 25, 2010 at 03:36:35PM -0700, Chris Frost wrote:
> I changed Wu's patch to add a PageLRU() guard that I believe is required
> and optimized zone lock acquisition to only unlock and lock at zone changes.
> This optimization seems to provide a 10-20% system time improvement for
> some of my GIMP benchmarks and no improvement for other benchmarks.
> + del_page_from_lru_list(zone, page, lru);
> + add_page_to_lru_list(zone, page, lru);
> + }
> + put_page(page);
Hmm. put_page() inside lru_lock can deadlock. So you need to further
optimize the code to do pagevec_lookup() and pagevec_release() plus
cond_resched().
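Something along these lines - a rough, untested sketch just to show the
shape of the batching I mean, not necessarily the final code:

	static void retain_inactive_pages(struct address_space *mapping,
					  pgoff_t index, int len)
	{
		struct pagevec pvec;
		pgoff_t end = index + len;

		pagevec_init(&pvec, 0);
		while (index < end &&
		       pagevec_lookup(&pvec, mapping, index,
				      min_t(pgoff_t, end - index, PAGEVEC_SIZE))) {
			int i;

			for (i = 0; i < pagevec_count(&pvec); i++) {
				struct page *page = pvec.pages[i];
				struct zone *zone;

				if (page->index >= end) {
					index = end;	/* past the range, stop */
					break;
				}
				index = page->index + 1;
				zone = page_zone(page);
				spin_lock_irq(&zone->lru_lock);
				if (PageLRU(page) && !PageActive(page) &&
				    !PageUnevictable(page)) {
					int lru = page_lru_base_type(page);

					del_page_from_lru_list(zone, page, lru);
					add_page_to_lru_list(zone, page, lru);
				}
				spin_unlock_irq(&zone->lru_lock);
			}
			/* references are dropped here, outside zone->lru_lock */
			pagevec_release(&pvec);
			cond_resched();
		}
	}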
Thanks,
Fengguang
Hi, Wu.
I have missed this thread until now.
Before review, first of all, Thanks for adding to good feature, Chris and Wu.
I have some questions.
2010/1/21 Wu Fengguang <[email protected]>:
> Years ago I wrote a similar function, which can be called for both
> in-kernel-readahead (when it decides not to bring in new pages, but
> only retain existing pages) and fadvise-readahead (where it want to
> read new pages as well as retain existing pages).
Why doesn't it merged into mainline?
It's private patch or has some problem?
Actually I am worried about this patch.
That's because it makes an exceptional shortcut promotion in reclaim.
Of course, if readahead is working well, this patch's effect would also
be good. But let's think about it.
This patch's effect happens when the inactive file list is small, I think.
It means there is high memory pressure, so if we move ra pages to the
head of the inactive list, other applications which urgently require free pages
suffer from latency or are killed.
If the VM doesn't have this patch, of course ra pages are discarded and
then I/O performance would be bad. But, as I mentioned, that is a time of
high memory pressure, so the low I/O performance gives the system
natural throttling. It can help the system out of memory pressure.
In summary, I think it's good from the viewpoint of I/O, but I am not sure
it's good from the viewpoint of the system.
I will review this patch after my concern is solved. :)
Thanks.
--
Kind regards,
Minchan Kim
Hi Minchan,
On Wed, Jan 27, 2010 at 09:09:40AM +0200, Minchan Kim wrote:
> Hi, Wu.
>
> I have missed this thread until now.
> Before review, first of all, Thanks for adding to good feature, Chris and Wu.
> I have some questions.
>
> 2010/1/21 Wu Fengguang <[email protected]>:
>
> > Years ago I wrote a similar function, which can be called for both
> > in-kernel-readahead (when it decides not to bring in new pages, but
> > only retain existing pages) and fadvise-readahead (where it wants to
> > read new pages as well as retain existing pages).
>
> Why doesn't it merged into mainline?
> It's private patch or has some problem?
It's part of the early adaptive readahead patchset, which is too
complex to be acceptable to mainline.
> Actually I am worried about this patch.
> That's because it makes an exceptional shortcut promotion in reclaim.
>
> Of course, if readahead is working well, this patch's effect would also
> be good. But let's think about it.
>
> This patch's effect happens when the inactive file list is small, I think.
> It means there is high memory pressure, so if we move ra pages to the
> head of the inactive list, other applications which urgently require free pages
> suffer from latency or are killed.
>
> If the VM doesn't have this patch, of course ra pages are discarded and
> then I/O performance would be bad. But, as I mentioned, that is a time of
> high memory pressure, so the low I/O performance gives the system
> natural throttling. It can help the system out of memory pressure.
>
> In summary, I think it's good from the viewpoint of I/O, but I am not sure
> it's good from the viewpoint of the system.
>
> I will review this patch after my concern is solved. :)
That's a legitimate concern. I'm now including this patch in a bigger
patchset to do adaptive (wrt. thrashing safety) readahead, which will
automatically back off the readahead size when thrashing happens. I hope
that will address your concern.
Thanks,
Fengguang
On Wed, 27 Jan 2010, Minchan Kim wrote:
> This patch's effect happens when the inactive file list is small, I think.
> It means there is high memory pressure, so if we move ra pages to the
This patch does the same thing regardless of memory pressure - it
doesn't just apply in high memory pressure situations. Is your concern
that in high memory pressure situations this patch will make things worse?
> head of the inactive list, other applications which urgently require free pages
> suffer from latency or are killed.
I don't think this patch will affect the number of pages reclaimed, only
which pages are reclaimed. In extreme cases it could increase the time
needed to reclaim that many pages, but the inactive list would have to be
very short.
> If the VM doesn't have this patch, of course ra pages are discarded and
> then I/O performance would be bad. But, as I mentioned, that is a time of
> high memory pressure, so the low I/O performance gives the system
> natural throttling. It can help the system out of memory pressure.
Even in low memory situations, improving I/O performance can help the
overall system performance. For example if most of the inactive list
is dirty, needlessly discarding pages, just to refetch them will clog
I/O and increase the time needed to write out the dirty pages.
> In summary, I think it's good from the viewpoint of I/O, but I am not sure
> it's good from the viewpoint of the system.
In this case, I think what's good for I/O is good for the system.
Please help me understand if I am missing something. Thanks
--
Steve
On Thu, Jan 28, 2010 at 4:16 PM, Steve VanDeBogart
<[email protected]> wrote:
> On Wed, 27 Jan 2010, Minchan Kim wrote:
>
>> This patch's effect happens when the inactive file list is small, I think.
>> It means there is high memory pressure, so if we move ra pages to the
>
> This patch does the same thing regardless of memory pressure - it
> doesn't just apply in high memory pressure situations. Is your concern
> that in high memory pressure situations this patch will make things worse?
Yes.
>
>> head of the inactive list, other applications which urgently require free pages
>> suffer from latency or are killed.
>
> I don't think this patch will affect the number of pages reclaimed, only
> which pages are reclaimed. In extreme cases it could increase the time
Most likely.
But it can affect the number of pages reclaimed at some point.
For example,
scanning window size for reclaim = 5.
P : hard-to-reclaim page
R : readaheaded page(easily reclaimable page)
without this patch
HEAD-P - P - P - P ................ - P - R -R -R -R -R- TAIL
reclaimed pages : 5
with this patch
HEAD-R-R-R-R-R .................... - P -P -P -P -P -P -TAIL
reclaimed pages : 0 => might be OOMed.
Yes, it's a very extreme example;
it is just for explanation. :)
> needed to reclaim that many pages, but the inactive list would have to be
> very short.
I think a short inactive list means we are now suffering from a shortage of
free memory. So I think it would be better to discard ra pages rather than
be OOMed.
>
>> If the VM doesn't have this patch, of course ra pages are discarded and
>> then I/O performance would be bad. But, as I mentioned, that is a time of
>> high memory pressure, so the low I/O performance gives the system
>> natural throttling. It can help the system out of memory pressure.
>
> Even in low memory situations, improving I/O performance can help the
> overall system performance. For example if most of the inactive list is
> dirty, needlessly discarding pages, just to refetch them will clog
> I/O and increase the time needed to write out the dirty pages.
Yes, that's what I said.
But my point is that it throttles system I/O naturally by clogging I/O.
That can prevent fast consumption of memory.
Actually, I think mm shouldn't have to consider I/O throttling as far as possible;
that's the role of the I/O subsystem. But that's not the reality:
there is some code for I/O throttling in mm.
>
>> In summary, I think it's good from the viewpoint of I/O, but I am not sure
>> it's good from the viewpoint of the system.
>
> In this case, I think what's good for I/O is good for the system.
> Please help me understand if I am missing something. Thanks
You didn't miss anything. :)
I don't know how this patch affects the low-memory situation.
What we need is an experiment, which would be valuable.
Wu has caught my concern and is making a new version.
I am looking forward to that.
>
> --
> Steve
>
--
Kind regards,
Minchan Kim
On Tue, Jan 26, 2010 at 09:32:17PM +0800, Wu Fengguang wrote:
> On Mon, Jan 25, 2010 at 03:36:35PM -0700, Chris Frost wrote:
> > I changed Wu's patch to add a PageLRU() guard that I believe is required
> > and optimized zone lock acquisition to only unlock and lock at zone changes.
> > This optimization seems to provide a 10-20% system time improvement for
> > some of my GIMP benchmarks and no improvement for other benchmarks.
>
> > + del_page_from_lru_list(zone, page, lru);
> > + add_page_to_lru_list(zone, page, lru);
> > + }
> > + put_page(page);
I feel very uncomfortable about this put_page() inside zone->lru_lock.
(might deadlock: put_page() conditionally takes zone->lru_lock again)
If you really want the optimization, can we do it like this?
Thanks,
Fengguang
---
readahead: retain inactive lru pages to be accessed soon
From: Chris Frost <[email protected]>
Ensure that cached pages in the inactive list are not prematurely evicted;
move such pages to lru head when they are covered by
- in-kernel heuristic readahead
- a posix_fadvise(POSIX_FADV_WILLNEED) hint from an application
Before this patch, pages already in core may be evicted before the
pages covered by the same prefetch scan but that were not yet in core.
Many small read requests may be forced on the disk because of this behavior.
In particular, posix_fadvise(... POSIX_FADV_WILLNEED) on an in-core page
has no effect on the page's location in the LRU list, even if it is the
next victim on the inactive list.
This change helps address the performance problems we encountered
while modifying SQLite and the GIMP to use large file prefetching.
Overall these prefetching techniques improved the runtime of large
benchmarks by 10-17x for these applications. More in the publication
_Reducing Seek Overhead with Application-Directed Prefetching_ in
USENIX ATC 2009 and at http://libprefetch.cs.ucla.edu/.
Signed-off-by: Chris Frost <[email protected]>
Signed-off-by: Steve VanDeBogart <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Wu Fengguang <[email protected]>
---
mm/readahead.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 52 insertions(+)
--- linux.orig/mm/readahead.c 2010-01-31 21:39:24.000000000 +0800
+++ linux/mm/readahead.c 2010-01-31 22:20:24.000000000 +0800
@@ -9,7 +9,9 @@
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/memcontrol.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -133,6 +135,48 @@ out:
}
/*
+ * The file range is expected to be accessed in near future. Move pages
+ * (possibly in inactive lru tail) to lru head, so that they are retained
+ * in memory for some reasonable time.
+ */
+static void retain_inactive_pages(struct address_space *mapping,
+ pgoff_t index, int len)
+{
+ struct page *grabbed_page;
+ struct page *page;
+ struct zone *zone;
+ int i;
+
+ for (i = 0; i < len; i++) {
+ grabbed_page = page = find_get_page(mapping, index + i);
+ if (!page)
+ continue;
+
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page) &&
+ !PageActive(page) &&
+ !PageUnevictable(page)) {
+ int lru = page_lru_base_type(page);
+
+ for (; i < len; i++) {
+ struct page *p = page;
+
+ if (page->mapping != mapping ||
+ page->index != index + i)
+ break;
+ page = list_to_page(&page->lru);
+ del_page_from_lru_list(zone, p, lru);
+ add_page_to_lru_list(zone, p, lru);
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(grabbed_page);
+ cond_resched();
+ }
+}
+
+/*
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
@@ -184,6 +228,14 @@ __do_page_cache_readahead(struct address
}
/*
+ * Normally readahead will auto stop on cached segments, so we won't
+ * hit many cached pages. If it does happen, bring the inactive pages
+ * adjacent to the newly prefetched ones (if any).
+ */
+ if (ret < nr_to_read)
+ retain_inactive_pages(mapping, offset, page_idx);
+
+ /*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
On Sun, Jan 31, 2010 at 10:31:42PM +0800, Wu Fengguang wrote:
> On Tue, Jan 26, 2010 at 09:32:17PM +0800, Wu Fengguang wrote:
> > On Mon, Jan 25, 2010 at 03:36:35PM -0700, Chris Frost wrote:
> > > I changed Wu's patch to add a PageLRU() guard that I believe is required
> > > and optimized zone lock acquisition to only unlock and lock at zone changes.
> > > This optimization seems to provide a 10-20% system time improvement for
> > > some of my GIMP benchmarks and no improvement for other benchmarks.
>
> I feel very uncomfortable about this put_page() inside zone->lru_lock.
> (might deadlock: put_page() conditionally takes zone->lru_lock again)
>
> If you really want the optimization, can we do it like this?
Sorry that I was slow to respond. (I was out of town.)
Thanks for catching __page_cache_release() locking the zone.
I think staying simple for now sounds good. The below locks
and unlocks the zone for each page. Look good?
---
readahead: retain inactive lru pages to be accessed soon
From: Chris Frost <[email protected]>
Ensure that cached pages in the inactive list are not prematurely evicted;
move such pages to lru head when they are covered by
- in-kernel heuristic readahead
- a posix_fadvise(POSIX_FADV_WILLNEED) hint from an application
Before this patch, pages already in core may be evicted before the
pages covered by the same prefetch scan but that were not yet in core.
Many small read requests may be forced on the disk because of this
behavior.
In particular, posix_fadvise(... POSIX_FADV_WILLNEED) on an in-core page
has no effect on the page's location in the LRU list, even if it is the
next victim on the inactive list.
This change helps address the performance problems we encountered
while modifying SQLite and the GIMP to use large file prefetching.
Overall these prefetching techniques improved the runtime of large
benchmarks by 10-17x for these applications. More in the publication
_Reducing Seek Overhead with Application-Directed Prefetching_ in
USENIX ATC 2009 and at http://libprefetch.cs.ucla.edu/.
Signed-off-by: Chris Frost <[email protected]>
Signed-off-by: Steve VanDeBogart <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Wu Fengguang <[email protected]>
---
readahead.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa23..c615f96 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -9,7 +9,9 @@
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/memcontrol.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -133,6 +135,40 @@ out:
}
/*
+ * The file range is expected to be accessed in near future. Move pages
+ * (possibly in inactive lru tail) to lru head, so that they are retained
+ * in memory for some reasonable time.
+ */
+static void retain_inactive_pages(struct address_space *mapping,
+ pgoff_t index, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ struct page *page;
+ struct zone *zone;
+
+ page = find_get_page(mapping, index + i);
+ if (!page)
+ continue;
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+
+ if (PageLRU(page) &&
+ !PageActive(page) &&
+ !PageUnevictable(page)) {
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(zone, page, lru);
+ add_page_to_lru_list(zone, page, lru);
+ }
+
+ spin_unlock_irq(&zone->lru_lock);
+ put_page(page);
+ }
+}
+
+/*
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
@@ -184,6 +220,14 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
}
/*
+ * Normally readahead will auto stop on cached segments, so we won't
+ * hit many cached pages. If it does happen, bring the inactive pages
+ * adjacent to the newly prefetched ones (if any).
+ */
+ if (ret < nr_to_read)
+ retain_inactive_pages(mapping, offset, page_idx);
+
+ /*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
--
Chris Frost
http://www.frostnet.net/chris/
On Sun, Jan 31, 2010 at 07:06:39PM -0700, Chris Frost wrote:
> On Sun, Jan 31, 2010 at 10:31:42PM +0800, Wu Fengguang wrote:
> > On Tue, Jan 26, 2010 at 09:32:17PM +0800, Wu Fengguang wrote:
> > > On Mon, Jan 25, 2010 at 03:36:35PM -0700, Chris Frost wrote:
> > > > I changed Wu's patch to add a PageLRU() guard that I believe is required
> > > > and optimized zone lock acquisition to only unlock and lock at zone changes.
> > > > This optimization seems to provide a 10-20% system time improvement for
> > > > some of my GIMP benchmarks and no improvement for other benchmarks.
> >
> > I feel very uncomfortable about this put_page() inside zone->lru_lock.
> > (might deadlock: put_page() conditionally takes zone->lru_lock again)
> >
> > If you really want the optimization, can we do it like this?
>
> Sorry that I was slow to respond. (I was out of town.)
>
> Thanks for catching __page_cache_release() locking the zone.
> I think staying simple for now sounds good. The below locks
> and unlocks the zone for each page. Look good?
OK :)
Thanks,
Fengguang
> ---
> readahead: retain inactive lru pages to be accessed soon
> From: Chris Frost <[email protected]>
>
> Ensure that cached pages in the inactive list are not prematurely evicted;
> move such pages to lru head when they are covered by
> - in-kernel heuristic readahead
> - a posix_fadvise(POSIX_FADV_WILLNEED) hint from an application
>
> Before this patch, pages already in core may be evicted before the
> pages covered by the same prefetch scan but that were not yet in core.
> Many small read requests may be forced on the disk because of this
> behavior.
>
> In particular, posix_fadvise(... POSIX_FADV_WILLNEED) on an in-core page
> has no effect on the page's location in the LRU list, even if it is the
> next victim on the inactive list.
>
> This change helps address the performance problems we encountered
> while modifying SQLite and the GIMP to use large file prefetching.
> Overall these prefetching techniques improved the runtime of large
> benchmarks by 10-17x for these applications. More in the publication
> _Reducing Seek Overhead with Application-Directed Prefetching_ in
> USENIX ATC 2009 and at http://libprefetch.cs.ucla.edu/.
>
> Signed-off-by: Chris Frost <[email protected]>
> Signed-off-by: Steve VanDeBogart <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> Signed-off-by: Wu Fengguang <[email protected]>
> ---
> readahead.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 44 insertions(+)
>
> diff --git a/mm/readahead.c b/mm/readahead.c
> index aa1aa23..c615f96 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -9,7 +9,9 @@
>
> #include <linux/kernel.h>
> #include <linux/fs.h>
> +#include <linux/memcontrol.h>
> #include <linux/mm.h>
> +#include <linux/mm_inline.h>
> #include <linux/module.h>
> #include <linux/blkdev.h>
> #include <linux/backing-dev.h>
> @@ -133,6 +135,40 @@ out:
> }
>
> /*
> + * The file range is expected to be accessed in near future. Move pages
> + * (possibly in inactive lru tail) to lru head, so that they are retained
> + * in memory for some reasonable time.
> + */
> +static void retain_inactive_pages(struct address_space *mapping,
> + pgoff_t index, int len)
> +{
> + int i;
> +
> + for (i = 0; i < len; i++) {
> + struct page *page;
> + struct zone *zone;
> +
> + page = find_get_page(mapping, index + i);
> + if (!page)
> + continue;
> + zone = page_zone(page);
> + spin_lock_irq(&zone->lru_lock);
> +
> + if (PageLRU(page) &&
> + !PageActive(page) &&
> + !PageUnevictable(page)) {
> + int lru = page_lru_base_type(page);
> +
> + del_page_from_lru_list(zone, page, lru);
> + add_page_to_lru_list(zone, page, lru);
> + }
> +
> + spin_unlock_irq(&zone->lru_lock);
> + put_page(page);
> + }
> +}
> +
> +/*
> * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
> * the pages first, then submits them all for I/O. This avoids the very bad
> * behaviour which would occur if page allocations are causing VM writeback.
> @@ -184,6 +220,14 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
> }
>
> /*
> + * Normally readahead will auto stop on cached segments, so we won't
> + * hit many cached pages. If it does happen, bring the inactive pages
> + * adjacent to the newly prefetched ones (if any).
> + */
> + if (ret < nr_to_read)
> + retain_inactive_pages(mapping, offset, page_idx);
> +
> + /*
> * Now start the IO. We ignore I/O errors - if the page is not
> * uptodate then the caller will launch readpage again, and
> * will then handle the error.
>
> --
> Chris Frost
> http://www.frostnet.net/chris/
On Mon, Feb 01, 2010 at 10:17:03AM +0800, Wu Fengguang wrote:
> On Sun, Jan 31, 2010 at 07:06:39PM -0700, Chris Frost wrote:
> > Look good?
>
> OK :)
Great!
Do you have a feel for when you expect to have your tree ready for review?
We'd love to see this patch released soon; applications can start to use
libprefetch when this change is generally available.
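For anyone who wants to try it, the application side is just a
posix_fadvise() call on the range it is about to read. A minimal sketch
(illustrative only, not code from libprefetch):

	#define _XOPEN_SOURCE 600
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>

	int main(int argc, char **argv)
	{
		off_t len = 64 << 20;	/* hint the first 64MB, for example */
		int fd, err;

		if (argc < 2) {
			fprintf(stderr, "usage: %s FILE\n", argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* ask the kernel to read the range in; with this patch the
		 * hint also keeps already-cached pages from early eviction */
		err = posix_fadvise(fd, 0, len, POSIX_FADV_WILLNEED);
		if (err)
			fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
		/* ... the application then reads the range as it likes ... */
		return 0;
	}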
--
Chris Frost
http://www.frostnet.net/chris/