Message-Id: <20051109141459.901942000@localhost.localdomain>
References: <20051109134938.757187000@localhost.localdomain>
Date: Wed, 09 Nov 2005 21:49:44 +0800
From: Wu Fengguang
To: linux-kernel@vger.kernel.org
Cc: Andrew Morton, Wu Fengguang
Subject: [PATCH 06/16] readahead: call scheme
Content-Disposition: inline; filename=readahead-call-scheme.patch

A new page flag, PG_readahead, is introduced as a look-ahead mark.
The look-ahead mark corresponds to `ahead_start' of the current logic.

The read-ahead logic is called when
	- a read reaches a look-ahead mark;
	- a read hits a non-present page.

ra_access() is called on every page reference to maintain the cache_hit
counter.

This scheme has the following benefits:
	- it makes all stateful/stateless methods happy;
	- it eliminates the cache hit problem naturally;
	- it lives in harmony with application-managed read-aheads via
	  fadvise/madvise.

Signed-off-by: Wu Fengguang
---

 include/linux/mm.h         |    7 +
 include/linux/page-flags.h |    5 +
 mm/filemap.c               |   66 ++++++++++++++---
 mm/readahead.c             |  172 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 239 insertions(+), 11 deletions(-)

--- linux-2.6.14-mm1.orig/include/linux/page-flags.h
+++ linux-2.6.14-mm1/include/linux/page-flags.h
@@ -77,6 +77,7 @@
 #define PG_nosave_free		18	/* Free, should not be written */
 #define PG_uncached		19	/* Page has been mapped as uncached */
 #define PG_activate		20	/* delayed activate */
+#define PG_readahead		21	/* check readahead when reading this page */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -315,6 +316,10 @@ extern void __mod_page_state(unsigned lo
 #define TestClearPageActivate(page)	test_and_clear_bit(PG_activate, &(page)->flags)
 #define TestSetPageActivate(page)	test_and_set_bit(PG_activate, &(page)->flags)
 
+#define PageReadahead(page)	test_bit(PG_readahead, &(page)->flags)
+#define SetPageReadahead(page)	set_bit(PG_readahead, &(page)->flags)
+#define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
--- linux-2.6.14-mm1.orig/include/linux/mm.h
+++ linux-2.6.14-mm1/include/linux/mm.h
@@ -985,6 +985,13 @@ unsigned long page_cache_readahead(struc
 void handle_ra_miss(struct address_space *mapping,
 		    struct file_ra_state *ra, pgoff_t offset);
 unsigned long max_sane_readahead(unsigned long nr);
+unsigned long
+page_cache_readahead_adaptive(struct address_space *mapping,
+			struct file_ra_state *ra, struct file *filp,
+			struct page *prev_page, struct page *page,
+			pgoff_t first_index,
+			pgoff_t index, pgoff_t last_index);
+void fastcall ra_access(struct file_ra_state *ra, struct page *page);
 
 /* Do stack extension */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
--- linux-2.6.14-mm1.orig/mm/filemap.c
+++ linux-2.6.14-mm1/mm/filemap.c
@@ -724,6 +724,8 @@ grab_cache_page_nowait(struct address_sp
 
 EXPORT_SYMBOL(grab_cache_page_nowait);
 
+extern int readahead_ratio;
+
 /*
  * This is a generic file read routine, and uses the
  * mapping->a_ops->readpage() function for the actual low-level
@@ -751,10 +753,12 @@ void do_generic_mapping_read(struct addr
 	unsigned long prev_index;
 	loff_t isize;
 	struct page *cached_page;
+	struct page *prev_page;
 	int error;
 	struct file_ra_state ra = *_ra;
 
 	cached_page = NULL;
+	prev_page = NULL;
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	next_index = index;
 	prev_index = ra.prev_page;
@@ -783,16 +787,36 @@ void do_generic_mapping_read(struct addr
 		nr = nr - offset;
 
 		cond_resched();
-		if (index == next_index)
+
+		if (readahead_ratio <= 9 && index == next_index)
 			next_index = page_cache_readahead(mapping, &ra, filp,
 					index, last_index - index);
 
 find_page:
 		page = find_get_page(mapping, index);
+		if (readahead_ratio > 9) {
+			if (unlikely(page == NULL)) {
+				page_cache_readahead_adaptive(mapping, &ra,
+						filp, prev_page, NULL,
+						*ppos >> PAGE_CACHE_SHIFT,
+						index, last_index);
+				page = find_get_page(mapping, index);
+			} else if (PageReadahead(page)) {
+				page_cache_readahead_adaptive(mapping, &ra,
+						filp, prev_page, page,
+						*ppos >> PAGE_CACHE_SHIFT,
+						index, last_index);
+			}
+		}
 		if (unlikely(page == NULL)) {
-			handle_ra_miss(mapping, &ra, index);
+			if (readahead_ratio <= 9)
+				handle_ra_miss(mapping, &ra, index);
 			goto no_cached_page;
 		}
+		if (prev_page)
+			page_cache_release(prev_page);
+		prev_page = page;
+		ra_access(&ra, page);
 		if (!PageUptodate(page))
 			goto page_not_up_to_date;
 page_ok:
@@ -808,8 +832,9 @@ page_ok:
 		 * When (part of) the same page is read multiple times
 		 * in succession, only mark it as accessed the first time.
 		 */
-		if (prev_index != index)
+		if (prev_index != index) {
 			mark_page_accessed(page);
+		}
 		prev_index = index;
 
 		/*
@@ -827,7 +852,6 @@ page_ok:
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
 
-		page_cache_release(page);
 		if (ret == nr && desc->count)
 			continue;
 		goto out;
@@ -839,7 +863,6 @@ page_not_up_to_date:
 		/* Did it get unhashed before we got the lock? */
 		if (!page->mapping) {
 			unlock_page(page);
-			page_cache_release(page);
 			continue;
 		}
@@ -864,7 +887,6 @@ readpage:
 			 * invalidate_inode_pages got it
 			 */
 			unlock_page(page);
-			page_cache_release(page);
 			goto find_page;
 		}
 		unlock_page(page);
@@ -885,7 +907,6 @@ readpage:
 		isize = i_size_read(inode);
 		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 		if (unlikely(!isize || index > end_index)) {
-			page_cache_release(page);
 			goto out;
 		}
@@ -894,7 +915,6 @@ readpage:
 		if (index == end_index) {
 			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 			if (nr <= offset) {
-				page_cache_release(page);
 				goto out;
 			}
 		}
@@ -904,7 +924,6 @@ readpage:
 readpage_error:
 		/* UHHUH! A synchronous read error occurred. Report it */
 		desc->error = error;
-		page_cache_release(page);
 		goto out;
 
 no_cached_page:
@@ -929,15 +948,22 @@ no_cached_page:
 		}
 		page = cached_page;
 		cached_page = NULL;
+		if (prev_page)
+			page_cache_release(prev_page);
+		prev_page = page;
 		goto readpage;
 	}
 
 out:
 	*_ra = ra;
+	if (readahead_ratio > 9)
+		_ra->prev_page = prev_index;
 
 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
 	if (cached_page)
 		page_cache_release(cached_page);
+	if (prev_page)
+		page_cache_release(prev_page);
 	if (filp)
 		file_accessed(filp);
 }
@@ -1235,19 +1261,33 @@ retry_all:
 	 *
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
-	if (VM_SequentialReadHint(area))
+	if (readahead_ratio <= 9 && VM_SequentialReadHint(area))
 		page_cache_readahead(mapping, ra, file, pgoff, 1);
 
+
 	/*
 	 * Do we have something in the page cache already?
 	 */
 retry_find:
 	page = find_get_page(mapping, pgoff);
+	if (VM_SequentialReadHint(area) && readahead_ratio > 9) {
+		if (!page) {
+			page_cache_readahead_adaptive(mapping, ra,
+					file, NULL, NULL,
+					pgoff, pgoff, pgoff + 1);
+			page = find_get_page(mapping, pgoff);
+		} else if (PageReadahead(page)) {
+			page_cache_readahead_adaptive(mapping, ra,
+					file, NULL, page,
+					pgoff, pgoff, pgoff + 1);
+		}
+	}
 	if (!page) {
 		unsigned long ra_pages;
 
 		if (VM_SequentialReadHint(area)) {
-			handle_ra_miss(mapping, ra, pgoff);
+			if (readahead_ratio <= 9)
+				handle_ra_miss(mapping, ra, pgoff);
 			goto no_cached_page;
 		}
 		ra->mmap_miss++;
@@ -1284,6 +1324,8 @@ retry_find:
 	if (!did_readaround)
 		ra->mmap_hit++;
 
+	ra_access(ra, page);
+
 	/*
 	 * Ok, found a page in the page cache, now we need to check
 	 * that it's up-to-date.
@@ -1298,6 +1340,8 @@ success:
 	mark_page_accessed(page);
 	if (type)
 		*type = majmin;
+	if (readahead_ratio > 9)
+		ra->prev_page = page->index;
 	return page;
 
 outside_data_content:
--- linux-2.6.14-mm1.orig/mm/readahead.c
+++ linux-2.6.14-mm1/mm/readahead.c
@@ -20,6 +20,43 @@
 #define MAX_RA_PAGES	KB(VM_MAX_READAHEAD)
 #define MIN_RA_PAGES	KB(VM_MIN_READAHEAD)
 
+/* Detailed classification of read-ahead behaviors. */
+#define RA_CLASS_SHIFT 3
+#define RA_CLASS_MASK  ((1 << RA_CLASS_SHIFT) - 1)
+enum ra_class {
+	RA_CLASS_ALL,
+	RA_CLASS_NEWFILE,
+	RA_CLASS_STATE,
+	RA_CLASS_CONTEXT,
+	RA_CLASS_CONTEXT_ACCELERATED,
+	RA_CLASS_BACKWARD,
+	RA_CLASS_RANDOM_THRASHING,
+	RA_CLASS_RANDOM_SEEK,
+	RA_CLASS_END,
+};
+
+/* Read-ahead events to be accounted. */
+enum ra_event {
+	RA_EVENT_CACHE_MISS,		/* read cache misses */
+	RA_EVENT_READRANDOM,		/* random reads */
+	RA_EVENT_IO_CONGESTION,		/* io congestion */
+	RA_EVENT_IO_CACHE_HIT,		/* canceled io due to cache hit */
+	RA_EVENT_IO_BLOCK,		/* read on locked page */
+
+	RA_EVENT_READAHEAD,		/* read-ahead issued */
+	RA_EVENT_READAHEAD_HIT,		/* read-ahead page hit */
+	RA_EVENT_LOOKAHEAD,		/* look-ahead issued */
+	RA_EVENT_LOOKAHEAD_HIT,		/* look-ahead mark hit */
+	RA_EVENT_LOOKAHEAD_NOACTION,	/* look-ahead mark ignored */
+	RA_EVENT_READAHEAD_EOF,		/* read-ahead reaches EOF */
+	RA_EVENT_READAHEAD_SHRINK,	/* ra_size decreased, reflects var. */
+	RA_EVENT_READAHEAD_THRASHING,	/* read-ahead thrashing happened */
+	RA_EVENT_READAHEAD_MUTILATE,	/* read-ahead request mutilated */
+	RA_EVENT_READAHEAD_RESCUE,	/* read-ahead rescued */
+
+	RA_EVENT_END
+};
+
 /*
  * Debug facilities.
  */
@@ -310,9 +347,11 @@ __do_page_cache_readahead(struct address
 		page = radix_tree_lookup(&mapping->page_tree, page_offset);
 		if (page) {
+#ifdef READAHEAD_STREAMING
 			if (readahead_ratio > 9 &&
 					page_idx == nr_to_read - lookahead_size)
 				SetPageReadahead(page);
+#endif
 			continue;
 		}
@@ -730,3 +769,136 @@ out:
 
 	return nr_pages ? index : 0;
 }
+
+/*
+ * This is the entry point of the adaptive read-ahead logic.
+ *
+ * It is only called on two conditions:
+ * 1. page == NULL
+ *    A cache miss happened; it can be either a random read or a sequential one.
+ * 2. page != NULL
+ *    There is a look-ahead mark (PG_readahead) from a previous sequential read.
+ *    It's time to do some checking and submit the next read-ahead IO.
+ *
+ * That has the merits of:
+ * - makes all stateful/stateless methods happy;
+ * - eliminates the cache hit problem naturally;
+ * - lives in harmony with application-managed read-aheads via fadvise/madvise.
+ */
+unsigned long
+page_cache_readahead_adaptive(struct address_space *mapping,
+			struct file_ra_state *ra, struct file *filp,
+			struct page *prev_page, struct page *page,
+			pgoff_t begin_index,
+			pgoff_t index, pgoff_t end_index)
+{
+	unsigned long size;
+	unsigned long ra_min;
+	unsigned long ra_max;
+	int ret;
+
+	if (page) {
+		if (!TestClearPageReadahead(page))
+			return 0;
+		if (bdi_read_congested(mapping->backing_dev_info))
+			return 0;
+	}
+
+	if (page)
+		ra_account(ra, RA_EVENT_LOOKAHEAD_HIT,
+				ra->readahead_index - ra->lookahead_index);
+	else if (index)
+		ra_account(ra, RA_EVENT_CACHE_MISS, end_index - begin_index);
+
+	size = end_index - index;
+	get_readahead_bounds(ra, &ra_min, &ra_max);
+
+	/* readahead disabled? */
+	if (unlikely(!ra_min || !readahead_ratio)) {
+		size = max_sane_readahead(size);
+		goto readit;
+	}
+
+	/*
+	 * Start of file.
+	 */
+	if (index == 0)
+		return newfile_readahead(mapping, filp, ra, end_index, ra_min);
+
+	/*
+	 * State based sequential read-ahead.
+	 */
+	if ((readahead_ratio % 5) == 0 &&
+			index == ra->lookahead_index &&
+			(page || index == ra->readahead_index) &&
+			(ra_cache_hit_ok(ra) ||
+				end_index - begin_index >= ra_max))
+		return state_based_readahead(mapping, filp, ra, page, ra_max);
+
+	/*
+	 * Backward read-ahead.
+	 */
+	if (try_read_backward(ra, begin_index, end_index, size, ra_min, ra_max))
+		return ra_dispatch(ra, mapping, filp);
+
+	/*
+	 * Context based sequential read-ahead.
+	 */
+	ret = try_context_based_readahead(mapping, ra, prev_page, page,
+						index, ra_min, ra_max);
+	if (ret > 0)
+		return ra_dispatch(ra, mapping, filp);
+	if (ret < 0)
+		return 0;
+
+	/* No action on look ahead time? */
+	if (page) {
+		ra_account(ra, RA_EVENT_LOOKAHEAD_NOACTION,
+				ra->readahead_index - index);
+		return 0;
+	}
+
+	/*
+	 * Random read that follows a sequential one.
+	 */
+	if (try_random_readahead(ra, index, size, ra_max))
+		return ra_dispatch(ra, mapping, filp);
+
+	/*
+	 * Random read.
+	 */
+	if (size > ra_max)
+		size = ra_max;
+
+readit:
+	size = __do_page_cache_readahead(mapping, filp, index, size, 0);
+
+	ra_account(ra, RA_EVENT_READRANDOM, size);
+	dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n",
+			mapping->host->i_ino, mapping->nrpages,
+			begin_index, index, end_index, size);
+
+	return size;
+}
+
+/*
+ * ra_access() is called on every page reference to maintain the
+ * cache_hit counter.
+ */
+void fastcall ra_access(struct file_ra_state *ra, struct page *page)
+{
+	if (page->flags & ((1 << PG_active) |
+				(1 << PG_activate) |
+				(1 << PG_referenced)))
+		return;
+
+	if (!ra_has_index(ra, page->index))
+		return;
+
+	ra->cache_hit++;
+
+	if (page->index >= ra->ra_index)
+		ra_account(ra, RA_EVENT_READAHEAD_HIT, 1);
+	else
+		ra_account(ra, RA_EVENT_READAHEAD_HIT, -1);
+}
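
[Editorial note, not part of the patch] For readers who want the call scheme
at a glance without walking the diff, here is a minimal user-space sketch of
the two trigger points the changelog describes: readahead is invoked on a
cache miss, or when a page carrying the PG_readahead look-ahead mark is
referenced. The names lookup_page(), do_readahead() and the two-field
struct page below are illustrative stand-ins, not kernel interfaces; the real
entry points are page_cache_readahead_adaptive() and ra_access() above.

/*
 * Simplified model of the adaptive readahead call scheme (user space).
 */
#include <stdbool.h>
#include <stdio.h>

struct page {
	bool present;		/* page is in the "page cache" */
	bool readahead_mark;	/* models PG_readahead */
};

/* Stand-in for find_get_page(): NULL means a cache miss. */
static struct page *lookup_page(struct page *cache, unsigned long nr,
				unsigned long index)
{
	if (index >= nr || !cache[index].present)
		return NULL;
	return &cache[index];
}

/* Stand-in for page_cache_readahead_adaptive(): populate a window. */
static void do_readahead(struct page *cache, unsigned long nr,
			 unsigned long index, unsigned long count)
{
	unsigned long i;

	printf("readahead: %lu pages from index %lu\n", count, index);
	for (i = index; i < index + count && i < nr; i++)
		cache[i].present = true;
	/* set a look-ahead mark inside the new window */
	if (index + count / 2 < nr)
		cache[index + count / 2].readahead_mark = true;
}

/* Models one page reference from do_generic_mapping_read(). */
static void read_page(struct page *cache, unsigned long nr, unsigned long index)
{
	struct page *page = lookup_page(cache, nr, index);

	if (!page) {
		/* cache miss: synchronous readahead, then look the page up again */
		do_readahead(cache, nr, index, 8);
		page = lookup_page(cache, nr, index);
	} else if (page->readahead_mark) {
		/* look-ahead mark hit: clear it, submit the next readahead */
		page->readahead_mark = false;	/* TestClearPageReadahead() */
		do_readahead(cache, nr, index, 8);
	}
	/* ra_access()-style hit accounting would happen on every reference here */
}

int main(void)
{
	struct page cache[64] = { { false, false } };
	unsigned long i;

	for (i = 0; i < 20; i++)	/* a short sequential read */
		read_page(cache, 64, i);
	return 0;
}

Running the sketch over a short sequential scan shows the intended cadence:
one synchronous readahead on the initial miss, then further readaheads
triggered each time the look-ahead mark is crossed, with no additional
cache-miss stalls in between.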