Date: Fri, 24 Sep 2004 17:55:59 -0500
From: Steven Pratt
To: Andrew Morton
CC: linux-kernel@vger.kernel.org
Subject: Re: [PATCH/RFC] Simplified Readahead
In-Reply-To: <20040924150523.4853465b.akpm@osdl.org>

Oops, here is the actual new patch, with the shut-off for files that are
already in the page cache.

--- linux-2.6.9-rc2/include/linux/mm.h.org	2004-09-17 14:05:27.000000000 -0500
+++ linux-2.6.9-rc2/include/linux/mm.h	2004-09-24 11:40:53.000000000 -0500
@@ -722,15 +722,18 @@
 /* readahead.c */
 #define VM_MAX_READAHEAD	128	/* kbytes */
 #define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
+#define VM_MAX_CACHE_HIT	2560	/* max pages in a row in cache before
+					 * turning readahead off */
 
 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read);
 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read);
-void page_cache_readahead(struct address_space *mapping,
+unsigned long page_cache_readahead(struct address_space *mapping,
 			  struct file_ra_state *ra,
 			  struct file *filp,
-			  unsigned long offset);
+			  unsigned long offset,
+			  unsigned long size);
 void handle_ra_miss(struct address_space *mapping,
 		    struct file_ra_state *ra, pgoff_t offset);
 unsigned long max_sane_readahead(unsigned long nr);
--- linux-2.6.9-rc2/include/linux/fs.h.org	2004-09-17 14:05:47.000000000 -0500
+++ linux-2.6.9-rc2/include/linux/fs.h	2004-09-24 10:56:17.000000000 -0500
@@ -557,16 +557,17 @@
 struct file_ra_state {
 	unsigned long start;		/* Current window */
 	unsigned long size;
-	unsigned long next_size;	/* Next window size */
+	unsigned long flags;		/* ra flags RA_FLAG_xxx */
+	unsigned long cache_hit;	/* cache hit count */
 	unsigned long prev_page;	/* Cache last read() position */
 	unsigned long ahead_start;	/* Ahead window */
 	unsigned long ahead_size;
-	unsigned long currnt_wnd_hit;	/* locality in the current window */
-	unsigned long average;		/* size of next current window */
 	unsigned long ra_pages;		/* Maximum readahead window */
 	unsigned long mmap_hit;		/* Cache hit stat for mmap accesses */
 	unsigned long mmap_miss;	/* Cache miss stat for mmap accesses */
 };
+#define RA_FLAG_MISS	0x01	/* a cache miss occurred against this file */
+#define RA_FLAG_INCACHE	0x02	/* file is already in cache */
 
 struct file {
 	struct list_head	f_list;
--- linux-2.6.9-rc2/mm/readahead.org.c	2004-09-16 16:02:05.000000000 -0500
+++ linux-2.6.9-rc2/mm/readahead.c	2004-09-24 15:25:44.000000000 -0500
@@ -35,7 +35,6 @@
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 {
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
-	ra->average = ra->ra_pages / 2;
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);
 
@@ -52,6 +51,62 @@
 	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 }
 
+static inline void ra_off(struct file_ra_state *ra)
+{
+	ra->start = 0;
+	ra->flags = 0;
+	ra->size = -1UL;
+	ra->ahead_start = 0;
+	ra->ahead_size = 0;
+	return;
+}
+
+/*
+ * Set the initial window size: round the request size up to the next
+ * power of 2, then square it for small sizes, use max/4 for medium
+ * sizes and max itself for large ones.  For a 128k (32 page) max ra:
+ * 1-8 page reads = 32k initial window, > 8 page reads = 128k initial.
+ */
+unsigned long get_init_ra_size(unsigned long size, unsigned long max,
+			       unsigned long min)
+{
+	unsigned long s_size = 1, newsize;
+	do {
+		s_size = s_size << 1;
+	} while ((size = size >> 1));
+	if (s_size <= max / 64) {
+		newsize = s_size * s_size;
+	} else if (s_size <= max / 4) {
+		newsize = max / 4;
+	} else {
+		newsize = max;
+	}
+	return newsize;
+}
+
+/*
+ * Set the new window size.
+ * This is called only when I/O is to be submitted, not for each call
+ * to readahead.  If a cache miss occurred, reduce the next I/O size,
+ * else increase it depending on how close to max we are.
+ */
+unsigned long get_next_ra_size(unsigned long cur, unsigned long max,
+			       unsigned long min, unsigned long *flags)
+{
+	unsigned long newsize;
+	if (*flags & RA_FLAG_MISS) {
+		newsize = max((cur - 2), min);
+		*flags &= ~RA_FLAG_MISS;
+	} else if (cur < max / 16) {
+		newsize = 4 * cur;
+	} else {
+		newsize = 2 * cur;
+	}
+	newsize = min(newsize, max_sane_readahead(max));
+	return newsize;
+}
+
+
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
 
 /**
@@ -152,19 +207,17 @@
  * ahead_size:  Together, these form the "ahead window".
  * ra_pages:    The externally controlled max readahead for this fd.
  *
- * When readahead is in the "maximally shrunk" state (next_size == -1UL),
- * readahead is disabled.  In this state, prev_page and size are used, inside
- * handle_ra_miss(), to detect the resumption of sequential I/O.  Once there
- * has been a decent run of sequential I/O (defined by get_min_readahead),
- * readahead is reenabled.
+ * When readahead is in the off state (size == -1UL),
+ * readahead is disabled.  In this state, prev_page is used
+ * to detect the resumption of sequential I/O.
 *
 * The readahead code manages two windows - the "current" and the "ahead"
 * windows.  The intent is that while the application is walking the pages
 * in the current window, I/O is underway on the ahead window.  When the
 * current window is fully traversed, it is replaced by the ahead window
 * and the ahead window is invalidated.  When this copying happens, the
- * new current window's pages are probably still locked.  When I/O has
- * completed, we submit a new batch of I/O, creating a new ahead window.
+ * new current window's pages are probably still locked.  So
+ * we submit a new batch of I/O immediately, creating a new ahead window.
 *
 * So:
 *
@@ -176,34 +229,25 @@
 *   ahead window.
 *
 * A `readahead hit' occurs when a read request is made against a page which is
- * inside the current window.  Hits are good, and the window size (next_size)
- * is grown aggressively when hits occur.  Two pages are added to the next
- * window size on each hit, which will end up doubling the next window size by
- * the time I/O is submitted for it.
- *
- * If readahead hits are more sparse (say, the application is only reading
- * every second page) then the window will build more slowly.
- *
- * On a readahead miss (the application seeked away) the readahead window is
- * shrunk by 25%.  We don't want to drop it too aggressively, because it is a
- * good assumption that an application which has built a good readahead window
- * will continue to perform linear reads.  Either at the new file position, or
- * at the old one after another seek.
+ * the next sequential page.  Ahead window calculations are done only when it
+ * is time to submit a new I/O.  The code ramps up the size aggressively at
+ * first, but slows down as it approaches max_readahead.
- *
- * After enough misses, readahead is fully disabled. (next_size = -1UL).
+ * Any seek/random I/O will result in readahead being turned off.  It will
+ * resume at the first sequential access.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to half of the
- * device maximum.
+ * read is about to happen and the window is immediately set to the initial
+ * size based on the I/O request size and max_readahead.
 *
 * A page request at (start + size) is not a miss at all - it's just a part of
 * sequential file reading.
 *
- * This function is to be called for every page which is read, rather than when
- * it is time to perform readahead.  This is so the readahead algorithm can
- * centrally work out the access patterns.  This could be costly with many tiny
- * read()s, so we specifically optimise for that case with prev_page.
+ * This function is to be called for every read request, rather than when
+ * it is time to perform readahead.  It is called only once for the entire I/O,
+ * regardless of size, unless readahead is unable to start enough I/O to
+ * satisfy the request (I/O request > max_readahead).
 */

/*
@@ -212,7 +256,7 @@
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
- * Returns the number of pages which actually had IO started against them.
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
 */
 static inline int
 __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -301,266 +345,160 @@
 }
 
 /*
- * This version skips the IO if the queue is read-congested, and will tell the
- * block layer to abandon the readahead if request allocation would block.
- *
- * force_page_cache_readahead() will ignore queue congestion and will block on
- * request queues.
- */
-int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
-			unsigned long offset, unsigned long nr_to_read)
-{
-	if (!bdi_read_congested(mapping->backing_dev_info))
-		return __do_page_cache_readahead(mapping, filp,
-						offset, nr_to_read);
-	return 0;
-}
-
-/*
 * Check how effective readahead is being.  If the amount of started IO is
 * less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping.  Shrink the window.
+ * readahead isn't helping.
 *
- * But don't shrink it too much - the application may read the same page
- * occasionally.
 */
 static inline void
-check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
-			pgoff_t actual, pgoff_t orig_next_size)
+check_ra_success(struct file_ra_state *ra, unsigned long nr_to_read,
+		 unsigned long actual)
 {
 	if (actual == 0) {
-		if (orig_next_size > 1) {
-			ra->next_size = orig_next_size - 1;
-			if (ra->ahead_size)
-				ra->ahead_size = ra->next_size;
-		} else {
-			ra->next_size = -1UL;
-			ra->size = 0;
+		ra->cache_hit += nr_to_read;
+		if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
+			ra->flags |= RA_FLAG_INCACHE;
+			ra_off(ra);
 		}
+	} else {
+		ra->cache_hit = 0;
 	}
+	return;
+}
+
+/*
+ * Issue the I/O.  If pages are already in cache, increment the hit count
+ * until we exceed max, then turn RA off until we start missing again.
+ */
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read)
+{
+	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
 }
+
+
+
 /*
 * page_cache_readahead is the main function.  If performs the adaptive
 * readahead window size management and submits the readahead I/O.
 */
-void
+unsigned long
 page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
-		     struct file *filp, unsigned long offset)
+		     struct file *filp, unsigned long offset,
+		     unsigned long req_size)
 {
-	unsigned max;
-	unsigned orig_next_size;
-	unsigned actual;
-	int first_access=0;
-	unsigned long average;
+	unsigned long max, min, maxsane, newsize = req_size;
+	unsigned long actual = 0;
 
 	/*
 	 * Here we detect the case where the application is performing
 	 * sub-page sized reads.  We avoid doing extra work and bogusly
 	 * perturbing the readahead window expansion logic.
-	 * If next_size is zero, this is the very first read for this
-	 * file handle, or the window is maximally shrunk.
+	 * If size is zero, there is no readahead window, so we need one.
	 */
-	if (offset == ra->prev_page) {
-		if (ra->next_size != 0)
-			goto out;
+	if (offset == ra->prev_page && req_size == 1 && ra->size != 0) {
+		goto out;
 	}
 
-	if (ra->next_size == -1UL)
-		goto out;	/* Maximally shrunk */
-
 	max = get_max_readahead(ra);
-	if (max == 0)
-		goto out;	/* No readahead */
+	min = get_min_readahead(ra);
+	maxsane = max_sane_readahead(max);
+	newsize = min(req_size, maxsane);
+
+	if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) {
+		newsize = 1;
+		goto out;	/* No readahead or file already in cache */
+	}
 
-	orig_next_size = ra->next_size;
-
-	if (ra->next_size == 0) {
-		/*
-		 * Special case - first read.
-		 * We'll assume it's a whole-file read, and
-		 * grow the window fast.
-		 */
-		first_access=1;
-		ra->next_size = max / 2;
-		ra->prev_page = offset;
-		ra->currnt_wnd_hit++;
-		goto do_io;
-	}
-
-	ra->prev_page = offset;
-
-	if (offset >= ra->start && offset <= (ra->start + ra->size)) {
-		/*
-		 * A readahead hit.  Either inside the window, or one
-		 * page beyond the end.  Expand the next readahead size.
-		 */
-		ra->next_size += 2;
-
-		if (ra->currnt_wnd_hit <= (max * 2))
-			ra->currnt_wnd_hit++;
-	} else {
-		/*
-		 * A miss - lseek, pagefault, pread, etc.  Shrink the readahead
-		 * window.
+	/*
+	 * Special case - first read, at the start of the file, or the
+	 * first sequential access after readahead was turned off:
+	 * assume a linear read and grow the window fast.
+	 */
+	if ((ra->size == 0 && offset == 0)	/* first IO, start of file */
+	    || (ra->size == -1UL && ra->prev_page == offset - 1)) { /* 1st seq */
+		ra->prev_page = offset + newsize - 1;
+		ra->size = get_init_ra_size(newsize, max, min);
+		ra->start = offset;
+		actual = do_page_cache_readahead(mapping, filp, offset, ra->size);
+		check_ra_success(ra, ra->size, actual);
+		/* If the request size is larger than our max readahead, we
+		 * at least want to be sure that we get 2 IOs in flight and
+		 * we know that we will definitely need the new I/O.  Once
+		 * we do this, subsequent calls should be able to overlap IOs,
+		 * preventing stalls.  So issue the ahead window immediately. */
-		ra->next_size -= 2;
-
-		average = ra->average;
-		if (average < ra->currnt_wnd_hit) {
-			average++;
+		if (req_size >= max) {
+			ra->ahead_size = get_next_ra_size(ra->size, max, min, &ra->flags);
+			ra->ahead_start = ra->start + ra->size;
+			actual = do_page_cache_readahead(mapping, filp,
+					ra->ahead_start, ra->ahead_size);
+			check_ra_success(ra, ra->ahead_size, actual);
 		}
-		ra->average = (average + ra->currnt_wnd_hit) / 2;
-		ra->currnt_wnd_hit = 1;
+		goto out;
 	}
 
-	if ((long)ra->next_size > (long)max)
-		ra->next_size = max;
-	if ((long)ra->next_size <= 0L) {
-		ra->next_size = -1UL;
-		ra->size = 0;
-		goto out;		/* Readahead is off */
+	/* Now handle the random case:
+	 * partial-page reads and first access were handled above,
+	 * so if this is not the next sequential page, it is random.
+	 */
+	if ((offset != (ra->prev_page + 1) || (ra->size == 0))) {
+		ra_off(ra);
+		ra->prev_page = offset + newsize - 1;
+		actual = do_page_cache_readahead(mapping, filp, offset, newsize);
+		check_ra_success(ra, newsize, actual);
+		goto out;
 	}
 
-	/*
-	 * Is this request outside the current window?
+	/* If we get here we are doing sequential IO and this was
+	 * not the first occurrence (i.e. we have an existing window)
	 */
-	if (offset < ra->start || offset >= (ra->start + ra->size)) {
-		/*
-		 * A miss against the current window.  Have we merely
-		 * advanced into the ahead window?
-		 */
-		if (offset == ra->ahead_start) {
-			/*
-			 * Yes, we have.  The ahead window now becomes
-			 * the current window.
-			 */
-			ra->start = ra->ahead_start;
-			ra->size = ra->ahead_size;
-			ra->prev_page = ra->start;
-			ra->ahead_start = 0;
-			ra->ahead_size = 0;
-
-			/*
-			 * Control now returns, probably to sleep until I/O
-			 * completes against the first ahead page.
-			 * When the second page in the old ahead window is
-			 * requested, control will return here and more I/O
-			 * will be submitted to build the new ahead window.
-			 */
-			goto out;
-		}
-do_io:
-		/*
-		 * This is the "unusual" path.  We come here during
-		 * startup or after an lseek.  We invalidate the
-		 * ahead window and get some I/O underway for the new
-		 * current window.
-		 */
-		if (!first_access) {
-			/* Heuristic: there is a high probability
-			 * that around ra->average number of
-			 * pages shall be accessed in the next
-			 * current window.
-			 */
-			average = ra->average;
-			if (ra->currnt_wnd_hit > average)
-				average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-			ra->next_size = min(average , (unsigned long)max);
-		}
-		ra->start = offset;
-		ra->size = ra->next_size;
-		ra->ahead_start = 0;		/* Invalidate these */
-		ra->ahead_size = 0;
-		actual = do_page_cache_readahead(mapping, filp, offset,
-						 ra->size);
-		if(!first_access) {
-			/*
-			 * do not adjust the readahead window size the first
-			 * time, the ahead window might get closed if all
-			 * the pages are already in the cache.
-			 */
-			check_ra_success(ra, ra->size, actual, orig_next_size);
-		}
+	if (ra->ahead_start == 0) {	/* no ahead window yet */
+		ra->ahead_size = get_next_ra_size(ra->size, max, min, &ra->flags);
+		ra->ahead_start = ra->start + ra->size;
+		ra->prev_page = offset + newsize - 1;
+		actual = do_page_cache_readahead(mapping, filp,
+						 ra->ahead_start, ra->ahead_size);
+		check_ra_success(ra, ra->ahead_size, actual);
 	} else {
-		/*
-		 * This read request is within the current window.  It may be
-		 * time to submit I/O for the ahead window while the
-		 * application is about to step into the ahead window.
-		 */
-		if (ra->ahead_start == 0) {
-			/*
-			 * If the average io-size is more than maximum
-			 * readahead size of the file the io pattern is
-			 * sequential. Hence bring in the readahead window
-			 * immediately.
-			 * If the average io-size is less than maximum
-			 * readahead size of the file the io pattern is
-			 * random. Hence don't bother to readahead.
+		/* We already have an ahead window; check if we crossed into it */
+		if ((offset + newsize - 1) >= ra->ahead_start) {
+			ra->start = ra->ahead_start;
+			ra->size = ra->ahead_size;
+			ra->ahead_start = ra->ahead_start + ra->ahead_size;
+			ra->ahead_size = get_next_ra_size(ra->ahead_size,
+							  max, min, &ra->flags);
+			ra->prev_page = offset + newsize - 1;
+			actual = do_page_cache_readahead(mapping, filp,
+					ra->ahead_start, ra->ahead_size);
+			check_ra_success(ra, ra->ahead_size, actual);
+		} else {
+			/* Do nothing: the read is contained in the current
+			 * window and the ahead window is already populated;
+			 * just update the prev_page pointer.
-			average = ra->average;
-			if (ra->currnt_wnd_hit > average)
-				average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-
-			if (average > max) {
-				ra->ahead_start = ra->start + ra->size;
-				ra->ahead_size = ra->next_size;
-				actual = do_page_cache_readahead(mapping, filp,
-					ra->ahead_start, ra->ahead_size);
-				check_ra_success(ra, ra->ahead_size,
-						actual, orig_next_size);
-			}
			 */
+			ra->prev_page = offset + newsize - 1;
		}
	}
-out:
-	return;
+ out:
+	return newsize;
 }
-EXPORT_SYMBOL(page_cache_readahead);
-
 /*
 * handle_ra_miss() is called when it is known that a page which should have
 * been present in the pagecache (we just did some readahead there) was in fact
 * not found.  This will happen if it was evicted by the VM (readahead
- * thrashing) or if the readahead window is maximally shrunk.
+ * thrashing).
 *
- * If the window has been maximally shrunk (next_size == -1UL) then look to see
- * if we are getting misses against sequential file offsets.  If so, and this
- * persists then resume readahead.
- *
- * Otherwise we're thrashing, so shrink the readahead window by three pages.
- * This is because it is grown by two pages on a readahead hit.  Theory being
- * that the readahead window size will stabilise around the maximum level at
- * which there is no thrashing.
+ * Turn on the cache-miss flag in the ra struct; this will cause the
+ * readahead code to reduce the readahead size on the next read.
 */
 void handle_ra_miss(struct address_space *mapping,
-			struct file_ra_state *ra, pgoff_t offset)
+		    struct file_ra_state *ra, pgoff_t offset)
 {
-	if (ra->next_size == -1UL) {
-		const unsigned long max = get_max_readahead(ra);
-
-		if (offset != ra->prev_page + 1) {
-			ra->size = ra->size?ra->size-1:0; /* Not sequential */
-		} else {
-			ra->size++;			/* A sequential read */
-			if (ra->size >= max) {		/* Resume readahead */
-				ra->start = offset - max;
-				ra->next_size = max;
-				ra->size = max;
-				ra->ahead_start = 0;
-				ra->ahead_size = 0;
-				ra->average = max / 2;
-			}
-		}
-		ra->prev_page = offset;
-	} else {
-		const unsigned long min = get_min_readahead(ra);
-
-		ra->next_size -= 3;
-		if (ra->next_size < min)
-			ra->next_size = min;
-	}
+	ra->flags |= RA_FLAG_MISS;
+	ra->flags &= ~RA_FLAG_INCACHE;
 }
 
 /*
--- linux-2.6.9-rc2/mm/filemap.org.c	2004-09-17 16:11:21.000000000 -0500
+++ linux-2.6.9-rc2/mm/filemap.c	2004-09-21 13:45:25.000000000 -0500
@@ -717,14 +717,15 @@
 		read_actor_t actor)
 {
 	struct inode *inode = mapping->host;
-	unsigned long index, end_index, offset;
+	unsigned long index, end_index, offset, req_size, next_index;
 	loff_t isize;
 	struct page *cached_page;
 	int error;
 	struct file_ra_state ra = *_ra;
 
 	cached_page = NULL;
-	index = *ppos >> PAGE_CACHE_SHIFT;
+	next_index = index = *ppos >> PAGE_CACHE_SHIFT;
+	req_size = (desc->count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
 	isize = i_size_read(inode);
@@ -734,7 +735,7 @@
 	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 	for (;;) {
 		struct page *page;
-		unsigned long nr, ret;
+		unsigned long ret_size, nr, ret;
 
 		/* nr is the maximum number of bytes to copy from this page */
 		nr = PAGE_CACHE_SIZE;
@@ -749,7 +750,12 @@
 			nr = nr - offset;
 
 		cond_resched();
-		page_cache_readahead(mapping, &ra, filp, index);
+		if (index == next_index && req_size) {
+			ret_size = page_cache_readahead(mapping, &ra,
+					filp, index, req_size);
+			next_index += ret_size;
+			req_size -= ret_size;
+		}
 
 find_page:
 		page = find_get_page(mapping, index);
@@ -1195,7 +1201,7 @@
 	 * For sequential accesses, we use the generic readahead logic.
	 */
 	if (VM_SequentialReadHint(area))
-		page_cache_readahead(mapping, ra, file, pgoff);
+		page_cache_readahead(mapping, ra, file, pgoff, 1);
 
 	/*
	 * Do we have something in the page cache already?
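P.S. For anyone who wants to play with the sizing heuristics without
building a kernel, below is a small userspace sketch of the two helpers
above.  To be clear, this is an illustration written for this mail, not
code from the patch: the helper names are shortened, the constants assume
4k pages with the default 128k/16k VM_MAX/VM_MIN_READAHEAD, and
max_sane_readahead() is replaced by a plain clamp since the real one
looks at free memory.

#include <stdio.h>

#define RA_FLAG_MISS	0x01

static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long s_size = 1, newsize;

	do {				/* round up to next power of 2 */
		s_size <<= 1;
	} while (size >>= 1);

	if (s_size <= max / 64)
		newsize = s_size * s_size;	/* small read: square it */
	else if (s_size <= max / 4)
		newsize = max / 4;		/* medium read: max/4 */
	else
		newsize = max;			/* large read: go to max */
	return newsize;
}

static unsigned long next_ra_size(unsigned long cur, unsigned long max,
				  unsigned long min, unsigned long *flags)
{
	unsigned long newsize;

	if (*flags & RA_FLAG_MISS) {
		/* recent thrash: back off, clamped to avoid underflow */
		newsize = cur > min + 2 ? cur - 2 : min;
		*flags &= ~RA_FLAG_MISS;
	} else if (cur < max / 16) {
		newsize = 4 * cur;		/* far from max: grow fast */
	} else {
		newsize = 2 * cur;		/* near max: grow slower */
	}
	return newsize < max ? newsize : max;	/* max_sane_readahead() stub */
}

int main(void)
{
	unsigned long max = 32, min = 4, flags = 0;	/* units: pages */
	unsigned long win = init_ra_size(1, max);	/* 1 page first read */
	int i;

	printf("initial window: %lu pages\n", win);
	for (i = 0; i < 4; i++) {
		win = next_ra_size(win, max, min, &flags);
		printf("next window:    %lu pages\n", win);
	}
	return 0;
}

Compiled with "cc -O2 -o ra ra.c", this prints an 8 page (32k) initial
window which doubles to 16 and then 32 pages (the 128k cap) on the next
two submissions - the ramp-up described in the comment block above.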