Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751349AbVJ2Frs (ORCPT ); Sat, 29 Oct 2005 01:47:48 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1751361AbVJ2Frr (ORCPT ); Sat, 29 Oct 2005 01:47:47 -0400 Received: from ns.ustc.edu.cn ([202.38.64.1]:33163 "EHLO mx1.ustc.edu.cn") by vger.kernel.org with ESMTP id S1751349AbVJ2Fre (ORCPT ); Sat, 29 Oct 2005 01:47:34 -0400 Message-Id: <20051029060241.677539000@localhost.localdomain> References: <20051029060216.159380000@localhost.localdomain> Date: Sat, 29 Oct 2005 14:02:25 +0800 From: Wu Fengguang To: linux-kernel@vger.kernel.org Cc: Andrew Morton , Wu Fengguang Subject: [PATCH 09/13] readahead: context based method Content-Disposition: inline; filename=readahead-method-context.patch Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 10642 Lines: 376 This is the slow code path. No valid state info is available, so the page cache is queried to abtain the required position/timing infomation. Major steps: - look back/forward to find the ra_index; - look back to estimate a thrashing safe ra_size; - assemble the next read-ahead request in file->f_ra; - submit it. Signed-off-by: Wu Fengguang --- mm/readahead.c | 336 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 335 insertions(+), 1 deletion(-) --- linux-2.6.14-rc5-mm1.orig/mm/readahead.c +++ linux-2.6.14-rc5-mm1/mm/readahead.c @@ -1080,6 +1080,340 @@ state_based_readahead(struct address_spa return ra_dispatch(ra, mapping, filp); } +/* + * Page cache context based estimation of read-ahead/look-ahead size/index. + * + * The logic first looks backward in the inactive_list to get an estimation of + * the thrashing-threshold, and then, if necessary, looks forward to determine + * the start point of next read-ahead. + * + * The estimation theory can be illustrated with figure: + * + * chunk A chunk B chunk C head + * + * l01 l11 l12 l21 l22 + *| |-->|-->| |------>|-->| |------>| + *| +-------+ +-----------+ +-------------+ | + *| | # | | # | | # | | + *| +-------+ +-----------+ +-------------+ | + *| |<==============|<===========================|<============================| + * L0 L1 L2 + * + * Let f(l) = L be a map from + * l: the number of pages read by the stream + * to + * L: the number of pages pushed into inactive_list in the mean time + * then + * f(l01) <= L0 + * f(l11 + l12) = L1 + * f(l21 + l22) = L2 + * ... + * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...) + * <= Length(inactive_list) = f(thrashing-threshold) + * + * So the count of countinuous history pages left in the inactive_list is always + * a lower estimation of the true thrashing-threshold. + */ + +/* + * STATUS REFERENCE COUNT TYPE + * A__ 0 not in inactive list + * ___ 0 fresh + * __R PAGE_REFCNT_1 stale + * _a_ PAGE_REFCNT_2 disturbed once + * _aR PAGE_REFCNT_3 disturbed twice + * + * A/a/R: Active / aCTIVATE / Referenced + */ +static inline unsigned long cold_page_refcnt(struct page *page) +{ + if (!page || PageActive(page)) + return 0; + + return page_refcnt(page); +} + +static inline char page_refcnt_symbol(struct page *page) +{ + if (!page) + return 'X'; + if (PageActive(page)) + return 'A'; + switch (page_refcnt(page)) { + case 0: + return '_'; + case PAGE_REFCNT_1: + return '-'; + case PAGE_REFCNT_2: + return '='; + case PAGE_REFCNT_3: + return '#'; + } + return '?'; +} + +/* + * Count/estimate cache hits in range [first_index, last_index]. + * The estimation is simple and a bit optimistic. + */ +static int count_cache_hit(struct address_space *mapping, + unsigned long first_index, unsigned long last_index) +{ + static int steps[8] = {0, 4, 2, 6, 1, 3, 5, 7}; + struct page *page; + int size = last_index - first_index + 1; + int count = 0; + int i; + + read_lock_irq(&mapping->tree_lock); + + for (i = 0; i < 8;) { + page = __find_page(mapping, + first_index + size * steps[i++] / 8); + if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2) + break; + } + + read_unlock_irq(&mapping->tree_lock); + + return size * count / i; +} + +/* + * Look back and check history pages to estimate thrashing-threshold. + */ +static int query_page_cache(struct address_space *mapping, + unsigned long *remain, unsigned long offset, + unsigned long ra_min, unsigned long ra_max) +{ + int step; + int count; + unsigned long index; + unsigned long nr_lookback; + struct radix_tree_cache cache; + + /* + * Scan backward and check the near @ra_max pages. + * The count here determines ra_size. + */ + read_lock_irq(&mapping->tree_lock); + index = radix_tree_lookup_head(&mapping->page_tree, offset, ra_max); + read_unlock_irq(&mapping->tree_lock); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (index <= offset) { + WARN_ON(!find_page(mapping, index)); + if (index + ra_max > offset) + WARN_ON(find_page(mapping, index - 1)); + } else { + BUG_ON(index > offset + 1); + WARN_ON(find_page(mapping, offset)); + } +#endif + + *remain = offset - index + 1; + + if (unlikely(*remain <= ra_min)) { + count = ra_min; + goto out; + } + + count = count_cache_hit(mapping, index, offset); + if (count < ra_min) + count = ra_min; + if (unlikely(count * 2 < offset - index)) + goto out; + + if (*remain < ra_max) + goto out; + + /* + * Check the far pages coarsely. + * The big count here helps increase la_size. + */ + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * + 100 / (readahead_ratio + 1); + if (nr_lookback > offset) + nr_lookback = offset; + + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (step = 2 * ra_max; step < nr_lookback; step += ra_max) { + struct radix_tree_node *node; + node = radix_tree_cache_lookup_node(&mapping->page_tree, + &cache, offset - step, 1); + if (!node) + break; +#ifdef DEBUG_READAHEAD_RADIXTREE + if (node != radix_tree_lookup_node(&mapping->page_tree, + offset - step, 1)) { + read_unlock_irq(&mapping->tree_lock); + printk(KERN_ERR "check radix_tree_cache_lookup_node!\n"); + return 1; + } +#endif + } + read_unlock_irq(&mapping->tree_lock); + + /* + * For sequential read that extends from index 0, the counted value + * may well be far under the true threshold, so return it unmodified + * for further process in adjust_rala_accelerated(). + */ + if (step < offset) + count = step * readahead_ratio / 100; + else + count = offset; + +out: + return count; +} + +/* + * Scan backward in the file for the first non-present page. + */ +static inline unsigned long first_absent_page_bw(struct address_space *mapping, + unsigned long index, unsigned long max_scan) +{ + struct radix_tree_cache cache; + struct page *page; + unsigned long origin; + + origin = index; + if (max_scan > index) + max_scan = index; + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (;;) { + page = radix_tree_cache_lookup(&mapping->page_tree, + &cache, --index); + if (page) { + index++; + break; + } + if (origin - index > max_scan) + break; + } + read_unlock_irq(&mapping->tree_lock); + + return index; +} + +/* + * Scan forward in the file for the first non-present page. + */ +static inline unsigned long first_absent_page(struct address_space *mapping, + unsigned long index, unsigned long max_scan) +{ + unsigned long ra_index; + + read_lock_irq(&mapping->tree_lock); + ra_index = radix_tree_lookup_tail(&mapping->page_tree, + index + 1, max_scan); + read_unlock_irq(&mapping->tree_lock); + +#ifdef DEBUG_READAHEAD_RADIXTREE + BUG_ON(ra_index <= index); + if (index + max_scan > index) { + if (ra_index <= index + max_scan) + WARN_ON(find_page(mapping, ra_index)); + WARN_ON(!find_page(mapping, ra_index - 1)); + } +#endif + + if (ra_index <= index + max_scan) + return ra_index; + else + return 0; +} + +/* + * Determine the request parameters for context based read-ahead that extends + * from start of file. + * + * The major weakness of stateless method is perhaps the slow grow up speed of + * ra_size. The logic tries to make up for this in the important case of + * sequential reads that extend from start of file. In this case, the ra_size + * is not choosed to make the whole next chunk safe(as in normal ones). Only + * half of which is safe. The added 'unsafe' half is the look-ahead part. It + * is expected to be safeguarded by rescue_pages() when the previous chunks are + * lost. + */ +static inline int adjust_rala_accelerated(unsigned long ra_max, + unsigned long *ra_size, unsigned long *la_size) +{ + if (*ra_size <= *la_size) + return 0; + + *la_size = (*ra_size - *la_size) * readahead_ratio / 100; + *ra_size = *la_size * 2; + + if (*ra_size > ra_max) + *ra_size = ra_max; + if (*la_size > *ra_size) + *la_size = *ra_size; + + return 1; +} + +/* + * Main function for page context based read-ahead. + */ +static inline int +try_context_based_readahead(struct address_space *mapping, + struct file_ra_state *ra, + struct page *prev_page, struct page *page, + unsigned long index, + unsigned long ra_min, unsigned long ra_max) +{ + unsigned long ra_index; + unsigned long la_size; + unsigned long ra_size; + unsigned long remain_pages; + + /* Where to start read-ahead? + * NFSv3 daemons may process adjecent requests in parallel, + * leading to many locally disordered, globally sequential reads. + * So do not require nearby history pages to be present or accessed. + */ + if (page) { + ra_index = first_absent_page(mapping, index, ra_max * 5 / 4); + if (unlikely(!ra_index)) + return -1; + } else if (!prev_page) { + ra_index = first_absent_page_bw(mapping, index, ra_min); + if (index - ra_index > ra_min) + return 0; + ra_min += index - ra_index; + index = ra_index; + } else + ra_index = index; + + ra_size = query_page_cache(mapping, &remain_pages, + index - 1, ra_min, ra_max); + + la_size = ra_index - index; + if (readahead_ratio < VM_READAHEAD_PROTECT_RATIO && + remain_pages <= la_size && la_size > 1) { + rescue_pages(page, la_size); + return -1; + } + + if (ra_size == index) { + if (!adjust_rala_accelerated(ra_max, &ra_size, &la_size)) + return -1; + set_ra_class(ra, RA_CLASS_CONTEXT_ACCELERATED); + } else { + if (!adjust_rala(ra_max, &ra_size, &la_size)) + return -1; + set_ra_class(ra, RA_CLASS_CONTEXT); + } + + ra_state_init(ra, index, ra_index); + ra_state_update(ra, ra_size, la_size); + + return 1; +} + /* * ra_size is mainly determined by: @@ -1180,7 +1514,7 @@ page_cache_readahead_adaptive(struct add * Context based sequential read-ahead. */ ret = try_context_based_readahead(mapping, ra, prev_page, page, - index, size, ra_min, ra_max); + index, ra_min, ra_max); if (ret > 0) return ra_dispatch(ra, mapping, filp); if (ret < 0) -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/