Message-Id: <20051125151606.005059000@localhost.localdomain>
References: <20051125151210.993109000@localhost.localdomain>
Date: Fri, 25 Nov 2005 23:12:21 +0800
From: Wu Fengguang
To: linux-kernel@vger.kernel.org
Cc: Andrew Morton, Wu Fengguang
Subject: [PATCH 11/19] readahead: context based method
Content-Disposition: inline; filename=readahead-method-context.patch

This is the slow code path. No valid state info is available, so the
page cache is queried to obtain the required position/timing information.

Major steps:
	- look back/forward to find the ra_index;
	- look back to estimate a thrashing-safe ra_size;
	- assemble the next read-ahead request in file_ra_state;
	- submit it.

(An illustrative user-space sketch of this flow is appended after the patch.)

Signed-off-by: Wu Fengguang

---
 mm/readahead.c |  345 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 345 insertions(+)

--- linux-2.6.15-rc2-mm1.orig/mm/readahead.c
+++ linux-2.6.15-rc2-mm1/mm/readahead.c
@@ -1160,6 +1160,351 @@ state_based_readahead(struct address_spa
 	return ra_dispatch(ra, mapping, filp);
 }
+/*
+ * Page cache context based estimation of read-ahead/look-ahead size/index.
+ *
+ * The logic first looks around to find the start point of next read-ahead,
+ * and then, if necessary, looks backward in the inactive_list to get an
+ * estimation of the thrashing-threshold.
+ *
+ * The estimation theory can be illustrated with the following figure:
+ *
+ *   chunk A           chunk B                      chunk C                  head
+ *
+ *   l01  l11           l12   l21                    l22
+ *| |-->|-->|       |------>|-->|                |------>|
+ *| +-------+       +-----------+                +-------------+               |
+ *| |   #   |       |       #   |                |       #     |               |
+ *| +-------+       +-----------+                +-------------+               |
+ *| |<==============|<===========================|<============================|
+ *        L0                     L1                            L2
+ *
+ * Let f(l) = L be a map from
+ *	l: the number of pages read by the stream
+ * to
+ *	L: the number of pages pushed into inactive_list in the meantime
+ * then
+ *	f(l01) <= L0
+ *	f(l11 + l12) = L1
+ *	f(l21 + l22) = L2
+ *	...
+ *	f(l01 + l11 + ...) <= Sum(L0 + L1 + ...)
+ *			   <= Length(inactive_list) = f(thrashing-threshold)
+ *
+ * So the count of continuous history pages left in the inactive_list is
+ * always a lower estimation of the true thrashing-threshold.
+ */
+
+/*
+ * STATUS   REFERENCE COUNT    TYPE
+ *  A__               0        not in inactive list
+ *  ___               0        fresh
+ *  __R     PAGE_REFCNT_1      stale
+ *  _a_     PAGE_REFCNT_2      disturbed once
+ *  _aR     PAGE_REFCNT_3      disturbed twice
+ *
+ * A/a/R: Active / aCTIVATE / Referenced
+ */
+static inline unsigned long cold_page_refcnt(struct page *page)
+{
+	if (!page || PageActive(page))
+		return 0;
+
+	return page_refcnt(page);
+}
+
+static inline char page_refcnt_symbol(struct page *page)
+{
+	if (!page)
+		return 'X';
+	if (PageActive(page))
+		return 'A';
+	switch (page_refcnt(page)) {
+	case 0:
+		return '_';
+	case PAGE_REFCNT_1:
+		return '-';
+	case PAGE_REFCNT_2:
+		return '=';
+	case PAGE_REFCNT_3:
+		return '#';
+	}
+	return '?';
+}
+
+/*
+ * Count/estimate cache hits in range [first_index, last_index].
+ * The estimation is simple and optimistic.
+ */
+static int count_cache_hit(struct address_space *mapping,
+				pgoff_t first_index, pgoff_t last_index)
+{
+	struct page *page;
+	int size = last_index - first_index + 1;
+	int count = 0;
+	int i;
+
+	read_lock_irq(&mapping->tree_lock);
+
+	/*
+	 * The first page may well be the chunk head and has been accessed,
+	 * so it is index 0 that makes the estimation optimistic. This
+	 * behavior guarantees a readahead when (size < ra_max) and
+	 * (readahead_hit_rate >= 16).
+	 */
+	for (i = 0; i < 16;) {
+		page = __find_page(mapping, first_index +
+					size * ((i++ * 29) & 15) / 16);
+		if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2)
+			break;
+	}
+
+	read_unlock_irq(&mapping->tree_lock);
+
+	return size * count / i;
+}
+
+/*
+ * Look back and check history pages to estimate thrashing-threshold.
+ */
+static int query_page_cache(struct address_space *mapping,
+				struct file_ra_state *ra,
+				unsigned long *remain, pgoff_t offset,
+				unsigned long ra_min, unsigned long ra_max)
+{
+	int count;
+	pgoff_t index;
+	unsigned long nr_lookback;
+	struct radix_tree_cache cache;
+
+	/*
+	 * Scan backward and check the near @ra_max pages.
+	 * The count here determines ra_size.
+	 */
+	read_lock_irq(&mapping->tree_lock);
+	index = radix_tree_lookup_head(&mapping->page_tree, offset, ra_max);
+	read_unlock_irq(&mapping->tree_lock);
+#ifdef DEBUG_READAHEAD_RADIXTREE
+	if (index <= offset) {
+		WARN_ON(!find_page(mapping, index));
+		if (index + ra_max > offset)
+			WARN_ON(find_page(mapping, index - 1));
+	} else {
+		BUG_ON(index > offset + 1);
+		WARN_ON(find_page(mapping, offset));
+	}
+#endif
+
+	*remain = offset - index + 1;
+
+	if (unlikely(*remain <= ra_min))
+		return ra_min;
+
+	if (!index)
+		return *remain;
+
+	if (offset + 1 == ra->readahead_index && ra_cache_hit_ok(ra))
+		count = *remain;
+	else if (count_cache_hit(mapping, index, offset) *
+					readahead_hit_rate >= *remain)
+		count = *remain;
+	else
+		return ra_min;
+
+	if (count < ra_max)
+		goto out;
+
+	/*
+	 * Check the far pages coarsely.
+	 * The big count here helps increase la_size.
+	 */
+	nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) *
+					100 / (readahead_ratio + 1);
+	if (nr_lookback > offset)
+		nr_lookback = offset;
+
+	radix_tree_cache_init(&cache);
+	read_lock_irq(&mapping->tree_lock);
+	for (count += ra_max; count < nr_lookback; count += ra_max) {
+		struct radix_tree_node *node;
+		node = radix_tree_cache_lookup_node(&mapping->page_tree,
+						&cache, offset - count, 1);
+		if (!node)
+			break;
+#ifdef DEBUG_READAHEAD_RADIXTREE
+		if (node != radix_tree_lookup_node(&mapping->page_tree,
+							offset - count, 1)) {
+			read_unlock_irq(&mapping->tree_lock);
+			printk(KERN_ERR "check radix_tree_cache_lookup_node!\n");
+			return 1;
+		}
+#endif
+	}
+	read_unlock_irq(&mapping->tree_lock);
+
+	/*
+	 * For sequential read that extends from index 0, the counted value
+	 * may well be far under the true threshold, so return it unmodified
+	 * for further processing in adjust_rala_accelerated().
+	 */
+	if (count >= offset)
+		return offset + 1;
+
+out:
+	count = count * readahead_ratio / 100;
+	return count;
+}
+
+/*
+ * Scan backward in the file for the first non-present page.
+ */
+static inline pgoff_t first_absent_page_bw(struct address_space *mapping,
+					pgoff_t index, unsigned long max_scan)
+{
+	struct radix_tree_cache cache;
+	struct page *page;
+	pgoff_t origin;
+
+	origin = index;
+	if (max_scan > index)
+		max_scan = index;
+
+	radix_tree_cache_init(&cache);
+	read_lock_irq(&mapping->tree_lock);
+	for (; origin - index <= max_scan;) {
+		page = radix_tree_cache_lookup(&mapping->page_tree,
+						&cache, --index);
+		if (page) {
+			index++;
+			break;
+		}
+	}
+	read_unlock_irq(&mapping->tree_lock);
+
+	return index;
+}
+
+/*
+ * Scan forward in the file for the first non-present page.
+ */
+static inline pgoff_t first_absent_page(struct address_space *mapping,
+					pgoff_t index, unsigned long max_scan)
+{
+	pgoff_t ra_index;
+
+	read_lock_irq(&mapping->tree_lock);
+	ra_index = radix_tree_lookup_tail(&mapping->page_tree,
+					index + 1, max_scan);
+	read_unlock_irq(&mapping->tree_lock);
+
+#ifdef DEBUG_READAHEAD_RADIXTREE
+	BUG_ON(ra_index <= index);
+	if (index + max_scan > index) {
+		if (ra_index <= index + max_scan)
+			WARN_ON(find_page(mapping, ra_index));
+		WARN_ON(!find_page(mapping, ra_index - 1));
+	}
+#endif
+
+	if (ra_index <= index + max_scan)
+		return ra_index;
+	else
+		return 0;
+}
+
+/*
+ * Determine the request parameters for context based read-ahead that extends
+ * from start of file.
+ *
+ * The major weakness of the stateless method is perhaps its slow ramp-up of
+ * ra_size. The logic tries to make up for this in the important case of
+ * sequential reads that extend from the start of file. In this case, ra_size
+ * is not chosen to make the whole next chunk safe (as in normal ones); only
+ * half of it is. The added 'unsafe' half is the look-ahead part. It is
+ * expected to be safeguarded by rescue_pages() when the previous chunks are
+ * lost.
+ */
+static inline int adjust_rala_accelerated(unsigned long ra_max,
+			unsigned long *ra_size, unsigned long *la_size)
+{
+	pgoff_t index = *ra_size;
+
+	*ra_size -= min(*ra_size, *la_size);
+	*ra_size = *ra_size * readahead_ratio / 100;
+	*la_size = index * readahead_ratio / 100;
+	*ra_size += *la_size;
+
+	if (*ra_size > ra_max)
+		*ra_size = ra_max;
+	if (*la_size > *ra_size)
+		*la_size = *ra_size;
+
+	return 1;
+}
+
+/*
+ * Main function for page context based read-ahead.
+ */
+static inline int
+try_context_based_readahead(struct address_space *mapping,
+			struct file_ra_state *ra,
+			struct page *prev_page, struct page *page,
+			pgoff_t index, unsigned long ra_size,
+			unsigned long ra_min, unsigned long ra_max)
+{
+	pgoff_t ra_index;
+	unsigned long la_size;
+	unsigned long remain_pages;
+
+	/* Where to start read-ahead?
+	 * NFSv3 daemons may process adjacent requests in parallel,
+	 * leading to many locally disordered, globally sequential reads.
+	 * So do not require nearby history pages to be present or accessed.
+	 */
+	if (page) {
+		ra_index = first_absent_page(mapping, index, ra_max * 5 / 4);
+		if (unlikely(!ra_index))
+			return -1;
+	} else if (!prev_page) {
+		ra_index = first_absent_page_bw(mapping, index,
+						readahead_hit_rate + ra_min);
+		if (index - ra_index > readahead_hit_rate + ra_min)
+			return 0;
+		ra_min += 2 * (index - ra_index);
+		index = ra_index;
+	} else {
+		ra_index = index;
+		if (ra_has_index(ra, index))
+			ra_account(ra, RA_EVENT_READAHEAD_MUTILATE,
+						ra->readahead_index - index);
+	}
+
+	ra_size = query_page_cache(mapping, ra, &remain_pages,
+					index - 1, ra_min, ra_max);
+
+	la_size = ra_index - index;
+	if (readahead_ratio < VM_READAHEAD_PROTECT_RATIO &&
+			remain_pages <= la_size && la_size > 1) {
+		rescue_pages(page, la_size);
+		return -1;
+	}
+
+	if (ra_size == index) {
+		if (!adjust_rala_accelerated(ra_max, &ra_size, &la_size))
+			return -1;
+		set_ra_class(ra, RA_CLASS_CONTEXT_ACCELERATED);
+	} else {
+		if (!adjust_rala(ra_max, &ra_size, &la_size))
+			return -1;
+		set_ra_class(ra, RA_CLASS_CONTEXT);
+	}
+
+	ra_state_init(ra, index, ra_index);
+	ra_state_update(ra, ra_size, la_size);
+
+	return 1;
+}
+
 /*
  * ra_size is mainly determined by:
--
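
For illustration only, here is a minimal user-space sketch of the "Major
steps" described above, assuming a toy bitmap page cache. None of these names
(cached[], first_absent_forward(), history_pages(), estimate_readahead(),
RA_MIN, RA_MAX) exist in the patch; the real code works on the mapping's radix
tree and file_ra_state, and refines the backward count with readahead_ratio
and the far-page scan in query_page_cache().

#include <stdbool.h>
#include <stdio.h>

#define FILE_PAGES	1024
#define RA_MAX		64	/* stand-in for the real ra_max */
#define RA_MIN		8	/* stand-in for the real ra_min */

static bool cached[FILE_PAGES];		/* toy page cache: present or not */

/* Step 1: look forward for the first absent page - the next chunk head. */
static long first_absent_forward(long index, long max_scan)
{
	long i;

	for (i = index; i < index + max_scan && i < FILE_PAGES; i++)
		if (!cached[i])
			return i;
	return -1;
}

/*
 * Step 2: look backward and count contiguous history pages.  As argued in
 * the estimation-theory comment, this count is a lower estimate of the
 * thrashing threshold.
 */
static long history_pages(long index, long max_scan)
{
	long count = 0;

	while (index - count >= 0 && count < max_scan && cached[index - count])
		count++;
	return count;
}

/* Steps 3 and 4: assemble and "submit" a read-ahead request. */
static void estimate_readahead(long index)
{
	long ra_index = first_absent_forward(index, RA_MAX * 5 / 4);
	long ra_size, la_size;

	if (ra_index < 0) {
		printf("offset=%ld: fully cached ahead, nothing to do\n", index);
		return;
	}

	ra_size = history_pages(index - 1, RA_MAX);
	if (ra_size < RA_MIN)
		ra_size = RA_MIN;
	la_size = ra_size / 2;	/* keep half as the "unsafe" look-ahead part */

	printf("offset=%ld: ra_index=%ld ra_size=%ld la_size=%ld\n",
	       index, ra_index, ra_size, la_size);
}

int main(void)
{
	long i;

	for (i = 0; i < 100; i++)	/* the stream has read pages 0..99 */
		cached[i] = true;

	estimate_readahead(100);	/* continue the stream at page 100 */
	return 0;
}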
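
A second illustrative note, on the probe pattern in count_cache_hit(): because
29 is odd (coprime with 16), (i * 29) & 15 visits each value 0..15 exactly
once, in a scattered order, so the at most 16 probes sample the whole
[first_index, last_index] range roughly evenly. A quick stand-alone check
(again not part of the patch):

#include <stdio.h>

int main(void)
{
	int i;

	/* Same index arithmetic as count_cache_hit(): a stride-29 walk. */
	for (i = 0; i < 16; i++)
		printf("%d ", (i * 29) & 15);
	printf("\n");	/* prints: 0 13 10 7 4 1 14 11 8 5 2 15 12 9 6 3 */
	return 0;
}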