This is the slow code path.
No valid state info is available, so the page cache is queried to abtain the
required position/timing infomation.
Major steps:
- look back/forward to find the ra_index;
- look back to estimate a thrashing safe ra_size;
- assemble the next read-ahead request in file->f_ra;
- submit it.
Signed-off-by: Wu Fengguang <[email protected]>
---
mm/readahead.c | 336 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 335 insertions(+), 1 deletion(-)
--- linux-2.6.14-rc5-mm1.orig/mm/readahead.c
+++ linux-2.6.14-rc5-mm1/mm/readahead.c
@@ -1080,6 +1080,340 @@ state_based_readahead(struct address_spa
return ra_dispatch(ra, mapping, filp);
}
+/*
+ * Page cache context based estimation of read-ahead/look-ahead size/index.
+ *
+ * The logic first looks backward in the inactive_list to get an estimation of
+ * the thrashing-threshold, and then, if necessary, looks forward to determine
+ * the start point of next read-ahead.
+ *
+ * The estimation theory can be illustrated with figure:
+ *
+ * chunk A chunk B chunk C head
+ *
+ * l01 l11 l12 l21 l22
+ *| |-->|-->| |------>|-->| |------>|
+ *| +-------+ +-----------+ +-------------+ |
+ *| | # | | # | | # | |
+ *| +-------+ +-----------+ +-------------+ |
+ *| |<==============|<===========================|<============================|
+ * L0 L1 L2
+ *
+ * Let f(l) = L be a map from
+ * l: the number of pages read by the stream
+ * to
+ * L: the number of pages pushed into inactive_list in the mean time
+ * then
+ * f(l01) <= L0
+ * f(l11 + l12) = L1
+ * f(l21 + l22) = L2
+ * ...
+ * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...)
+ * <= Length(inactive_list) = f(thrashing-threshold)
+ *
+ * So the count of countinuous history pages left in the inactive_list is always
+ * a lower estimation of the true thrashing-threshold.
+ */
+
+/*
+ * STATUS REFERENCE COUNT TYPE
+ * A__ 0 not in inactive list
+ * ___ 0 fresh
+ * __R PAGE_REFCNT_1 stale
+ * _a_ PAGE_REFCNT_2 disturbed once
+ * _aR PAGE_REFCNT_3 disturbed twice
+ *
+ * A/a/R: Active / aCTIVATE / Referenced
+ */
+static inline unsigned long cold_page_refcnt(struct page *page)
+{
+ if (!page || PageActive(page))
+ return 0;
+
+ return page_refcnt(page);
+}
+
+static inline char page_refcnt_symbol(struct page *page)
+{
+ if (!page)
+ return 'X';
+ if (PageActive(page))
+ return 'A';
+ switch (page_refcnt(page)) {
+ case 0:
+ return '_';
+ case PAGE_REFCNT_1:
+ return '-';
+ case PAGE_REFCNT_2:
+ return '=';
+ case PAGE_REFCNT_3:
+ return '#';
+ }
+ return '?';
+}
+
+/*
+ * Count/estimate cache hits in range [first_index, last_index].
+ * The estimation is simple and a bit optimistic.
+ */
+static int count_cache_hit(struct address_space *mapping,
+ unsigned long first_index, unsigned long last_index)
+{
+ static int steps[8] = {0, 4, 2, 6, 1, 3, 5, 7};
+ struct page *page;
+ int size = last_index - first_index + 1;
+ int count = 0;
+ int i;
+
+ read_lock_irq(&mapping->tree_lock);
+
+ for (i = 0; i < 8;) {
+ page = __find_page(mapping,
+ first_index + size * steps[i++] / 8);
+ if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2)
+ break;
+ }
+
+ read_unlock_irq(&mapping->tree_lock);
+
+ return size * count / i;
+}
+
+/*
+ * Look back and check history pages to estimate thrashing-threshold.
+ */
+static int query_page_cache(struct address_space *mapping,
+ unsigned long *remain, unsigned long offset,
+ unsigned long ra_min, unsigned long ra_max)
+{
+ int step;
+ int count;
+ unsigned long index;
+ unsigned long nr_lookback;
+ struct radix_tree_cache cache;
+
+ /*
+ * Scan backward and check the near @ra_max pages.
+ * The count here determines ra_size.
+ */
+ read_lock_irq(&mapping->tree_lock);
+ index = radix_tree_lookup_head(&mapping->page_tree, offset, ra_max);
+ read_unlock_irq(&mapping->tree_lock);
+#ifdef DEBUG_READAHEAD_RADIXTREE
+ if (index <= offset) {
+ WARN_ON(!find_page(mapping, index));
+ if (index + ra_max > offset)
+ WARN_ON(find_page(mapping, index - 1));
+ } else {
+ BUG_ON(index > offset + 1);
+ WARN_ON(find_page(mapping, offset));
+ }
+#endif
+
+ *remain = offset - index + 1;
+
+ if (unlikely(*remain <= ra_min)) {
+ count = ra_min;
+ goto out;
+ }
+
+ count = count_cache_hit(mapping, index, offset);
+ if (count < ra_min)
+ count = ra_min;
+ if (unlikely(count * 2 < offset - index))
+ goto out;
+
+ if (*remain < ra_max)
+ goto out;
+
+ /*
+ * Check the far pages coarsely.
+ * The big count here helps increase la_size.
+ */
+ nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) *
+ 100 / (readahead_ratio + 1);
+ if (nr_lookback > offset)
+ nr_lookback = offset;
+
+ radix_tree_cache_init(&cache);
+ read_lock_irq(&mapping->tree_lock);
+ for (step = 2 * ra_max; step < nr_lookback; step += ra_max) {
+ struct radix_tree_node *node;
+ node = radix_tree_cache_lookup_node(&mapping->page_tree,
+ &cache, offset - step, 1);
+ if (!node)
+ break;
+#ifdef DEBUG_READAHEAD_RADIXTREE
+ if (node != radix_tree_lookup_node(&mapping->page_tree,
+ offset - step, 1)) {
+ read_unlock_irq(&mapping->tree_lock);
+ printk(KERN_ERR "check radix_tree_cache_lookup_node!\n");
+ return 1;
+ }
+#endif
+ }
+ read_unlock_irq(&mapping->tree_lock);
+
+ /*
+ * For sequential read that extends from index 0, the counted value
+ * may well be far under the true threshold, so return it unmodified
+ * for further process in adjust_rala_accelerated().
+ */
+ if (step < offset)
+ count = step * readahead_ratio / 100;
+ else
+ count = offset;
+
+out:
+ return count;
+}
+
+/*
+ * Scan backward in the file for the first non-present page.
+ */
+static inline unsigned long first_absent_page_bw(struct address_space *mapping,
+ unsigned long index, unsigned long max_scan)
+{
+ struct radix_tree_cache cache;
+ struct page *page;
+ unsigned long origin;
+
+ origin = index;
+ if (max_scan > index)
+ max_scan = index;
+ radix_tree_cache_init(&cache);
+ read_lock_irq(&mapping->tree_lock);
+ for (;;) {
+ page = radix_tree_cache_lookup(&mapping->page_tree,
+ &cache, --index);
+ if (page) {
+ index++;
+ break;
+ }
+ if (origin - index > max_scan)
+ break;
+ }
+ read_unlock_irq(&mapping->tree_lock);
+
+ return index;
+}
+
+/*
+ * Scan forward in the file for the first non-present page.
+ */
+static inline unsigned long first_absent_page(struct address_space *mapping,
+ unsigned long index, unsigned long max_scan)
+{
+ unsigned long ra_index;
+
+ read_lock_irq(&mapping->tree_lock);
+ ra_index = radix_tree_lookup_tail(&mapping->page_tree,
+ index + 1, max_scan);
+ read_unlock_irq(&mapping->tree_lock);
+
+#ifdef DEBUG_READAHEAD_RADIXTREE
+ BUG_ON(ra_index <= index);
+ if (index + max_scan > index) {
+ if (ra_index <= index + max_scan)
+ WARN_ON(find_page(mapping, ra_index));
+ WARN_ON(!find_page(mapping, ra_index - 1));
+ }
+#endif
+
+ if (ra_index <= index + max_scan)
+ return ra_index;
+ else
+ return 0;
+}
+
+/*
+ * Determine the request parameters for context based read-ahead that extends
+ * from start of file.
+ *
+ * The major weakness of stateless method is perhaps the slow grow up speed of
+ * ra_size. The logic tries to make up for this in the important case of
+ * sequential reads that extend from start of file. In this case, the ra_size
+ * is not choosed to make the whole next chunk safe(as in normal ones). Only
+ * half of which is safe. The added 'unsafe' half is the look-ahead part. It
+ * is expected to be safeguarded by rescue_pages() when the previous chunks are
+ * lost.
+ */
+static inline int adjust_rala_accelerated(unsigned long ra_max,
+ unsigned long *ra_size, unsigned long *la_size)
+{
+ if (*ra_size <= *la_size)
+ return 0;
+
+ *la_size = (*ra_size - *la_size) * readahead_ratio / 100;
+ *ra_size = *la_size * 2;
+
+ if (*ra_size > ra_max)
+ *ra_size = ra_max;
+ if (*la_size > *ra_size)
+ *la_size = *ra_size;
+
+ return 1;
+}
+
+/*
+ * Main function for page context based read-ahead.
+ */
+static inline int
+try_context_based_readahead(struct address_space *mapping,
+ struct file_ra_state *ra,
+ struct page *prev_page, struct page *page,
+ unsigned long index,
+ unsigned long ra_min, unsigned long ra_max)
+{
+ unsigned long ra_index;
+ unsigned long la_size;
+ unsigned long ra_size;
+ unsigned long remain_pages;
+
+ /* Where to start read-ahead?
+ * NFSv3 daemons may process adjecent requests in parallel,
+ * leading to many locally disordered, globally sequential reads.
+ * So do not require nearby history pages to be present or accessed.
+ */
+ if (page) {
+ ra_index = first_absent_page(mapping, index, ra_max * 5 / 4);
+ if (unlikely(!ra_index))
+ return -1;
+ } else if (!prev_page) {
+ ra_index = first_absent_page_bw(mapping, index, ra_min);
+ if (index - ra_index > ra_min)
+ return 0;
+ ra_min += index - ra_index;
+ index = ra_index;
+ } else
+ ra_index = index;
+
+ ra_size = query_page_cache(mapping, &remain_pages,
+ index - 1, ra_min, ra_max);
+
+ la_size = ra_index - index;
+ if (readahead_ratio < VM_READAHEAD_PROTECT_RATIO &&
+ remain_pages <= la_size && la_size > 1) {
+ rescue_pages(page, la_size);
+ return -1;
+ }
+
+ if (ra_size == index) {
+ if (!adjust_rala_accelerated(ra_max, &ra_size, &la_size))
+ return -1;
+ set_ra_class(ra, RA_CLASS_CONTEXT_ACCELERATED);
+ } else {
+ if (!adjust_rala(ra_max, &ra_size, &la_size))
+ return -1;
+ set_ra_class(ra, RA_CLASS_CONTEXT);
+ }
+
+ ra_state_init(ra, index, ra_index);
+ ra_state_update(ra, ra_size, la_size);
+
+ return 1;
+}
+
/*
* ra_size is mainly determined by:
@@ -1180,7 +1514,7 @@ page_cache_readahead_adaptive(struct add
* Context based sequential read-ahead.
*/
ret = try_context_based_readahead(mapping, ra, prev_page, page,
- index, size, ra_min, ra_max);
+ index, ra_min, ra_max);
if (ret > 0)
return ra_dispatch(ra, mapping, filp);
if (ret < 0)
--