2004-09-23 16:03:47

by Steven Pratt

[permalink] [raw]
Subject: [PATCH/RFC] Simplified Readahead

--- linux-2.6.9-rc2/include/linux/mm.h.org 2004-09-17 14:05:27.000000000 -0500
+++ linux-2.6.9-rc2/include/linux/mm.h 2004-09-22 12:42:59.000000000 -0500
@@ -727,10 +727,11 @@
unsigned long offset, unsigned long nr_to_read);
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
unsigned long offset, unsigned long nr_to_read);
-void page_cache_readahead(struct address_space *mapping,
+unsigned long page_cache_readahead(struct address_space *mapping,
struct file_ra_state *ra,
struct file *filp,
- unsigned long offset);
+ unsigned long offset,
+ unsigned long size);
void handle_ra_miss(struct address_space *mapping,
struct file_ra_state *ra, pgoff_t offset);
unsigned long max_sane_readahead(unsigned long nr);
--- linux-2.6.9-rc2/include/linux/fs.h.org 2004-09-17 14:05:47.000000000 -0500
+++ linux-2.6.9-rc2/include/linux/fs.h 2004-09-21 13:47:10.000000000 -0500
@@ -557,16 +557,15 @@
struct file_ra_state {
unsigned long start; /* Current window */
unsigned long size;
- unsigned long next_size; /* Next window size */
+ unsigned long flags; /* ra flags RA_FLAG_xxx*/
unsigned long prev_page; /* Cache last read() position */
unsigned long ahead_start; /* Ahead window */
unsigned long ahead_size;
- unsigned long currnt_wnd_hit; /* locality in the current window */
- unsigned long average; /* size of next current window */
unsigned long ra_pages; /* Maximum readahead window */
unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
};
+#define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */

struct file {
struct list_head f_list;
--- linux-2.6.9-rc2/mm/readahead.org.c 2004-09-16 16:02:05.000000000 -0500
+++ linux-2.6.9-rc2/mm/readahead.c 2004-09-22 13:18:46.000000000 -0500
@@ -35,7 +35,6 @@
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
ra->ra_pages = mapping->backing_dev_info->ra_pages;
- ra->average = ra->ra_pages / 2;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

@@ -52,6 +51,62 @@
return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
}

+static inline void ra_off(struct file_ra_state *ra)
+{
+ ra->start=0;
+ ra->flags=0;
+ ra->size=-1UL;
+ ra->ahead_start=0;
+ ra->ahead_size=0;
+ return;
+}
+
+/*
+ * Set the initial window size, round to next power of 2 and square
+ * for small size, x 4 for medium, and x 2 for large
+ * for 128k (32 page) max ra
+ * 1-8 page = 32k initial, > 8 page = 128k initial
+ */
+unsigned long get_init_ra_size(unsigned long size, unsigned long max,
+ unsigned long min)
+{
+ unsigned long s_size=1, newsize;
+ do {
+ s_size = s_size << 1;
+ } while ((size = size >> 1));
+ if (s_size <= max / 64) {
+ newsize = s_size * s_size;
+ } else if (s_size <= max/4) {
+ newsize = max / 4;
+ } else {
+ newsize = max;
+ }
+ return newsize;
+}
+
+/*
+ * Set the new window size, this is called only when
+ * I/O is to be submitted, not for each call to readahead
+ * If cache miss occered, reduce next I/O size, else
+ * increase depending on how close to max we are.
+ */
+unsigned long get_next_ra_size(unsigned long cur, unsigned long max,
+ unsigned long min, unsigned long * flags)
+{
+ unsigned long newsize;
+ if (*flags & RA_FLAG_MISS) {
+ newsize = max((cur - 2),min);
+ *flags &= ~RA_FLAG_MISS;
+ } else if ( cur < max/16 ) {
+ newsize = 4 * cur;
+ } else {
+ newsize = 2 * cur;
+ }
+ newsize = min(newsize, max_sane_readahead(max));
+ return newsize;
+}
+
+
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

/**
@@ -152,19 +207,17 @@
* ahead_size: Together, these form the "ahead window".
* ra_pages: The externally controlled max readahead for this fd.
*
- * When readahead is in the "maximally shrunk" state (next_size == -1UL),
- * readahead is disabled. In this state, prev_page and size are used, inside
- * handle_ra_miss(), to detect the resumption of sequential I/O. Once there
- * has been a decent run of sequential I/O (defined by get_min_readahead),
- * readahead is reenabled.
+ * When readahead is in the off state (size == -1UL),
+ * readahead is disabled. In this state, prev_page is used
+ * to detect the resumption of sequential I/O.
*
* The readahead code manages two windows - the "current" and the "ahead"
* windows. The intent is that while the application is walking the pages
* in the current window, I/O is underway on the ahead window. When the
* current window is fully traversed, it is replaced by the ahead window
* and the ahead window is invalidated. When this copying happens, the
- * new current window's pages are probably still locked. When I/O has
- * completed, we submit a new batch of I/O, creating a new ahead window.
+ * new current window's pages are probably still locked. So
+ * we submit a new batch of I/O immediately, creating a new ahead window.
*
* So:
*
@@ -176,34 +229,25 @@
* ahead window.
*
* A `readahead hit' occurs when a read request is made against a page which is
- * inside the current window. Hits are good, and the window size (next_size)
- * is grown aggressively when hits occur. Two pages are added to the next
- * window size on each hit, which will end up doubling the next window size by
- * the time I/O is submitted for it.
- *
- * If readahead hits are more sparse (say, the application is only reading
- * every second page) then the window will build more slowly.
- *
- * On a readahead miss (the application seeked away) the readahead window is
- * shrunk by 25%. We don't want to drop it too aggressively, because it is a
- * good assumption that an application which has built a good readahead window
- * will continue to perform linear reads. Either at the new file position, or
- * at the old one after another seek.
+ * the next sequential page. Ahead windowe calculations are done only when it
+ * is time to submit a new IO. The code ramps up the size agressively at first,
+ * but slow down as it approaches max_readhead.
*
- * After enough misses, readahead is fully disabled. (next_size = -1UL).
+ * Any seek/ramdom IO will result in readahead being turned off. It will resume
+ * at the first sequential access.
*
* There is a special-case: if the first page which the application tries to
* read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to half of the
- * device maximum.
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
*
* A page request at (start + size) is not a miss at all - it's just a part of
* sequential file reading.
*
- * This function is to be called for every page which is read, rather than when
- * it is time to perform readahead. This is so the readahead algorithm can
- * centrally work out the access patterns. This could be costly with many tiny
- * read()s, so we specifically optimise for that case with prev_page.
+ * This function is to be called for every read request, rather than when
+ * it is time to perform readahead. It is called only oce for the entire I/O
+ * regardless of size unless readahead is unable to start enough I/O to satisfy
+ * the request (I/O request > max_readahead).
*/

/*
@@ -212,7 +256,7 @@
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
- * Returns the number of pages which actually had IO started against them.
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
static inline int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -310,257 +354,123 @@
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
unsigned long offset, unsigned long nr_to_read)
{
- if (!bdi_read_congested(mapping->backing_dev_info))
- return __do_page_cache_readahead(mapping, filp,
- offset, nr_to_read);
- return 0;
+ return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
}

-/*
- * Check how effective readahead is being. If the amount of started IO is
- * less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping. Shrink the window.
- *
- * But don't shrink it too much - the application may read the same page
- * occasionally.
- */
-static inline void
-check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
- pgoff_t actual, pgoff_t orig_next_size)
-{
- if (actual == 0) {
- if (orig_next_size > 1) {
- ra->next_size = orig_next_size - 1;
- if (ra->ahead_size)
- ra->ahead_size = ra->next_size;
- } else {
- ra->next_size = -1UL;
- ra->size = 0;
- }
- }
-}
+
+

/*
* page_cache_readahead is the main function. If performs the adaptive
* readahead window size management and submits the readahead I/O.
*/
-void
+unsigned long
page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
- struct file *filp, unsigned long offset)
+ struct file *filp, unsigned long offset, unsigned long req_size)
{
- unsigned max;
- unsigned orig_next_size;
- unsigned actual;
- int first_access=0;
- unsigned long average;
-
+ unsigned long max, min, maxsane, newsize=req_size;
+ unsigned long actual=0;
+
/*
* Here we detect the case where the application is performing
* sub-page sized reads. We avoid doing extra work and bogusly
* perturbing the readahead window expansion logic.
- * If next_size is zero, this is the very first read for this
- * file handle, or the window is maximally shrunk.
+ * If size is zero, there is no read ahead window so we need one
*/
- if (offset == ra->prev_page) {
- if (ra->next_size != 0)
- goto out;
+ if (offset == ra->prev_page && req_size == 1 && ra->size != 0) {
+ goto out;
}
-
- if (ra->next_size == -1UL)
- goto out; /* Maximally shrunk */
-
+
max = get_max_readahead(ra);
- if (max == 0)
+ min = get_min_readahead(ra);
+ maxsane = max_sane_readahead(max);
+ newsize = min(req_size, maxsane);
+
+ if (newsize == 0)
goto out; /* No readahead */

- orig_next_size = ra->next_size;
-
- if (ra->next_size == 0) {
- /*
- * Special case - first read.
- * We'll assume it's a whole-file read, and
- * grow the window fast.
- */
- first_access=1;
- ra->next_size = max / 2;
- ra->prev_page = offset;
- ra->currnt_wnd_hit++;
- goto do_io;
- }
-
- ra->prev_page = offset;
-
- if (offset >= ra->start && offset <= (ra->start + ra->size)) {
- /*
- * A readahead hit. Either inside the window, or one
- * page beyond the end. Expand the next readahead size.
+ /*
+ * Special case - first read.
+ * We'll assume it's a whole-file read if at start of file, and
+ * grow the window fast.
+ * or detect first sequential access
+ */
+ if ((ra->size == 0 && offset == 0) // first io and start of file
+ || (ra->size == -1UL && ra->prev_page == offset-1)) { //1st seq access
+ ra->prev_page = offset + newsize-1;
+ ra->size = get_init_ra_size(newsize, max, min);
+ ra->start = offset;
+ actual = do_page_cache_readahead(mapping, filp, offset, ra->size);
+ /* if the request size is larger than our max readahead, we
+ * at least want to be sure that we get 2 IOs if flight and
+ * we know that we will definitly need the new I/O.
+ * once we do this, subsequent calls should be able to overlap IOs,
+ * thus preventing stalls. so Issue the ahead window immediately.
*/
- ra->next_size += 2;
-
- if (ra->currnt_wnd_hit <= (max * 2))
- ra->currnt_wnd_hit++;
- } else {
- /*
- * A miss - lseek, pagefault, pread, etc. Shrink the readahead
- * window.
- */
- ra->next_size -= 2;
-
- average = ra->average;
- if (average < ra->currnt_wnd_hit) {
- average++;
+ if (req_size >= max) {
+ ra->ahead_size = get_next_ra_size(ra->size, max, min, &ra->flags);
+ ra->ahead_start = ra->start + ra->size;
+ actual = do_page_cache_readahead(mapping, filp,
+ ra->ahead_start, ra->ahead_size);
}
- ra->average = (average + ra->currnt_wnd_hit) / 2;
- ra->currnt_wnd_hit = 1;
+ goto out;
}

- if ((long)ra->next_size > (long)max)
- ra->next_size = max;
- if ((long)ra->next_size <= 0L) {
- ra->next_size = -1UL;
- ra->size = 0;
- goto out; /* Readahead is off */
- }
+ /* now handle the random case:
+ * partial page reads and first access were handled above,
+ * so this must be the next page otherwise it is random
+ */
+ if ((offset != (ra->prev_page+1) || (ra->size == 0))) {
+ ra_off(ra);
+ ra->prev_page = offset + newsize-1;
+ actual = do_page_cache_readahead(mapping, filp, offset, newsize);
+ goto out;
+ }

- /*
- * Is this request outside the current window?
+ /* If we get here we are doing sequential IO and this was
+ * not the first occurence (ie we have an existing window)
*/
- if (offset < ra->start || offset >= (ra->start + ra->size)) {
- /*
- * A miss against the current window. Have we merely
- * advanced into the ahead window?
- */
- if (offset == ra->ahead_start) {
- /*
- * Yes, we have. The ahead window now becomes
- * the current window.
- */
+
+ if (ra->ahead_start == 0) { /* no ahead window yet */
+ ra->ahead_size = get_next_ra_size(ra->size, max, min, &ra->flags);
+ ra->ahead_start = ra->start + ra->size;
+ ra->prev_page = offset + newsize - 1;
+ actual = do_page_cache_readahead(mapping, filp,
+ ra->ahead_start, ra->ahead_size);
+ }else {
+ /* already have an ahead window, check if we crossed into it */
+ if ((offset + newsize -1) >= ra->ahead_start) {
ra->start = ra->ahead_start;
ra->size = ra->ahead_size;
- ra->prev_page = ra->start;
- ra->ahead_start = 0;
- ra->ahead_size = 0;
-
- /*
- * Control now returns, probably to sleep until I/O
- * completes against the first ahead page.
- * When the second page in the old ahead window is
- * requested, control will return here and more I/O
- * will be submitted to build the new ahead window.
- */
- goto out;
- }
-do_io:
- /*
- * This is the "unusual" path. We come here during
- * startup or after an lseek. We invalidate the
- * ahead window and get some I/O underway for the new
- * current window.
- */
- if (!first_access) {
- /* Heuristic: there is a high probability
- * that around ra->average number of
- * pages shall be accessed in the next
- * current window.
- */
- average = ra->average;
- if (ra->currnt_wnd_hit > average)
- average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-
- ra->next_size = min(average , (unsigned long)max);
- }
- ra->start = offset;
- ra->size = ra->next_size;
- ra->ahead_start = 0; /* Invalidate these */
- ra->ahead_size = 0;
- actual = do_page_cache_readahead(mapping, filp, offset,
- ra->size);
- if(!first_access) {
- /*
- * do not adjust the readahead window size the first
- * time, the ahead window might get closed if all
- * the pages are already in the cache.
- */
- check_ra_success(ra, ra->size, actual, orig_next_size);
- }
- } else {
- /*
- * This read request is within the current window. It may be
- * time to submit I/O for the ahead window while the
- * application is about to step into the ahead window.
- */
- if (ra->ahead_start == 0) {
- /*
- * If the average io-size is more than maximum
- * readahead size of the file the io pattern is
- * sequential. Hence bring in the readahead window
- * immediately.
- * If the average io-size is less than maximum
- * readahead size of the file the io pattern is
- * random. Hence don't bother to readahead.
- */
- average = ra->average;
- if (ra->currnt_wnd_hit > average)
- average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-
- if (average > max) {
- ra->ahead_start = ra->start + ra->size;
- ra->ahead_size = ra->next_size;
- actual = do_page_cache_readahead(mapping, filp,
+ ra->ahead_start = ra->ahead_start + ra->ahead_size;
+ ra->ahead_size = get_next_ra_size(ra->ahead_size, max, min, &ra->flags);
+ ra->prev_page = offset + newsize - 1;
+ actual = do_page_cache_readahead(mapping, filp,
ra->ahead_start, ra->ahead_size);
- check_ra_success(ra, ra->ahead_size,
- actual, orig_next_size);
- }
+ }else {
+ /* do nothing, read contained in current window and ahead
+ * window populated just update the prev_page pointer.
+ */
+ ra->prev_page = offset + newsize -1;
}
}
out:
- return;
+ return (newsize);
}
-EXPORT_SYMBOL(page_cache_readahead);
-

/*
* handle_ra_miss() is called when it is known that a page which should have
* been present in the pagecache (we just did some readahead there) was in fact
* not found. This will happen if it was evicted by the VM (readahead
- * thrashing) or if the readahead window is maximally shrunk.
+ * thrashing)
*
- * If the window has been maximally shrunk (next_size == -1UL) then look to see
- * if we are getting misses against sequential file offsets. If so, and this
- * persists then resume readahead.
- *
- * Otherwise we're thrashing, so shrink the readahead window by three pages.
- * This is because it is grown by two pages on a readahead hit. Theory being
- * that the readahead window size will stabilise around the maximum level at
- * which there is no thrashing.
+ * turn on the cache miss flag in the RA struct, this will cause the RA code
+ * to reduce the RA size on the next read.
*/
void handle_ra_miss(struct address_space *mapping,
struct file_ra_state *ra, pgoff_t offset)
{
- if (ra->next_size == -1UL) {
- const unsigned long max = get_max_readahead(ra);
-
- if (offset != ra->prev_page + 1) {
- ra->size = ra->size?ra->size-1:0; /* Not sequential */
- } else {
- ra->size++; /* A sequential read */
- if (ra->size >= max) { /* Resume readahead */
- ra->start = offset - max;
- ra->next_size = max;
- ra->size = max;
- ra->ahead_start = 0;
- ra->ahead_size = 0;
- ra->average = max / 2;
- }
- }
- ra->prev_page = offset;
- } else {
- const unsigned long min = get_min_readahead(ra);
-
- ra->next_size -= 3;
- if (ra->next_size < min)
- ra->next_size = min;
- }
+ ra->flags |= RA_FLAG_MISS;
}

/*
--- linux-2.6.9-rc2/mm/filemap.org.c 2004-09-17 16:11:21.000000000 -0500
+++ linux-2.6.9-rc2/mm/filemap.c 2004-09-21 13:45:25.000000000 -0500
@@ -717,14 +717,15 @@
read_actor_t actor)
{
struct inode *inode = mapping->host;
- unsigned long index, end_index, offset;
+ unsigned long index, end_index, offset, req_size, next_index;
loff_t isize;
struct page *cached_page;
int error;
struct file_ra_state ra = *_ra;

cached_page = NULL;
- index = *ppos >> PAGE_CACHE_SHIFT;
+ next_index = index = *ppos >> PAGE_CACHE_SHIFT;
+ req_size = (desc->count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;

isize = i_size_read(inode);
@@ -734,7 +735,7 @@
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
- unsigned long nr, ret;
+ unsigned long ret_size, nr, ret;

/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
@@ -749,7 +750,12 @@
nr = nr - offset;

cond_resched();
- page_cache_readahead(mapping, &ra, filp, index);
+ if(index == next_index && req_size) {
+ ret_size = page_cache_readahead(mapping, &ra,
+ filp, index, req_size);
+ next_index += ret_size;
+ req_size -= ret_size;
+ }

find_page:
page = find_get_page(mapping, index);
@@ -1195,7 +1201,7 @@
* For sequential accesses, we use the generic readahead logic.
*/
if (VM_SequentialReadHint(area))
- page_cache_readahead(mapping, ra, file, pgoff);
+ page_cache_readahead(mapping, ra, file, pgoff, 1);

/*
* Do we have something in the page cache already?


Attachments:
2.6.9-rc2-mm1-newra.patch (19.60 kB)

2004-09-23 22:21:59

by Joel Schopp

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

> The key design point of the new design is to make the readahead code
> aware of the size of the I/O request. This change eliminates the need
> for treating large random I/O as sequential and all of the averaging
> code that exists just to support this. In addition to this change, the
> new design ramps up quicker, and shuts off faster. This, combined with
> the request size awareness eliminates the so called "slow read path"
> that we try so hard to avoid in the current code. For complete details
> on the design of the new readahead logic, please refer to
> http://www-124.ibm.com/developerworks/opensource/linuxperf/readahead/read-ahead-design.pdf

I do love the design. Certainly an improvement over what we currently
have in all ways I can think of.

>
>
> There are a few exception cases which still concern me.
> 1. pages already in cache
> 2. I/O queue congestion.
> 3. page stealing

Correct me if I am wrong on this. But if there was simultaneous
multithreaded reading on a single file would it look non sequential and
not trigger readahead? For instance if on a 4 processor system 4
threads were each reading 1/4 of the single file sequentially. Would
this be exception case number 4?

I don't know if this is a common scenario, just a plausible one. I also
don't know a good way to deal with it. Current code is just as
susceptible to it in any case. Nothing to hold up putting this in, just
curious if somebody knows how we might deal with it properly.

2004-09-24 00:28:25

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Steven Pratt wrote:

> There are a few exception cases which still concern me.
> 1. pages already in cache
> 2. I/O queue congestion.

I've always thought readahead should be done whether the IO queue is
congested or not.

2004-09-24 02:46:21

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Steven Pratt <[email protected]> wrote:
>
> The readahead code has undergone many changes in the 2.6 kernel and the
> current implementation is in my opinion obtuse and hard to maintain.

It did get a bit ugly - it was intially designed to handle pagefault
readaround and perhaps could be further simplified as we're now doing that
independently.

> would like to offer up an alternative simplified design which will not
> only make the code easier to maintain,

We won't know that until all functionality is in place.

> but as performance tests have
> shown, results in better performance in many cases.

And we won't know that until the bugs whcih you've identified in the current
code are fixed.

> We are very interested in having others review and try out the code and
> run whatever performance tests they see fit.
>
> Quick overview of the new design:
>
> The key design point of the new design is to make the readahead code
> aware of the size of the I/O request.

The advantage of the current page-at-a-time code is that the readahead code
behaves exactly the same, whether the application is doing 256 4k reads or
one 1M read. Plus it fits the old pagefault requirement.

> This change eliminates the need
> for treating large random I/O as sequential and all of the averaging
> code that exists just to support this. In addition to this change, the
> new design ramps up quicker, and shuts off faster. This, combined with
> the request size awareness eliminates the so called "slow read path"
> that we try so hard to avoid in the current code. For complete details
> on the design of the new readahead logic, please refer to
> http://www-124.ibm.com/developerworks/opensource/linuxperf/readahead/read-ahead-design.pdf
>
> There are a few exception cases which still concern me.
>
> 1. pages already in cache

Yes, we need to handle this. All pages in cache with lots of CPUs
hammering the same file is a common case.

Maybe not so significant on small x86, but on large power4 with a higher
lock-versus-memcpy cost ratio, that extra locking will hurt.

> 2. I/O queue congestion.

I forget why we did this.

> 3. page stealing

Please don't rename things - it gets confusing. This is called "readahead
thrashing".

> The first of these is a file already residing in page cache. If we do
> not code for this case we will end up doing multiple page lookups for
> each page. The current code tries to handle this using the
> check_ra_success function, but this code does not work.

Are you sure? Did you try to fix it?

> check_ra_success will subtract 1 page each time an I/O is completely
> contained in page cache, however on the main path we will increment the
> window size by 2 for each page in the request (up to max_readahead) thus
> negating the reduction. My question is what is the right behavior.
> Reducing the size of the ahead window doesn't help. You must turn off
> readahead to have any effect. Once you believe all pages to be in page
> cache we should just immediately turn off readahead. What is this
> trigger point? 4 I/Os in a row? 400?

Hard call.

> My concern is that the savings
> for skipping the double lookup appears to be on the order of .5% CPU in
> my tests,

What were the tests, and on what sort of machine?

> but the penalty for small I/O in sequential read case can be
> substantial. Currently the new code does not handle this case, but it
> could be enhanced to do so.
>
> The second case is on queue congestion. Current code does not submit
> the I/O if the queue is congested. This will result in each page being
> read serially on the cache miss path. Does submitting 32 4k I/Os
> instead of 1 128k I/O help queue congestion? Here is one place where
> the current cod gets real confusing. We reduce the window by 1 page for
> queue congestion(treated the same as page cache hit), but we leave the
> information about the current and ahead windows alone even though we did
> not issue the I/O to populate it and so we will never issue the I/O from
> the readahead code. Eventually we will start taking cache misses since
> no one read the pages. That code decrements the window size by 3, but
> as in the cache hit case since we are still reading sequentially we keep
> incrementing by 2 for each page; net effect -1 for each page not in
> cache. Again, the new code ignores the congestion case and still
> tries to do readahead, thus minimizing/optimizing the I/O requests sent
> to the device. Is this right? If not what should we do?

Sounds like the current code got itself broken during the fixups for the
seeky-read case.

I do think we should skip the I/O for POSIX_FADV_WILLNEED against a
congested queue. I can't immediately think of a good reason for skipping
the I/O for normal readahead.

> The third exception case is page stealing where the page into which

"readahead thrashing".

> readahead was done is reclaimed before the data is copied to user
> space. This would seem to be a somewhat rare case only happening under
> sever memory pressure,

No. The problematic case here is many files being read slowly. Such as an
ftp server with many users across slow connections.

We ended up deciding that the correct behaviour here is to reduce the
readahead window size on all files to that point at which the thrashing
ceases, and no further.

> but my tests have shown that it can occur quite
> frequently with as little as 4 threads doing 1M readahead or 16 threads
> doing 128k readahead on a machine with 1GB memory of which 950MB is page
> cache.

Try one process serving 1000 files slowly on a 64MB machine. Something
like that.

> Here it would seem the right thing to do is shrink the window
> size and reduce the chance for page stealing, this however kill I/O
> performance if done to aggressively. Again the current code may not
> perform as expected. As in the 2 previous cases, the -3 is offset by a
> +2 and so unless > 2/3 of pages in a given window are stolen the net
> effect is to ignore the page stealing. New code will slowly shrink the
> window as long as stealing occurs, but will quickly regrow once it stops.

Again, if the current code is bust we should fix it up before we can
validly compare its performance with new code.

> Performance:

As far as I can tell, none of the bugs which you've identified will come
into play with this testing, yes? If so then the testing is valid.
Otherwise it's kinda pointless.

> Graph lines with "new7" are the new readahead code, all others are the
> stock kernel.

Lots of info there. It would be useful if you could summarise the results
for those who are disinclined to wade through the graphs, thanks.

The patch needs cleaning up: coding style consistency, use hard tabs and
not spaces, fit it into 80 columns.

2004-09-24 05:00:48

by Peter Chubb

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

>>>>> "Andrew" == Andrew Morton <[email protected]> writes:

Andrew> Steven Pratt <[email protected]> wrote:
>> The readahead code has undergone many changes in the 2.6 kernel
>> and the current implementation is in my opinion obtuse and hard to
>> maintain.

Andrew> It did get a bit ugly - it was intially designed to handle
Andrew> pagefault readaround and perhaps could be further simplified
Andrew> as we're now doing that independently.

If you're coding up new readahead schemes, it may be worth taking into
account Papathanasiou and Scott, `Energy Efficient Prefetching and
Caching'
( http://www.usenix.org/events/usenix04/tech/general/papathanasiou/papathanasiou_html/index.html
)

which describes tuning of readahead for optimum disk energy usage,
while not compromising performance.

Here's the abstract:

Traditional disk management strategies--prefetching and caching
in particular--are designed to maximize performance. In mobile
systems they conflict with strategies that attempt to save
energy by powering down the disk when it is idle. We present
new rules for prefetching and caching that maximize power-down
opportunities (without performance loss) by creating an access
pattern characterized by intense bursts of activity separated
by long idle times. We also describe an automatic system that
monitors past application behavior in order to generate
appropriate prefetching hints, and a general system of kernel
enhancements that coordinate I/O activity across all running
applications.

We have implemented our system in the Linux kernel, and have
measured its performance and energy consumption via physical
instrumentation of a running laptop. We describe our
implementation and present quantitative results. For workloads
including a mix of sequential access to large files
(multimedia), concurrent access to large numbers of files
(compilation), and random access to large files (speech
recognition), we report disk energy savings of 60-80%, with
negligible loss in throughput or interactive responsiveness.


--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*

2004-09-24 15:37:48

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Andrew Morton wrote:

>Steven Pratt <[email protected]> wrote:
>
>
>>would like to offer up an alternative simplified design which will not
>>only make the code easier to maintain,
>>
>>
>
>We won't know that until all functionality is in place.
>
>
Ok, but both you and Nick indicated that the queue congestion isn't
needed, and the readahead thrashing is in there. So I think the only
piece missing is double lookups on cache hits.

>>but as performance tests have
>>shown, results in better performance in many cases.
>>
>>
>
>And we won't know that until the bugs whcih you've identified in the current
>code are fixed.
>
The results I showed were not the bug paths (mostly).

>
>
>>We are very interested in having others review and try out the code and
>>run whatever performance tests they see fit.
>>
>>Quick overview of the new design:
>>
>>The key design point of the new design is to make the readahead code
>>aware of the size of the I/O request.
>>
>>
>
>The advantage of the current page-at-a-time code is that the readahead code
>behaves exactly the same, whether the application is doing 256 4k reads or
>one 1M read. Plus it fits the old pagefault requirement.
>
>
Yes, but it accomplishes this by possible making the 1M slower. And I
must admit that I don't know what the "old pagefault requirement" is.
Is that something we still need to worry about?

>> This change eliminates the need
>>for treating large random I/O as sequential and all of the averaging
>>code that exists just to support this. In addition to this change, the
>>new design ramps up quicker, and shuts off faster. This, combined with
>>the request size awareness eliminates the so called "slow read path"
>>that we try so hard to avoid in the current code. For complete details
>>on the design of the new readahead logic, please refer to
>>http://www-124.ibm.com/developerworks/opensource/linuxperf/readahead/read-ahead-design.pdf
>>
>>There are a few exception cases which still concern me.
>>
>>1. pages already in cache
>>
>>
>
>Yes, we need to handle this. All pages in cache with lots of CPUs
>hammering the same file is a common case.
>
>Maybe not so significant on small x86, but on large power4 with a higher
>lock-versus-memcpy cost ratio, that extra locking will hurt.
>
>
Ok, we have some data from larger machines. I will collect it all and
summarize separately.

>>2. I/O queue congestion.
>>
>>
>
>I forget why we did this.
>
>
>
>>3. page stealing
>>
>>
>
>Please don't rename things - it gets confusing. This is called "readahead
>thrashing".
>
>
My bad.

>>The first of these is a file already residing in page cache. If we do
>>not code for this case we will end up doing multiple page lookups for
>>each page. The current code tries to handle this using the
>>check_ra_success function, but this code does not work.
>>
>>
>
>Are you sure? Did you try to fix it?
>
>
90% sure. Mostly done through code inspection and looking at the I/O
sizes generated. I though about how to fix it, but never came up with
any solution in the old code. Couldn't figure out how to graft it in
with all the incrementing and decrementing of next_size to where it
would not break something else.

>>check_ra_success will subtract 1 page each time an I/O is completely
>>contained in page cache, however on the main path we will increment the
>>window size by 2 for each page in the request (up to max_readahead) thus
>>negating the reduction. My question is what is the right behavior.
>>Reducing the size of the ahead window doesn't help. You must turn off
>>readahead to have any effect. Once you believe all pages to be in page
>>cache we should just immediately turn off readahead. What is this
>>trigger point? 4 I/Os in a row? 400?
>>
>>
>
>Hard call.
>
>
I know, but we have to come up with something if we really want to avoid
the double lookup.

>
>
>> My concern is that the savings
>>for skipping the double lookup appears to be on the order of .5% CPU in
>>my tests,
>>
>>
>
>What were the tests, and on what sort of machine?
>
>
The .5% was from an 8way pentium IV box. Ran the sysbench sequential
read test on an 800MB file set which fit completely in cache. As I
mentioned above, we tried this on a 16way POWER4, but ran into severe
locking in fget_light. I'll post the data that we have soon.

>>but the penalty for small I/O in sequential read case can be
>>substantial. Currently the new code does not handle this case, but it
>>could be enhanced to do so.
>>
>>The second case is on queue congestion. Current code does not submit
>>the I/O if the queue is congested. This will result in each page being
>>read serially on the cache miss path. Does submitting 32 4k I/Os
>>instead of 1 128k I/O help queue congestion? Here is one place where
>>the current cod gets real confusing. We reduce the window by 1 page for
>>queue congestion(treated the same as page cache hit), but we leave the
>>information about the current and ahead windows alone even though we did
>>not issue the I/O to populate it and so we will never issue the I/O from
>>the readahead code. Eventually we will start taking cache misses since
>>no one read the pages. That code decrements the window size by 3, but
>>as in the cache hit case since we are still reading sequentially we keep
>>incrementing by 2 for each page; net effect -1 for each page not in
>>cache. Again, the new code ignores the congestion case and still
>>tries to do readahead, thus minimizing/optimizing the I/O requests sent
>>to the device. Is this right? If not what should we do?
>>
>>
>
>Sounds like the current code got itself broken during the fixups for the
>seeky-read case.
>
>I do think we should skip the I/O for POSIX_FADV_WILLNEED against a
>congested queue. I can't immediately think of a good reason for skipping
>the I/O for normal readahead.
>
>
Can you expand on the POSIX_FADV_WILLNEED.

>
>
>>The third exception case is page stealing where the page into which
>>
>>
>
>"readahead thrashing".
>
>
>
>>readahead was done is reclaimed before the data is copied to user
>>space. This would seem to be a somewhat rare case only happening under
>>sever memory pressure,
>>
>>
>
>No. The problematic case here is many files being read slowly. Such as an
>ftp server with many users across slow connections.
>
>We ended up deciding that the correct behaviour here is to reduce the
>readahead window size on all files to that point at which the thrashing
>ceases, and no further.
>
>
>
>>but my tests have shown that it can occur quite
>>frequently with as little as 4 threads doing 1M readahead or 16 threads
>>doing 128k readahead on a machine with 1GB memory of which 950MB is page
>>cache.
>>
>>
>
>Try one process serving 1000 files slowly on a 64MB machine. Something
>like that.
>
>
Ok, the code I posted has readahead window shrinking in it which should
handle this case.

>> Here it would seem the right thing to do is shrink the window
>>size and reduce the chance for page stealing, this however kill I/O
>>performance if done to aggressively. Again the current code may not
>>perform as expected. As in the 2 previous cases, the -3 is offset by a
>>+2 and so unless > 2/3 of pages in a given window are stolen the net
>>effect is to ignore the page stealing. New code will slowly shrink the
>>window as long as stealing occurs, but will quickly regrow once it stops.
>>
>>
>
>Again, if the current code is bust we should fix it up before we can
>validly compare its performance with new code.
>
>
If Ram can figure out how to do this then I am all for comparing against
new code. But the problem is the basic logic of how the readahead
window size is calculated in the current code is what is causing all the
problems. I don't see how to fix it without significant re-work of the
logic (which is what this is).

>>Performance:
>>
>>
>
>As far as I can tell, none of the bugs which you've identified will come
>into play with this testing, yes? If so then the testing is valid.
>Otherwise it's kinda pointless.
>
>
The tests I did should have concentrated on the mainline paths, not the
exceptions cases so I believe the comparison is valid. There will be
some cache hits, but since both old and new code is ignoring that in
their current forms, it is a fair comparison.

>
>
>>Graph lines with "new7" are the new readahead code, all others are the
>>stock kernel.
>>
>>
>
>Lots of info there. It would be useful if you could summarise the results
>for those who are disinclined to wade through the graphs, thanks.
>

Ok, thought I did. New code never slower. New code always faster on
multi threaded sequential reads. No change for Random IO.

>
>The patch needs cleaning up: coding style consistency, use hard tabs and
>not spaces, fit it into 80 columns.
>

Ok, will work on that.

Thanks for the input. I'll work on the already in cache path and let
you know what I come up with.

Steve

2004-09-24 16:50:29

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Nick Piggin wrote:

> Steven Pratt wrote:
>
>> Andrew Morton wrote:
>>
>>> Steven Pratt <[email protected]> wrote:
>>>
>>>
>>>> would like to offer up an alternative simplified design which will
>>>> not only make the code easier to maintain,
>>>>
>>>
>>>
>>> We won't know that until all functionality is in place.
>>>
>>
>> Ok, but both you and Nick indicated that the queue congestion isn't
>> needed,
>
> I would have thought that always doing the readahead would provide a
> more graceful degradation, assuming the readahead algorithm is fairly
> accurate, and copes with things like readahead thrashing (which we
> hope is the case).

Yes, that is exactly my thought. I think this is what the new code does.

>
>>> I do think we should skip the I/O for POSIX_FADV_WILLNEED against a
>>> congested queue. I can't immediately think of a good reason for
>>> skipping
>>> the I/O for normal readahead.
>>>
>>
>
> I don't see why you should skip the readahead for FADVISE_WILLNEED
> either. Presumably if someone needs this, they really need it. We
> should aim for optimal behaviour when the apis are being used
> correctly...

Ok, great, since this is what it does.

Thanks, Steve


2004-09-24 16:40:39

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Steven Pratt wrote:
> Andrew Morton wrote:
>
>> Steven Pratt <[email protected]> wrote:
>>
>>
>>> would like to offer up an alternative simplified design which will
>>> not only make the code easier to maintain,
>>>
>>
>>
>> We won't know that until all functionality is in place.
>>
>>
> Ok, but both you and Nick indicated that the queue congestion isn't
> needed,

I would have thought that always doing the readahead would provide a
more graceful degradation, assuming the readahead algorithm is fairly
accurate, and copes with things like readahead thrashing (which we
hope is the case).

>> I do think we should skip the I/O for POSIX_FADV_WILLNEED against a
>> congested queue. I can't immediately think of a good reason for skipping
>> the I/O for normal readahead.
>>

I don't see why you should skip the readahead for FADVISE_WILLNEED
either. Presumably if someone needs this, they really need it. We
should aim for optimal behaviour when the apis are being used correctly...

2004-09-24 22:01:55

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Steven Pratt <[email protected]> wrote:
>
> >
> >The advantage of the current page-at-a-time code is that the readahead code
> >behaves exactly the same, whether the application is doing 256 4k reads or
> >one 1M read. Plus it fits the old pagefault requirement.
> >
> >
> Yes, but it accomplishes this by possible making the 1M slower. And I
> must admit that I don't know what the "old pagefault requirement" is.
> Is that something we still need to worry about?

The "old pagefault requirement": the code in there used to perform
readaround at pagefault time as well as readahead at read() time. Hence it
had to work well for single-page requests. That requirement isn't there
any more but some of the code to support it is still there, perhaps.

> >>
> >>1. pages already in cache
> >>
> >>
> >
> >Yes, we need to handle this. All pages in cache with lots of CPUs
> >hammering the same file is a common case.
> >
> >Maybe not so significant on small x86, but on large power4 with a higher
> >lock-versus-memcpy cost ratio, that extra locking will hurt.
> >
> >
> Ok, we have some data from larger machines. I will collect it all and
> summarize separately.

SDET would be interesting, as well as explicit testing of lots of processes
reading the same fully-cached file.

> >>cache we should just immediately turn off readahead. What is this
> >>trigger point? 4 I/Os in a row? 400?
> >>
> >>
> >
> >Hard call.
> >
> >
> I know, but we have to come up with something if we really want to avoid
> the double lookup.

As long as readahead gets fully disabled at some stage, we should be OK.

We should probably compare i_size with mapping->nrpages at open() time,
too. No point in enabling readahead if it's all cached. But doing that
would make performance testing harder, so do it later.

> >
> >I do think we should skip the I/O for POSIX_FADV_WILLNEED against a
> >congested queue. I can't immediately think of a good reason for skipping
> >the I/O for normal readahead.
> >
> >
> Can you expand on the POSIX_FADV_WILLNEED.

It's an application-specified readahead hint. It should ideally be
asynchronous so the application can get some I/O underway while it's
crunching on something else. If the queue is contested then the
application will accidentally block when launching the readahead, which
kinda defeats the purpose.

Yes, the application will block when it does the subsequent read() anyway,
but applications expect to block in read(). Seems saner this way.

2004-09-24 22:40:39

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Andrew Morton wrote:

>Steven Pratt <[email protected]> wrote:
>
>
>>>The advantage of the current page-at-a-time code is that the readahead code
>>>behaves exactly the same, whether the application is doing 256 4k reads or
>>>one 1M read. Plus it fits the old pagefault requirement.
>>>
>>>
>>>
>>Yes, but it accomplishes this by possible making the 1M slower. And I
>>must admit that I don't know what the "old pagefault requirement" is.
>>Is that something we still need to worry about?
>>
>>
>
>The "old pagefault requirement": the code in there used to perform
>readaround at pagefault time as well as readahead at read() time. Hence it
>had to work well for single-page requests. That requirement isn't there
>any more but some of the code to support it is still there, perhaps.
>
>
>
>>>>1. pages already in cache
>>>>
>>>>
>>>>
>>>>
>>>Yes, we need to handle this. All pages in cache with lots of CPUs
>>>hammering the same file is a common case.
>>>
>>>Maybe not so significant on small x86, but on large power4 with a higher
>>>lock-versus-memcpy cost ratio, that extra locking will hurt.
>>>
>>>
>>>
>>Ok, we have some data from larger machines. I will collect it all and
>>summarize separately.
>>
>>
>
>SDET would be interesting, as well as explicit testing of lots of processes
>reading the same fully-cached file.
>
>
Don't have SDET but we have been working on the multiple processes
reading same file case on a large(16way POWER4 with 128GB) machine. We
had to apply some fdrcu patches to get past problems in fget_light which
were causing 80%spin on the file_lock. We then end up with anout 45%
spin lock on mapping->tree_lock. This is on vanilla rc2. I know you
changed that to a rw lock in your tree and we nee to try that as well..
Data is not consistant enough to make any conclusions, but I don't see a
dramatic change by turning off readahead. I need to do more testing on
this to get better results.

In any case I agree that we should deal with this case.

>>>>cache we should just immediately turn off readahead. What is this
>>>>trigger point? 4 I/Os in a row? 400?
>>>>
>>>>
>>>>
>>>Hard call.
>>>
>>>
>>>
>>I know, but we have to come up with something if we really want to avoid
>>the double lookup.
>>
>>
>
>As long as readahead gets fully disabled at some stage, we should be OK.
>
>
I am attaching a reworked patch which now shuts off readahead if 10M
(arbitrary value for now) of I/O comes from page cache in a row. Any
actual I/O will restart readahead.

>We should probably compare i_size with mapping->nrpages at open() time,
>too. No point in enabling readahead if it's all cached. But doing that
>would make performance testing harder, so do it later.
>
>
Ok. Sounds good.

>
>
>>>I do think we should skip the I/O for POSIX_FADV_WILLNEED against a
>>>congested queue. I can't immediately think of a good reason for skipping
>>>the I/O for normal readahead.
>>>
>>>
>>>
>>>
>>Can you expand on the POSIX_FADV_WILLNEED.
>>
>>
>
>It's an application-specified readahead hint. It should ideally be
>asynchronous so the application can get some I/O underway while it's
>crunching on something else. If the queue is contested then the
>application will accidentally block when launching the readahead, which
>kinda defeats the purpose.
>
>
Well if the app really does this asynchronously, does it matter that we
block?

>Yes, the application will block when it does the subsequent read() anyway,
>but applications expect to block in read(). Seems saner this way.
>
Just to be sure I have this correct, the readahead code will be invoked
once on the POSIX_FADV_WILLNEED request, but this looks mostly like a
regular read, and then again for the same pages on a real read?

Steve


2004-09-24 22:53:42

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

--- linux-2.6.9-rc2/include/linux/mm.h.org 2004-09-17 14:05:27.000000000 -0500
+++ linux-2.6.9-rc2/include/linux/mm.h 2004-09-24 11:40:53.000000000 -0500
@@ -722,15 +722,18 @@
/* readahead.c */
#define VM_MAX_READAHEAD 128 /* kbytes */
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
+#define VM_MAX_CACHE_HIT 2560 /* max pages in a row in cache before
+ * turning readahead off */

int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
unsigned long offset, unsigned long nr_to_read);
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
unsigned long offset, unsigned long nr_to_read);
-void page_cache_readahead(struct address_space *mapping,
+unsigned long page_cache_readahead(struct address_space *mapping,
struct file_ra_state *ra,
struct file *filp,
- unsigned long offset);
+ unsigned long offset,
+ unsigned long size);
void handle_ra_miss(struct address_space *mapping,
struct file_ra_state *ra, pgoff_t offset);
unsigned long max_sane_readahead(unsigned long nr);
--- linux-2.6.9-rc2/include/linux/fs.h.org 2004-09-17 14:05:47.000000000 -0500
+++ linux-2.6.9-rc2/include/linux/fs.h 2004-09-24 10:56:17.000000000 -0500
@@ -557,16 +557,17 @@
struct file_ra_state {
unsigned long start; /* Current window */
unsigned long size;
- unsigned long next_size; /* Next window size */
+ unsigned long flags; /* ra flags RA_FLAG_xxx*/
+ unsigned long cache_hit; /* cache hit count*/
unsigned long prev_page; /* Cache last read() position */
unsigned long ahead_start; /* Ahead window */
unsigned long ahead_size;
- unsigned long currnt_wnd_hit; /* locality in the current window */
- unsigned long average; /* size of next current window */
unsigned long ra_pages; /* Maximum readahead window */
unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
};
+#define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */
+#define RA_FLAG_INCACHE 0x02 /* file is already in cache */

struct file {
struct list_head f_list;
--- linux-2.6.9-rc2/mm/readahead.org.c 2004-09-16 16:02:05.000000000 -0500
+++ linux-2.6.9-rc2/mm/readahead.c 2004-09-24 15:25:44.000000000 -0500
@@ -35,7 +35,6 @@
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
ra->ra_pages = mapping->backing_dev_info->ra_pages;
- ra->average = ra->ra_pages / 2;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

@@ -52,6 +51,62 @@
return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
}

+static inline void ra_off(struct file_ra_state *ra)
+{
+ ra->start=0;
+ ra->flags=0;
+ ra->size=-1UL;
+ ra->ahead_start=0;
+ ra->ahead_size=0;
+ return;
+}
+
+/*
+ * Set the initial window size, round to next power of 2 and square
+ * for small size, x 4 for medium, and x 2 for large
+ * for 128k (32 page) max ra
+ * 1-8 page = 32k initial, > 8 page = 128k initial
+ */
+unsigned long get_init_ra_size(unsigned long size, unsigned long max,
+ unsigned long min)
+{
+ unsigned long s_size=1, newsize;
+ do {
+ s_size = s_size << 1;
+ } while ((size = size >> 1));
+ if (s_size <= max / 64) {
+ newsize = s_size * s_size;
+ } else if (s_size <= max/4) {
+ newsize = max / 4;
+ } else {
+ newsize = max;
+ }
+ return newsize;
+}
+
+/*
+ * Set the new window size, this is called only when
+ * I/O is to be submitted, not for each call to readahead
+ * If cache miss occered, reduce next I/O size, else
+ * increase depending on how close to max we are.
+ */
+unsigned long get_next_ra_size(unsigned long cur, unsigned long max,
+ unsigned long min, unsigned long * flags)
+{
+ unsigned long newsize;
+ if (*flags & RA_FLAG_MISS) {
+ newsize = max((cur - 2),min);
+ *flags &= ~RA_FLAG_MISS;
+ } else if ( cur < max/16 ) {
+ newsize = 4 * cur;
+ } else {
+ newsize = 2 * cur;
+ }
+ newsize = min(newsize, max_sane_readahead(max));
+ return newsize;
+}
+
+
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

/**
@@ -152,19 +207,17 @@
* ahead_size: Together, these form the "ahead window".
* ra_pages: The externally controlled max readahead for this fd.
*
- * When readahead is in the "maximally shrunk" state (next_size == -1UL),
- * readahead is disabled. In this state, prev_page and size are used, inside
- * handle_ra_miss(), to detect the resumption of sequential I/O. Once there
- * has been a decent run of sequential I/O (defined by get_min_readahead),
- * readahead is reenabled.
+ * When readahead is in the off state (size == -1UL),
+ * readahead is disabled. In this state, prev_page is used
+ * to detect the resumption of sequential I/O.
*
* The readahead code manages two windows - the "current" and the "ahead"
* windows. The intent is that while the application is walking the pages
* in the current window, I/O is underway on the ahead window. When the
* current window is fully traversed, it is replaced by the ahead window
* and the ahead window is invalidated. When this copying happens, the
- * new current window's pages are probably still locked. When I/O has
- * completed, we submit a new batch of I/O, creating a new ahead window.
+ * new current window's pages are probably still locked. So
+ * we submit a new batch of I/O immediately, creating a new ahead window.
*
* So:
*
@@ -176,34 +229,25 @@
* ahead window.
*
* A `readahead hit' occurs when a read request is made against a page which is
- * inside the current window. Hits are good, and the window size (next_size)
- * is grown aggressively when hits occur. Two pages are added to the next
- * window size on each hit, which will end up doubling the next window size by
- * the time I/O is submitted for it.
- *
- * If readahead hits are more sparse (say, the application is only reading
- * every second page) then the window will build more slowly.
- *
- * On a readahead miss (the application seeked away) the readahead window is
- * shrunk by 25%. We don't want to drop it too aggressively, because it is a
- * good assumption that an application which has built a good readahead window
- * will continue to perform linear reads. Either at the new file position, or
- * at the old one after another seek.
+ * the next sequential page. Ahead windowe calculations are done only when it
+ * is time to submit a new IO. The code ramps up the size agressively at first,
+ * but slow down as it approaches max_readhead.
*
- * After enough misses, readahead is fully disabled. (next_size = -1UL).
+ * Any seek/ramdom IO will result in readahead being turned off. It will resume
+ * at the first sequential access.
*
* There is a special-case: if the first page which the application tries to
* read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to half of the
- * device maximum.
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
*
* A page request at (start + size) is not a miss at all - it's just a part of
* sequential file reading.
*
- * This function is to be called for every page which is read, rather than when
- * it is time to perform readahead. This is so the readahead algorithm can
- * centrally work out the access patterns. This could be costly with many tiny
- * read()s, so we specifically optimise for that case with prev_page.
+ * This function is to be called for every read request, rather than when
+ * it is time to perform readahead. It is called only oce for the entire I/O
+ * regardless of size unless readahead is unable to start enough I/O to satisfy
+ * the request (I/O request > max_readahead).
*/

/*
@@ -212,7 +256,7 @@
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
- * Returns the number of pages which actually had IO started against them.
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
static inline int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -301,266 +345,160 @@
}

/*
- * This version skips the IO if the queue is read-congested, and will tell the
- * block layer to abandon the readahead if request allocation would block.
- *
- * force_page_cache_readahead() will ignore queue congestion and will block on
- * request queues.
- */
-int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read)
-{
- if (!bdi_read_congested(mapping->backing_dev_info))
- return __do_page_cache_readahead(mapping, filp,
- offset, nr_to_read);
- return 0;
-}
-
-/*
* Check how effective readahead is being. If the amount of started IO is
* less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping. Shrink the window.
+ * readahead isn't helping.
*
- * But don't shrink it too much - the application may read the same page
- * occasionally.
*/
static inline void
-check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
- pgoff_t actual, pgoff_t orig_next_size)
+check_ra_success(struct file_ra_state *ra, unsigned long nr_to_read,
+ unsigned long actual)
{
if (actual == 0) {
- if (orig_next_size > 1) {
- ra->next_size = orig_next_size - 1;
- if (ra->ahead_size)
- ra->ahead_size = ra->next_size;
- } else {
- ra->next_size = -1UL;
- ra->size = 0;
+ ra->cache_hit += nr_to_read;
+ if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
+ ra->flags |= RA_FLAG_INCACHE;
+ ra_off(ra);
}
+ } else {
+ ra->cache_hit=0;
}
+ return;
+}
+
+/*
+ * Issue the I/O. If pages already in cache, increment the hit count until
+ * we exceed max, then turn RA off until we start missing again.
+ */
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ unsigned long offset, unsigned long nr_to_read)
+{
+ return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
}

+
+
+
/*
* page_cache_readahead is the main function. If performs the adaptive
* readahead window size management and submits the readahead I/O.
*/
-void
+unsigned long
page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
- struct file *filp, unsigned long offset)
+ struct file *filp, unsigned long offset,
+ unsigned long req_size)
{
- unsigned max;
- unsigned orig_next_size;
- unsigned actual;
- int first_access=0;
- unsigned long average;
+ unsigned long max, min, maxsane, newsize=req_size;
+ unsigned long actual=0;

/*
* Here we detect the case where the application is performing
* sub-page sized reads. We avoid doing extra work and bogusly
* perturbing the readahead window expansion logic.
- * If next_size is zero, this is the very first read for this
- * file handle, or the window is maximally shrunk.
+ * If size is zero, there is no read ahead window so we need one
*/
- if (offset == ra->prev_page) {
- if (ra->next_size != 0)
- goto out;
+ if (offset == ra->prev_page && req_size == 1 && ra->size != 0) {
+ goto out;
}

- if (ra->next_size == -1UL)
- goto out; /* Maximally shrunk */
-
max = get_max_readahead(ra);
- if (max == 0)
- goto out; /* No readahead */
+ min = get_min_readahead(ra);
+ maxsane = max_sane_readahead(max);
+ newsize = min(req_size, maxsane);
+
+ if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
+ newsize = 1;
+ goto out; /* No readahead or file already in cache*/

- orig_next_size = ra->next_size;
-
- if (ra->next_size == 0) {
- /*
- * Special case - first read.
- * We'll assume it's a whole-file read, and
- * grow the window fast.
- */
- first_access=1;
- ra->next_size = max / 2;
- ra->prev_page = offset;
- ra->currnt_wnd_hit++;
- goto do_io;
- }
-
- ra->prev_page = offset;
-
- if (offset >= ra->start && offset <= (ra->start + ra->size)) {
- /*
- * A readahead hit. Either inside the window, or one
- * page beyond the end. Expand the next readahead size.
- */
- ra->next_size += 2;
-
- if (ra->currnt_wnd_hit <= (max * 2))
- ra->currnt_wnd_hit++;
- } else {
- /*
- * A miss - lseek, pagefault, pread, etc. Shrink the readahead
- * window.
+ /*
+ * Special case - first read.
+ * We'll assume it's a whole-file read if at start of file, and
+ * grow the window fast.
+ * or detect first sequential access
+ */
+ if ((ra->size == 0 && offset == 0) // first io and start of file
+ || (ra->size == -1UL && ra->prev_page == offset-1)) { //1st seq
+ ra->prev_page = offset + newsize-1;
+ ra->size = get_init_ra_size(newsize, max, min);
+ ra->start = offset;
+ actual = do_page_cache_readahead(mapping, filp, offset, ra->size);
+ check_ra_success(ra, ra->size, actual);
+ /* if the request size is larger than our max readahead, we
+ * at least want to be sure that we get 2 IOs if flight and
+ * we know that we will definitly need the new I/O.
+ * once we do this, subsequent calls should be able to overlap IOs,
+ * thus preventing stalls. so Issue the ahead window immediately.
*/
- ra->next_size -= 2;
-
- average = ra->average;
- if (average < ra->currnt_wnd_hit) {
- average++;
+ if (req_size >= max) {
+ ra->ahead_size = get_next_ra_size(ra->size, max, min, &ra->flags);
+ ra->ahead_start = ra->start + ra->size;
+ actual = do_page_cache_readahead(mapping, filp,
+ ra->ahead_start, ra->ahead_size);
+ check_ra_success(ra, ra->ahead_size, actual);
}
- ra->average = (average + ra->currnt_wnd_hit) / 2;
- ra->currnt_wnd_hit = 1;
+ goto out;
}

- if ((long)ra->next_size > (long)max)
- ra->next_size = max;
- if ((long)ra->next_size <= 0L) {
- ra->next_size = -1UL;
- ra->size = 0;
- goto out; /* Readahead is off */
+ /* now handle the random case:
+ * partial page reads and first access were handled above,
+ * so this must be the next page otherwise it is random
+ */
+ if ((offset != (ra->prev_page+1) || (ra->size == 0))) {
+ ra_off(ra);
+ ra->prev_page = offset + newsize-1;
+ actual = do_page_cache_readahead(mapping, filp, offset, newsize);
+ check_ra_success(ra, newsize, actual);
+ goto out;
}

- /*
- * Is this request outside the current window?
+ /* If we get here we are doing sequential IO and this was
+ * not the first occurence (ie we have an existing window)
*/
- if (offset < ra->start || offset >= (ra->start + ra->size)) {
- /*
- * A miss against the current window. Have we merely
- * advanced into the ahead window?
- */
- if (offset == ra->ahead_start) {
- /*
- * Yes, we have. The ahead window now becomes
- * the current window.
- */
- ra->start = ra->ahead_start;
- ra->size = ra->ahead_size;
- ra->prev_page = ra->start;
- ra->ahead_start = 0;
- ra->ahead_size = 0;
-
- /*
- * Control now returns, probably to sleep until I/O
- * completes against the first ahead page.
- * When the second page in the old ahead window is
- * requested, control will return here and more I/O
- * will be submitted to build the new ahead window.
- */
- goto out;
- }
-do_io:
- /*
- * This is the "unusual" path. We come here during
- * startup or after an lseek. We invalidate the
- * ahead window and get some I/O underway for the new
- * current window.
- */
- if (!first_access) {
- /* Heuristic: there is a high probability
- * that around ra->average number of
- * pages shall be accessed in the next
- * current window.
- */
- average = ra->average;
- if (ra->currnt_wnd_hit > average)
- average = (ra->currnt_wnd_hit + ra->average + 1) / 2;

- ra->next_size = min(average , (unsigned long)max);
- }
- ra->start = offset;
- ra->size = ra->next_size;
- ra->ahead_start = 0; /* Invalidate these */
- ra->ahead_size = 0;
- actual = do_page_cache_readahead(mapping, filp, offset,
- ra->size);
- if(!first_access) {
- /*
- * do not adjust the readahead window size the first
- * time, the ahead window might get closed if all
- * the pages are already in the cache.
- */
- check_ra_success(ra, ra->size, actual, orig_next_size);
- }
+ if (ra->ahead_start == 0) { /* no ahead window yet */
+ ra->ahead_size = get_next_ra_size(ra->size, max, min, &ra->flags);
+ ra->ahead_start = ra->start + ra->size;
+ ra->prev_page = offset + newsize - 1;
+ actual = do_page_cache_readahead(mapping, filp,
+ ra->ahead_start, ra->ahead_size);
+ check_ra_success(ra, ra->ahead_size, actual);
} else {
- /*
- * This read request is within the current window. It may be
- * time to submit I/O for the ahead window while the
- * application is about to step into the ahead window.
- */
- if (ra->ahead_start == 0) {
- /*
- * If the average io-size is more than maximum
- * readahead size of the file the io pattern is
- * sequential. Hence bring in the readahead window
- * immediately.
- * If the average io-size is less than maximum
- * readahead size of the file the io pattern is
- * random. Hence don't bother to readahead.
+ /* already have an ahead window, check if we crossed into it */
+ if ((offset + newsize -1) >= ra->ahead_start) {
+ ra->start = ra->ahead_start;
+ ra->size = ra->ahead_size;
+ ra->ahead_start = ra->ahead_start + ra->ahead_size;
+ ra->ahead_size = get_next_ra_size(ra->ahead_size,
+ max, min, &ra->flags);
+ ra->prev_page = offset + newsize - 1;
+ actual = do_page_cache_readahead(mapping, filp,
+ ra->ahead_start, ra->ahead_size);
+ check_ra_success(ra, ra->ahead_size, actual);
+ } else {
+ /* do nothing, read contained in current window and ahead
+ * window populated just update the prev_page pointer.
*/
- average = ra->average;
- if (ra->currnt_wnd_hit > average)
- average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-
- if (average > max) {
- ra->ahead_start = ra->start + ra->size;
- ra->ahead_size = ra->next_size;
- actual = do_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size);
- check_ra_success(ra, ra->ahead_size,
- actual, orig_next_size);
- }
+ ra->prev_page = offset + newsize -1;
}
}
-out:
- return;
+ out:
+ return(newsize);
}
-EXPORT_SYMBOL(page_cache_readahead);
-

/*
* handle_ra_miss() is called when it is known that a page which should have
* been present in the pagecache (we just did some readahead there) was in fact
* not found. This will happen if it was evicted by the VM (readahead
- * thrashing) or if the readahead window is maximally shrunk.
+ * thrashing)
*
- * If the window has been maximally shrunk (next_size == -1UL) then look to see
- * if we are getting misses against sequential file offsets. If so, and this
- * persists then resume readahead.
- *
- * Otherwise we're thrashing, so shrink the readahead window by three pages.
- * This is because it is grown by two pages on a readahead hit. Theory being
- * that the readahead window size will stabilise around the maximum level at
- * which there is no thrashing.
+ * turn on the cache miss flag in the RA struct, this will cause the RA code
+ * to reduce the RA size on the next read.
*/
void handle_ra_miss(struct address_space *mapping,
- struct file_ra_state *ra, pgoff_t offset)
+ struct file_ra_state *ra, pgoff_t offset)
{
- if (ra->next_size == -1UL) {
- const unsigned long max = get_max_readahead(ra);
-
- if (offset != ra->prev_page + 1) {
- ra->size = ra->size?ra->size-1:0; /* Not sequential */
- } else {
- ra->size++; /* A sequential read */
- if (ra->size >= max) { /* Resume readahead */
- ra->start = offset - max;
- ra->next_size = max;
- ra->size = max;
- ra->ahead_start = 0;
- ra->ahead_size = 0;
- ra->average = max / 2;
- }
- }
- ra->prev_page = offset;
- } else {
- const unsigned long min = get_min_readahead(ra);
-
- ra->next_size -= 3;
- if (ra->next_size < min)
- ra->next_size = min;
- }
+ ra->flags |= RA_FLAG_MISS;
+ ra->flags &= ~RA_FLAG_INCACHE;
}

/*
--- linux-2.6.9-rc2/mm/filemap.org.c 2004-09-17 16:11:21.000000000 -0500
+++ linux-2.6.9-rc2/mm/filemap.c 2004-09-21 13:45:25.000000000 -0500
@@ -717,14 +717,15 @@
read_actor_t actor)
{
struct inode *inode = mapping->host;
- unsigned long index, end_index, offset;
+ unsigned long index, end_index, offset, req_size, next_index;
loff_t isize;
struct page *cached_page;
int error;
struct file_ra_state ra = *_ra;

cached_page = NULL;
- index = *ppos >> PAGE_CACHE_SHIFT;
+ next_index = index = *ppos >> PAGE_CACHE_SHIFT;
+ req_size = (desc->count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;

isize = i_size_read(inode);
@@ -734,7 +735,7 @@
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
- unsigned long nr, ret;
+ unsigned long ret_size, nr, ret;

/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
@@ -749,7 +750,12 @@
nr = nr - offset;

cond_resched();
- page_cache_readahead(mapping, &ra, filp, index);
+ if(index == next_index && req_size) {
+ ret_size = page_cache_readahead(mapping, &ra,
+ filp, index, req_size);
+ next_index += ret_size;
+ req_size -= ret_size;
+ }

find_page:
page = find_get_page(mapping, index);
@@ -1195,7 +1201,7 @@
* For sequential accesses, we use the generic readahead logic.
*/
if (VM_SequentialReadHint(area))
- page_cache_readahead(mapping, ra, file, pgoff);
+ page_cache_readahead(mapping, ra, file, pgoff, 1);

/*
* Do we have something in the page cache already?


Attachments:
newramm.patch (21.28 kB)

2004-09-24 22:55:00

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Peter Chubb wrote:

>>>>>>"Andrew" == Andrew Morton <[email protected]> writes:
>>>>>>
>>>>>>
>
>Andrew> Steven Pratt <[email protected]> wrote:
>
>
>>> The readahead code has undergone many changes in the 2.6 kernel
>>>and the current implementation is in my opinion obtuse and hard to
>>>maintain.
>>>
>>>
>
>Andrew> It did get a bit ugly - it was intially designed to handle
>Andrew> pagefault readaround and perhaps could be further simplified
>Andrew> as we're now doing that independently.
>
>If you're coding up new readahead schemes, it may be worth taking into
>account Papathanasiou and Scott, `Energy Efficient Prefetching and
>Caching'
>( http://www.usenix.org/events/usenix04/tech/general/papathanasiou/papathanasiou_html/index.html
>)
>
>
Have not had time to look into this yet, but I will.

Thanks, Steve


2004-09-24 22:58:07

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Steven Pratt <[email protected]> wrote:
>
> >It's an application-specified readahead hint. It should ideally be
> >asynchronous so the application can get some I/O underway while it's
> >crunching on something else. If the queue is contested then the
> >application will accidentally block when launching the readahead, which
> >kinda defeats the purpose.
> >
> >
> Well if the app really does this asynchronously, does it matter that we
> block?

?? Not sure what you mean.

posix_fadvise(POSIX_FADV_WILLNEED) is used by applications to tell the
kernel that the application will need that part of the file in the future.
Presumably, the application has something else to be going on with
meanwhile. Hence the application doesn't want to block.

> >Yes, the application will block when it does the subsequent read() anyway,
> >but applications expect to block in read(). Seems saner this way.
> >
> Just to be sure I have this correct, the readahead code will be invoked
> once on the POSIX_FADV_WILLNEED request, but this looks mostly like a
> regular read, and then again for the same pages on a real read?


yup. POSIX_FADV_WILLNEED should just populate pagecache and should launch
asynchronous I/O.

2004-09-25 00:45:16

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Andrew Morton wrote:
> Steven Pratt <[email protected]> wrote:

>>Can you expand on the POSIX_FADV_WILLNEED.
>
>
> It's an application-specified readahead hint. It should ideally be
> asynchronous so the application can get some I/O underway while it's
> crunching on something else. If the queue is contested then the
> application will accidentally block when launching the readahead, which
> kinda defeats the purpose.
>
> Yes, the application will block when it does the subsequent read() anyway,
> but applications expect to block in read(). Seems saner this way.

Good point. I guess you're right.

2004-09-25 01:02:25

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On Thu, 23 Sep 2004, Steven Pratt wrote:

> The readahead code has undergone many changes in the 2.6 kernel and the
> current implementation is in my opinion obtuse and hard to maintain. We
> would like to offer up an alternative simplified design which will not
> only make the code easier to maintain, but as performance tests have
> shown, results in better performance in many cases.
>
> We are very interested in having others review and try out the code and
> run whatever performance tests they see fit.
>
> Quick overview of the new design:
>
> The key design point of the new design is to make the readahead code
> aware of the size of the I/O request. This change eliminates the need
> for treating large random I/O as sequential and all of the averaging
> code that exists just to support this. In addition to this change, the
> new design ramps up quicker, and shuts off faster. This, combined with
> the request size awareness eliminates the so called "slow read path"
> that we try so hard to avoid in the current code. For complete details
> on the design of the new readahead logic, please refer to
> http://www-124.ibm.com/developerworks/opensource/linuxperf/readahead/read-ahead-design.pdf
>
> There are a few exception cases which still concern me.
>
> 1. pages already in cache
> 2. I/O queue congestion.
> 3. page stealing
>
> The first of these is a file already residing in page cache. If we do
> not code for this case we will end up doing multiple page lookups for
> each page. The current code tries to handle this using the
> check_ra_success function, but this code does not work.
> check_ra_success will subtract 1 page each time an I/O is completely
> contained in page cache, however on the main path we will increment the
> window size by 2 for each page in the request (up to max_readahead) thus
> negating the reduction. My question is what is the right behavior.

Not exactly true. If you look at the the parameters of check_ra_success()
it takes orig_next_size as its third parameter. And orig_next_size is
initialized to next_size right at the beginning of the function.

So if the pages are already in the page cache , the next_size is decremented
effectively by 1.



> Reducing the size of the ahead window doesn't help. You must turn off
> readahead to have any effect. Once you believe all pages to be in page
> cache we should just immediately turn off readahead. What is this
> trigger point? 4 I/Os in a row? 400? My concern is that the savings
> for skipping the double lookup appears to be on the order of .5% CPU in
> my tests, but the penalty for small I/O in sequential read case can be
> substantial. Currently the new code does not handle this case, but it
> could be enhanced to do so.

Currently the code does handle this automatically. If the size of the
next_size is 'n' then if the next 'n' window reads are already
in the page cache the readahead turns off. The problem with shutting
down readahead the first time when all the pages are in the cache, is that
there you will end up in the slow read path and it becomes miserable.
Because then onwards only one page gets read at a time even though
the read request is for larger number of pages. This behavior needs to
be fixed.



>
> The second case is on queue congestion. Current code does not submit
> the I/O if the queue is congested. This will result in each page being
> read serially on the cache miss path. Does submitting 32 4k I/Os
> instead of 1 128k I/O help queue congestion? Here is one place where
> the current cod gets real confusing. We reduce the window by 1 page for
> queue congestion(treated the same as page cache hit), but we leave the
> information about the current and ahead windows alone even though we did
> not issue the I/O to populate it and so we will never issue the I/O from
> the readahead code. Eventually we will start taking cache misses since
> no one read the pages. That code decrements the window size by 3, but
> as in the cache hit case since we are still reading sequentially we keep
> incrementing by 2 for each page; net effect -1 for each page not in
> cache. Again, the new code ignores the congestion case and still
> tries to do readahead, thus minimizing/optimizing the I/O requests sent
> to the device. Is this right? If not what should we do?

Yes. The code is currently not differentiating
queue congestion against 'pages already in the page cache'.

I think if the queue is congested and if we are trying to populate
the current window, we better wait by calling schedule(),
and if we are trying to populate readahead window, we can return
immediately resetting the ahead_size and ahead_start?



>
> The third exception case is page stealing where the page into which
> readahead was done is reclaimed before the data is copied to user
> space. This would seem to be a somewhat rare case only happening under
> sever memory pressure, but my tests have shown that it can occur quite
> frequently with as little as 4 threads doing 1M readahead or 16 threads
> doing 128k readahead on a machine with 1GB memory of which 950MB is page
> cache. Here it would seem the right thing to do is shrink the window
> size and reduce the chance for page stealing, this however kill I/O
> performance if done to aggressively. Again the current code may not

I have seen pages not up2date on most of my stress tests. But have not observed
page frames being stolen before the pages are accessed.


> perform as expected. As in the 2 previous cases, the -3 is offset by a
> +2 and so unless > 2/3 of pages in a given window are stolen the net
> effect is to ignore the page stealing. New code will slowly shrink the
> window as long as stealing occurs, but will quickly regrow once it stops.



I have not looked in detail at your design document. But I am sure will
have positive points.

The biggest problem I have seen with the current code is:
1. slow read path is really slow, should be avoided at all costs.
2. the ramp up time for sequential reads is slow and can be made fast
if the size parameter is passed.
3. max readahead window sizes are too low by default.

Does it make sense to make some infrastructure where multiple readahead
algorithms can be made available with a command line option to enable
one of them. Something like the i/o scheduler?

RP

2004-09-25 06:08:11

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On Fri, 24 Sep 2004, Ram Pai wrote:

> On Thu, 23 Sep 2004, Steven Pratt wrote:
>
> > The readahead code has undergone many changes in the 2.6 kernel and the
> > current implementation is in my opinion obtuse and hard to maintain. We
> > would like to offer up an alternative simplified design which will not
> > only make the code easier to maintain, but as performance tests have
> > shown, results in better performance in many cases.
> >
> > We are very interested in having others review and try out the code and
> > run whatever performance tests they see fit.
> >
> > Quick overview of the new design:
> >
> > The key design point of the new design is to make the readahead code
> > aware of the size of the I/O request. This change eliminates the need
> > for treating large random I/O as sequential and all of the averaging
> > code that exists just to support this. In addition to this change, the
> > new design ramps up quicker, and shuts off faster. This, combined with
> > the request size awareness eliminates the so called "slow read path"
> > that we try so hard to avoid in the current code. For complete details
> > on the design of the new readahead logic, please refer to
> > http://www-124.ibm.com/developerworks/opensource/linuxperf/readahead/read-ahead-design.pdf
> >
> > There are a few exception cases which still concern me.
> >
> > 1. pages already in cache
> > 2. I/O queue congestion.
> > 3. page stealing
> >
> > The first of these is a file already residing in page cache. If we do
> > not code for this case we will end up doing multiple page lookups for
> > each page. The current code tries to handle this using the
> > check_ra_success function, but this code does not work.
> > check_ra_success will subtract 1 page each time an I/O is completely
> > contained in page cache, however on the main path we will increment the
> > window size by 2 for each page in the request (up to max_readahead) thus
> > negating the reduction. My question is what is the right behavior.
>
> Not exactly true. If you look at the the parameters of check_ra_success()
> it takes orig_next_size as its third parameter. And orig_next_size is
> initialized to next_size right at the beginning of the function.
>
> So if the pages are already in the page cache , the next_size is decremented
> effectively by 1.

Ah..I misread what you said. Right the first time the next_size is decremented
but since those pages will reside in the current window, next time onwards
since the pages are already in the page cache the next_size keeps incrementing.
Hence check_ra_success() effectively becomes useless.

Looked at your code to see how you handled page cache hits, and if my reading is
correct you don't handle it at all?



>
>
>
> > Reducing the size of the ahead window doesn't help. You must turn off
> > readahead to have any effect. Once you believe all pages to be in page
> > cache we should just immediately turn off readahead. What is this
> > trigger point? 4 I/Os in a row? 400? My concern is that the savings
> > for skipping the double lookup appears to be on the order of .5% CPU in
> > my tests, but the penalty for small I/O in sequential read case can be
> > substantial. Currently the new code does not handle this case, but it
> > could be enhanced to do so.

ok. so you don't handle the case too.

>
> Currently the code does handle this automatically. If the size of the
> next_size is 'n' then if the next 'n' window reads are already
> in the page cache the readahead turns off. The problem with shutting
> down readahead the first time when all the pages are in the cache, is that
> there you will end up in the slow read path and it becomes miserable.
> Because then onwards only one page gets read at a time even though
> the read request is for larger number of pages. This behavior needs to
> be fixed.
>
>
>
> >
> > The second case is on queue congestion. Current code does not submit
> > the I/O if the queue is congested. This will result in each page being
> > read serially on the cache miss path. Does submitting 32 4k I/Os
> > instead of 1 128k I/O help queue congestion? Here is one place where
> > the current cod gets real confusing. We reduce the window by 1 page for
> > queue congestion(treated the same as page cache hit), but we leave the
> > information about the current and ahead windows alone even though we did
> > not issue the I/O to populate it and so we will never issue the I/O from
> > the readahead code. Eventually we will start taking cache misses since
> > no one read the pages. That code decrements the window size by 3, but
> > as in the cache hit case since we are still reading sequentially we keep
> > incrementing by 2 for each page; net effect -1 for each page not in
> > cache. Again, the new code ignores the congestion case and still
> > tries to do readahead, thus minimizing/optimizing the I/O requests sent
> > to the device. Is this right? If not what should we do?
>
> Yes. The code is currently not differentiating
> queue congestion against 'pages already in the page cache'.
>
> I think if the queue is congested and if we are trying to populate
> the current window, we better wait by calling schedule(),
> and if we are trying to populate readahead window, we can return
> immediately resetting the ahead_size and ahead_start?
>
>
>
> >
> > The third exception case is page stealing where the page into which
> > readahead was done is reclaimed before the data is copied to user
> > space. This would seem to be a somewhat rare case only happening under
> > sever memory pressure, but my tests have shown that it can occur quite
> > frequently with as little as 4 threads doing 1M readahead or 16 threads
> > doing 128k readahead on a machine with 1GB memory of which 950MB is page
> > cache. Here it would seem the right thing to do is shrink the window
> > size and reduce the chance for page stealing, this however kill I/O
> > performance if done to aggressively. Again the current code may not
>
> I have seen pages not up2date on most of my stress tests. But have not observed
> page frames being stolen before the pages are accessed.
>
>
> > perform as expected. As in the 2 previous cases, the -3 is offset by a
> > +2 and so unless > 2/3 of pages in a given window are stolen the net
> > effect is to ignore the page stealing. New code will slowly shrink the
> > window as long as stealing occurs, but will quickly regrow once it stops.

both excessive readahead-thrashing and page-cache-hit imply readahead needs to
be temporarily halted. How about decrementing a counter for every page that is
trashed or is hit in the page cache and incrementing the counter for every
page-misses or no-page-trash. If the counter reaches 0 stop readahead. If the
counter reaches max-readahead resume readahead. something along these lines...


To summarize you noticed 3 problems:

1. page cache hits not handled properly.
2. readahead thrashing not accounted.
3. read congestion not accounted.

Currently both the patches do not handle all the above cases.

So if your patch performs much better than the current one, than
it is the winner anyway. But past experience has shown that some
benchmark gets a hit for any small change. This happens to be tooo big
a change.

The code looks familiar ;)
RP


2004-09-27 15:28:47

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Ram Pai wrote:

>On Fri, 24 Sep 2004, Ram Pai wrote:
>
>
>>On Thu, 23 Sep 2004, Steven Pratt wrote:
>>
>>
>>>The readahead code has undergone many changes in the 2.6 kernel and the
>>>current implementation is in my opinion obtuse and hard to maintain. We
>>>would like to offer up an alternative simplified design which will not
>>>only make the code easier to maintain, but as performance tests have
>>>shown, results in better performance in many cases.
>>>
>>>We are very interested in having others review and try out the code and
>>>run whatever performance tests they see fit.
>>>
>>>Quick overview of the new design:
>>>
>>>The key design point of the new design is to make the readahead code
>>>aware of the size of the I/O request. This change eliminates the need
>>>for treating large random I/O as sequential and all of the averaging
>>>code that exists just to support this. In addition to this change, the
>>>new design ramps up quicker, and shuts off faster. This, combined with
>>>the request size awareness eliminates the so called "slow read path"
>>>that we try so hard to avoid in the current code. For complete details
>>>on the design of the new readahead logic, please refer to
>>>http://www-124.ibm.com/developerworks/opensource/linuxperf/readahead/read-ahead-design.pdf
>>>
>>>There are a few exception cases which still concern me.
>>>
>>>1. pages already in cache
>>>2. I/O queue congestion.
>>>3. page stealing
>>>
>>>The first of these is a file already residing in page cache. If we do
>>>not code for this case we will end up doing multiple page lookups for
>>>each page. The current code tries to handle this using the
>>>check_ra_success function, but this code does not work.
>>>check_ra_success will subtract 1 page each time an I/O is completely
>>>contained in page cache, however on the main path we will increment the
>>>window size by 2 for each page in the request (up to max_readahead) thus
>>>negating the reduction. My question is what is the right behavior.
>>>
>>>
>>Not exactly true. If you look at the the parameters of check_ra_success()
>>it takes orig_next_size as its third parameter. And orig_next_size is
>>initialized to next_size right at the beginning of the function.
>>
>>So if the pages are already in the page cache , the next_size is decremented
>>effectively by 1.
>>
>>
>
>Ah..I misread what you said. Right the first time the next_size is decremented
>but since those pages will reside in the current window, next time onwards
>since the pages are already in the page cache the next_size keeps incrementing.
>Hence check_ra_success() effectively becomes useless.
>
>Looked at your code to see how you handled page cache hits, and if my reading is
>correct you don't handle it at all?
>
>
>
Correct, in the first version I sent I did not handle this. In the
second version I do.

>>>Reducing the size of the ahead window doesn't help. You must turn off
>>>readahead to have any effect. Once you believe all pages to be in page
>>>cache we should just immediately turn off readahead. What is this
>>>trigger point? 4 I/Os in a row? 400? My concern is that the savings
>>>for skipping the double lookup appears to be on the order of .5% CPU in
>>>my tests, but the penalty for small I/O in sequential read case can be
>>>substantial. Currently the new code does not handle this case, but it
>>>could be enhanced to do so.
>>>
>>>
>
>ok. so you don't handle the case too.
>
Right, the code currently ignores queue congestion, but based on Andrew
and Nick's comments I think we will change that slightly.

>>Currently the code does handle this automatically. If the size of the
>>next_size is 'n' then if the next 'n' window reads are already
>>in the page cache the readahead turns off. The problem with shutting
>>down readahead the first time when all the pages are in the cache, is that
>>there you will end up in the slow read path and it becomes miserable.
>>Because then onwards only one page gets read at a time even though
>>the read request is for larger number of pages. This behavior needs to
>>be fixed.
>>
>>
>>
>>
>>>The second case is on queue congestion. Current code does not submit
>>>the I/O if the queue is congested. This will result in each page being
>>>read serially on the cache miss path. Does submitting 32 4k I/Os
>>>instead of 1 128k I/O help queue congestion? Here is one place where
>>>the current cod gets real confusing. We reduce the window by 1 page for
>>>queue congestion(treated the same as page cache hit), but we leave the
>>>information about the current and ahead windows alone even though we did
>>>not issue the I/O to populate it and so we will never issue the I/O from
>>>the readahead code. Eventually we will start taking cache misses since
>>>no one read the pages. That code decrements the window size by 3, but
>>>as in the cache hit case since we are still reading sequentially we keep
>>>incrementing by 2 for each page; net effect -1 for each page not in
>>>cache. Again, the new code ignores the congestion case and still
>>>tries to do readahead, thus minimizing/optimizing the I/O requests sent
>>>to the device. Is this right? If not what should we do?
>>>
>>>
>>Yes. The code is currently not differentiating
>>queue congestion against 'pages already in the page cache'.
>>
>>I think if the queue is congested and if we are trying to populate
>>the current window, we better wait by calling schedule(),
>>and if we are trying to populate readahead window, we can return
>>immediately resetting the ahead_size and ahead_start?
>>
>>
>>
>>
>>
>>>The third exception case is page stealing where the page into which
>>>readahead was done is reclaimed before the data is copied to user
>>>space. This would seem to be a somewhat rare case only happening under
>>>sever memory pressure, but my tests have shown that it can occur quite
>>>frequently with as little as 4 threads doing 1M readahead or 16 threads
>>>doing 128k readahead on a machine with 1GB memory of which 950MB is page
>>>cache. Here it would seem the right thing to do is shrink the window
>>>size and reduce the chance for page stealing, this however kill I/O
>>>performance if done to aggressively. Again the current code may not
>>>
>>>
>>I have seen pages not up2date on most of my stress tests. But have not observed
>>page frames being stolen before the pages are accessed.
>>
>>
>>
>>
>>>perform as expected. As in the 2 previous cases, the -3 is offset by a
>>>+2 and so unless > 2/3 of pages in a given window are stolen the net
>>>effect is to ignore the page stealing. New code will slowly shrink the
>>>window as long as stealing occurs, but will quickly regrow once it stops.
>>>
>>>
>
>both excessive readahead-thrashing and page-cache-hit imply readahead needs to
>be temporarily halted. How about decrementing a counter for every page that is
>trashed or is hit in the page cache and incrementing the counter for every
>page-misses or no-page-trash. If the counter reaches 0 stop readahead. If the
>counter reaches max-readahead resume readahead. something along these lines...
>
>
>
What I do now for page cache hits is count how many pages in a row are
found in page cache when trying to do readahead. If any page is missing
I reset the count to 0 and start over since we want to avoid
fragmented/small I/Os. The limit on how many pages in a row disable
readahead is arbitrarily set at 2560 (10M). I don't know if this is a
good number or not.

>To summarize you noticed 3 problems:
>
>1. page cache hits not handled properly.
>2. readahead thrashing not accounted.
>3. read congestion not accounted.
>
>
Yes.

>Currently both the patches do not handle all the above cases.
>
>
No, thrashing was handled in the first patch, and both thrashing and
page cache hits are handled in the second. Also, it seems to be the
consensus that on normal I/O ignoring queue congestion is the right
behavior.

>So if your patch performs much better than the current one, than
>it is the winner anyway. But past experience has shown that some
>benchmark gets a hit for any small change. This happens to be tooo big
>a change.
>
I agree, we need more people to test this.

>
>The code looks familiar ;)
>RP
>
>
Steve

2004-09-27 15:38:52

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Andrew Morton wrote:

>Steven Pratt <[email protected]> wrote:
>
>
>>>It's an application-specified readahead hint. It should ideally be
>>>asynchronous so the application can get some I/O underway while it's
>>>crunching on something else. If the queue is contested then the
>>>application will accidentally block when launching the readahead, which
>>>kinda defeats the purpose.
>>>
>>>
>>>
>>>posix_fadvise(POSIX_FADV_WILLNEED) is used by applications to tell the
>>>kernel that the application will need that part of the file in the future.
>>>Presumably, the application has something else to be going on with
>>>meanwhile. Hence the application doesn't want to block.
>>>
>>>
Ok, got it.

>>>Yes, the application will block when it does the subsequent read() anyway,
>>>but applications expect to block in read(). Seems saner this way.
>>>
>>>
>>>
>>Just to be sure I have this correct, the readahead code will be invoked
>>once on the POSIX_FADV_WILLNEED request, but this looks mostly like a
>>regular read, and then again for the same pages on a real read?
>>
>>
>
>
>yup. POSIX_FADV_WILLNEED should just populate pagecache and should launch
>asynchronous I/O.
>

Well then this could cause problems (other than congestion) on both the
current and new code since both will effectivly see 2 reads, the second
of which may appear to be a seek backwards thus confusing the code
slightly. Would it be best to just special case the POSIX_FADV_WILLNEED
and issue the I/O required (via do_page_cache_readahead) without
updating any of the window or current page offset information ? Of
course adding the neccesary check for queue congestion.

Steve

2004-09-27 18:42:55

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On Mon, 2004-09-27 at 08:30, Steven Pratt wrote:
> Ram Pai wrote:
>
> >On Fri, 24 Sep 2004, Ram Pai wrote:
> >
> >
> >>On Thu, 23 Sep 2004, Steven Pratt wrote:
> >>
> >>

..snip..

> >To summarize you noticed 3 problems:
> >
> >1. page cache hits not handled properly.
> >2. readahead thrashing not accounted.
> >3. read congestion not accounted.
> >
> >
> Yes.
>
> >Currently both the patches do not handle all the above cases.
> >
> >
> No, thrashing was handled in the first patch, and both thrashing and
> page cache hits are handled in the second. Also, it seems to be the
> consensus that on normal I/O ignoring queue congestion is the right
> behavior.
>
> >So if your patch performs much better than the current one, than
> >it is the winner anyway. But past experience has shown that some
> >benchmark gets a hit for any small change. This happens to be tooo big
> >a change.
> >
> I agree, we need more people to test this.
>
> >


I will fix the 3 problems you discovered in the current code.
And lets compare the two results. However you have more features in
your patch which will be the differentiating factor between the two
versions.

1. exponential increase and decrease of window size
2. overlapped read of current window and readahead window.
3. fixed slow-read path.
4. anything else?

The readsize parameter comes in handy to incorporate the
the above features.

RP

2004-09-27 19:28:35

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Steven Pratt <[email protected]> wrote:
>
> >yup. POSIX_FADV_WILLNEED should just populate pagecache and should launch
> >asynchronous I/O.
> >
>
> Well then this could cause problems (other than congestion) on both the
> current and new code since both will effectivly see 2 reads, the second
> of which may appear to be a seek backwards thus confusing the code
> slightly. Would it be best to just special case the POSIX_FADV_WILLNEED
> and issue the I/O required (via do_page_cache_readahead) without
> updating any of the window or current page offset information ?

That's what we do at present. do_page_cache_readahead() and
force_page_cache_readahead() are low-level functions which do not affect
file->ra_state.

Except whoops. POSIX_FADV_WILLNEED is using force_page_cache_readahead(),
which bypasses the congested check. Wonder how that happened.

<digs out the changeset>

ChangeSet 1.1046.589.14 2003/08/01 10:02:32 [email protected]
[PATCH] rework readahead for congested queues

Since Jens changed the block layer to fail readahead if the queue has no
requests free, a few changes suggest themselves.

- It's a bit silly to go and alocate a bunch of pages, build BIOs for them,
submit the IO only to have it fail, forcing us to free the pages again.

So the patch changes do_page_cache_readahead() to peek at the queue's
read_congested state. If the queue is read-congested we abandon the entire
readahead up-front without doing all that work.

- If the queue is not read-congested, we go ahead and do the readahead,
after having set PF_READAHEAD.

The backing_dev_info's read-congested threshold cuts in when 7/8ths of
the queue's requests are in flight, so it is probable that the readahead
abandonment code in __make_request will now almost never trigger.

- The above changes make do_page_cache_readahead() "unreliable", in that it
may do nothing at all.

However there are some system calls:

- fadvise(POSIX_FADV_WILLNEED)
- madvise(MADV_WILLNEED)
- sys_readahead()

In which the user has an expectation that the kernel will actually
perform the IO.

So the patch creates a new "force_page_cache_readahead()" which will
perform the IO regardless of the queue's congestion state.

Arguably, this is the wrong thing to do: even though the application
requested readahead it could be that the kernel _should_ abandon the user's
request because the disk is so busy.

I don't know. But for now, let's keep the above syscalls behaviour
unchanged. It is trivial to switch back to do_page_cache_readahead()
later.


So there we have it. The reason why normal readahead skips congested
queues is because the block layer will drop the I/O on the floor *anyway*
because it also skips congested queues for readahead I/O requests.

And fadvise() was switched to force_page_cache_readahead() because that was
the old behaviour.

But PF_READAHEAD isn't there any more, and BIO_RW_AHEAD and BIO_RW_BLOCK
are not used anywhere, so we broke that. Jens, do you remember what
happened there?

2004-09-27 20:04:11

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Ram Pai wrote:

>On Mon, 2004-09-27 at 08:30, Steven Pratt wrote:
>
>
>>Ram Pai wrote:
>>
>>
>>>On Fri, 24 Sep 2004, Ram Pai wrote:
>>>
>>>
>>>
>>>>On Thu, 23 Sep 2004, Steven Pratt wrote:
>>>>
>>>>
>>>>
>
> ..snip..
>
>
>>>To summarize you noticed 3 problems:
>>>
>>>1. page cache hits not handled properly.
>>>2. readahead thrashing not accounted.
>>>3. read congestion not accounted.
>>>
>>>
>>>
>>>
>>Yes.
>>
>>
>>>Currently both the patches do not handle all the above cases.
>>>
>>>
>>>
>>>
>>No, thrashing was handled in the first patch, and both thrashing and
>>page cache hits are handled in the second. Also, it seems to be the
>>consensus that on normal I/O ignoring queue congestion is the right
>>behavior.
>>
>>
>>
>>>So if your patch performs much better than the current one, than
>>>it is the winner anyway. But past experience has shown that some
>>>benchmark gets a hit for any small change. This happens to be tooo big
>>>a change.
>>>
>>>
>>>
>>I agree, we need more people to test this.
>>
>>
>>
>
>
>I will fix the 3 problems you discovered in the current code.
>And lets compare the two results.
>
Ok, great.

> However you have more features in
>your patch which will be the differentiating factor between the two
>versions.
>
>1. exponential increase and decrease of window size
>2. overlapped read of current window and readahead window.
>3. fixed slow-read path.
>4. anything else?
>
>
No, I think that is it.

>The readsize parameter comes in handy to incorporate the
>the above features.
>
>
Yes, without it I think you still need to do the average calculations
that you do today.

Also, remember that we did not re-write the readahead code just for the
fun of it. It was simply the most efficient way we came up with to
address the current issues as well as add the new features. The actual
I/O patterns generated are not that different in most cases.

Steve


2004-09-27 20:30:44

by Ray Bryant

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Hi Steve,

On question I have (and I'm sorry, I haven't had time to look at your
patch to sort this out) is what happens if the user supplies a rather
serious I/O size, will you read ahead multiples of that, or what
happens? Or, for that matter, how well will it perform?

I've heard about HPC applications for IRIX that issue a 2GB read. :-)

--
Best Regards,
Ray
-----------------------------------------------
Ray Bryant
512-453-9679 (work) 512-507-7807 (cell)
[email protected] [email protected]
The box said: "Requires Windows 98 or better",
so I installed Linux.
-----------------------------------------------

2004-09-27 21:05:19

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Ray Bryant wrote:

> Hi Steve,
>
> On question I have (and I'm sorry, I haven't had time to look at your
> patch to sort this out) is what happens if the user supplies a rather
> serious I/O size, will you read ahead multiples of that, or what
> happens? Or, for that matter, how well will it perform?

Same behavior as the old code. I/Os are broken up into at most
max_readahead size pieces. In the case of the old code only 1 of these
could be outstanding. In the new code there could be at most 2
outstanding at any point in time.

>
> I've heard about HPC applications for IRIX that issue a 2GB read. :-)

This is why for these types of applications, especially on RAID arrays,
you need to set max_readahead into the MBs. (But that is a different topic).

Steve

2004-09-28 10:16:11

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On Mon, Sep 27 2004, Andrew Morton wrote:
> Steven Pratt <[email protected]> wrote:
> >
> > >yup. POSIX_FADV_WILLNEED should just populate pagecache and should launch
> > >asynchronous I/O.
> > >
> >
> > Well then this could cause problems (other than congestion) on both the
> > current and new code since both will effectivly see 2 reads, the second
> > of which may appear to be a seek backwards thus confusing the code
> > slightly. Would it be best to just special case the POSIX_FADV_WILLNEED
> > and issue the I/O required (via do_page_cache_readahead) without
> > updating any of the window or current page offset information ?
>
> That's what we do at present. do_page_cache_readahead() and
> force_page_cache_readahead() are low-level functions which do not affect
> file->ra_state.
>
> Except whoops. POSIX_FADV_WILLNEED is using force_page_cache_readahead(),
> which bypasses the congested check. Wonder how that happened.
>
> <digs out the changeset>
>
> ChangeSet 1.1046.589.14 2003/08/01 10:02:32 [email protected]
> [PATCH] rework readahead for congested queues
>
> Since Jens changed the block layer to fail readahead if the queue has no
> requests free, a few changes suggest themselves.
>
> - It's a bit silly to go and alocate a bunch of pages, build BIOs for them,
> submit the IO only to have it fail, forcing us to free the pages again.
>
> So the patch changes do_page_cache_readahead() to peek at the queue's
> read_congested state. If the queue is read-congested we abandon the entire
> readahead up-front without doing all that work.
>
> - If the queue is not read-congested, we go ahead and do the readahead,
> after having set PF_READAHEAD.
>
> The backing_dev_info's read-congested threshold cuts in when 7/8ths of
> the queue's requests are in flight, so it is probable that the readahead
> abandonment code in __make_request will now almost never trigger.
>
> - The above changes make do_page_cache_readahead() "unreliable", in that it
> may do nothing at all.
>
> However there are some system calls:
>
> - fadvise(POSIX_FADV_WILLNEED)
> - madvise(MADV_WILLNEED)
> - sys_readahead()
>
> In which the user has an expectation that the kernel will actually
> perform the IO.
>
> So the patch creates a new "force_page_cache_readahead()" which will
> perform the IO regardless of the queue's congestion state.
>
> Arguably, this is the wrong thing to do: even though the application
> requested readahead it could be that the kernel _should_ abandon the user's
> request because the disk is so busy.
>
> I don't know. But for now, let's keep the above syscalls behaviour
> unchanged. It is trivial to switch back to do_page_cache_readahead()
> later.
>
>
> So there we have it. The reason why normal readahead skips congested
> queues is because the block layer will drop the I/O on the floor *anyway*
> because it also skips congested queues for readahead I/O requests.
>
> And fadvise() was switched to force_page_cache_readahead() because that was
> the old behaviour.
>
> But PF_READAHEAD isn't there any more, and BIO_RW_AHEAD and BIO_RW_BLOCK
> are not used anywhere, so we broke that. Jens, do you remember what
> happened there?

Nothing changed from the block layer point of view (in 2.6 or earler,
rw_ahead was always tossed away for congested queues). Why the
read-ahead algorithms dropped PF_READAHEAD or flagging bio_rw_ahead() I
don't know, I haven't worked on that.

--
Jens Axboe

2004-09-29 18:48:31

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On Mon, 27 Sep 2004, Steven Pratt wrote:

> Ram Pai wrote:
>
> >On Mon, 2004-09-27 at 08:30, Steven Pratt wrote:
> >
> >
> >>Ram Pai wrote:
> >>
> >>
> >>>On Fri, 24 Sep 2004, Ram Pai wrote:
> >>>
> >>>
> >>>
> >>>>On Thu, 23 Sep 2004, Steven Pratt wrote:
> >>>>
> >>>>
> >>>>
> >
> > ..snip..
> >
> >
> >>>To summarize you noticed 3 problems:
> >>>
> >>>1. page cache hits not handled properly.
> >>>2. readahead thrashing not accounted.
> >>>3. read congestion not accounted.


I have enclosed 5 patches that address each of the issues.

1 . Code is obtuse and hard to maintain

The best I could do is update the comments to reflect the
current code. Hopefully that should help.

attached patch 1_comment.patch takes care of that part to
some extent.


2. page cache hits not handled properly.

I fixed this by decrementing the size of the next readahead window
by the number of pages hit in the page cache. Now it slowly
accomodates the page cache hits.

attached patch 2_cachehits.patch takes care of this issue.

3. queue congestion not handled.

The fix is: call force_page_cache_readahead() if we are
populating pages in the current window.
And call do_page_cache_readahead() if we are populating
pages in the ahead window. However if do_page_cache_readahead()
return with congestion, the readahead window is collapsed back
to size zero. This will ensure that the next time ahead window
is attempted to populate.

attached patch 3_queuecongestion.patch handles this issue.

4. page thrash handled ineffectively.

The fix is: on page thrash detection shutdown readahead.

attached patch 4_pagethrash.patch handles this issue.

5. slow read path is too slow.

I could not figure out a way to atleast-read-the-requested-
number-of-pages if readahead is shutdown, without incorporating
the readsize parameter to page_cache_readahead(). So had
to pick some of your code in filemap.c to do that. Thanks!

attached patch 5_fixedslowread.patch handles this issue.


Apart from this you have noticed other issues

6. cache lookup done unneccessrily twice for pagecache_hits.

I have not handled this issue currently. But should be doable
if I introducing a flag, which notes when readahead is
shutdown by pagecahche hits. And hence attempts to lookup
the page only once.


And you have other features in your patch which will be the real
differentiating factors.

7. exponential expand and shrink of window sizes.

8. overlapped read of current window and ahead window.

( I think both are desirable feature )

I did run some premilinary tests using your patch and the above patches
and found

your patch was doing slightly better on iozone and sysbench.
however the above patch were doing slightly better with DSS workload.

But my setup is rather tiny compared to your setup, so my comparison
is rather incomplete.

RP


Attachments:
1_comment.patch (5.91 kB)
2_cachehits.patch (1.83 kB)
3_queuecongestion.patch (1.85 kB)
4_pagethrash.patch (1.10 kB)
5_fixedslowread.patch (2.88 kB)
Download all attachments

2004-09-29 22:35:38

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Ram Pai wrote:

snip...

>
>
> I have enclosed 5 patches that address each of the issues.
>
>1 . Code is obtuse and hard to maintain
>
> The best I could do is update the comments to reflect the
> current code. Hopefully that should help.
>
> attached patch 1_comment.patch takes care of that part to
> some extent.
>
>
I was more concerend with the multiple ways in which the readhead
changed, and the fact that we were going down the sequential read path
on random I/O requests. More comments are always good, but that was not
really my concern.

>
>2. page cache hits not handled properly.
>
> I fixed this by decrementing the size of the next readahead window
> by the number of pages hit in the page cache. Now it slowly
> accomodates the page cache hits.
>
> attached patch 2_cachehits.patch takes care of this issue.
>
>
I think this will be way too agressive. This means turn off readhead if
1 max_readhead I/O is completely in cache. You then will need to do
multiple 4k I/Os to get it turned back on.

>3. queue congestion not handled.
>
> The fix is: call force_page_cache_readahead() if we are
> populating pages in the current window.
> And call do_page_cache_readahead() if we are populating
> pages in the ahead window. However if do_page_cache_readahead()
> return with congestion, the readahead window is collapsed back
> to size zero. This will ensure that the next time ahead window
> is attempted to populate.
>
> attached patch 3_queuecongestion.patch handles this issue.
>
>
Yeah, I had thought about something along these lines. Just not sure if
it is worth it.

>4. page thrash handled ineffectively.
>
> The fix is: on page thrash detection shutdown readahead.
>
> attached patch 4_pagethrash.patch handles this issue.
>
>
Same comments as on 2. Way too agressive.

>5. slow read path is too slow.
>
> I could not figure out a way to atleast-read-the-requested-
> number-of-pages if readahead is shutdown, without incorporating
> the readsize parameter to page_cache_readahead(). So had
> to pick some of your code in filemap.c to do that. Thanks!
>
> attached patch 5_fixedslowread.patch handles this issue.
>
>
>
Step in the right direction. Now if I could just get you to pick up the
rest we would be done :-)

>Apart from this you have noticed other issues
>
>6. cache lookup done unneccessrily twice for pagecache_hits.
>
> I have not handled this issue currently. But should be doable
> if I introducing a flag, which notes when readahead is
> shutdown by pagecahche hits. And hence attempts to lookup
> the page only once.
>
>
>
Umm, actually you do (if the code works). When you get too many cache
hits you turn off readahead. This will disable the multiple lookups
until you re-enable readhead which will only happen in handle_ra_miss
which means you are reading pages not in page cache so that is ok.

>And you have other features in your patch which will be the real
>differentiating factors.
>
>7. exponential expand and shrink of window sizes.
>
>8. overlapped read of current window and ahead window.
>
> ( I think both are desirable feature )
>
>
Glad we agree.

>I did run some premilinary tests using your patch and the above patches
>and found
>
>your patch was doing slightly better on iozone and sysbench.
>however the above patch were doing slightly better with DSS workload.
>
>
Care to expand on slightly better? Also these tests don't cover many
cases. You are only running iozone single threaded which won't show
much, you don't seem to vary IO requests sizes to see the effect (all
this based on your readahead web page). Also it would be really helpful
if you had some sort of machine description, disk, memory etc. Also for
the DSS workload I need ot know what type of IO pattern you generate. I
know it is mostly random IO, but the size of the IOs and the number of
prefetcher threads and the number of disks make a huge difference.

Also what is the units in the iozone results? I thought it was in
kbytes which would make your throughputs in the 350-400MB/sec range
which is way more than you could do on a single adapter. So unless I am
really off base here, your IOzone results appear to be mostly cache
reads and thus not really testing readahead. I must have missed something.

>But my setup is rather tiny compared to your setup, so my comparison
>is rather incomplete.
>
>
Not really. I made sure that I ran this on machine from single cpu ide
up through really big machines. It is important to run well across a
variety of platforms which I tried to ensure my code does.


I'll try to run this through my test suite tomorrow, but is there
something you don't like about the new code? You seem to be moving in
that direction. Is there any reason to not make the complete jump and
help out on the (hopefully) simpler code base?

Steve


2004-09-29 23:13:53

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On a readahead-related note, I'm wondering how hard it would be to have
some tunables and/or hooks from the readahead state manchine made
available to the filesystem? With the 2.4 readahead code it was basically
impossible for the filesystem to disable the readahead, I haven't looked
at the 2.6 readahead enough to determine whether we need that or not.

The real issue (reason for turning off RA in 2.4) is that within Lustre
there can be many DLM extent locks for a single file, so client A can
be writing to one part of the file, and client B can be reading from
another part of the same file. With the stock readahead it wouldn't
stay within the lock extent boundaries, and we couldn't turn it off
easily. Having some sort of FS method that says "don't do RA beyond
this offset" would be useful here.

The other problem that Lustre had was that the stock readahead would
send out page reads in small chunks as the window grew instead of
sending out large requests that could be turned into large, efficient
network RPCs. So the desire would be to have some sort of tunable in
the readahead state (per fs or per file) that says "don't submit
another readahead until the window is growing by X pages".

As it is we've basically had to implement our own readahead code within
the filesystem in order to get correct behaviour and good performance.
This is of course not optimal from a code duplication point of view and
also we don't get any benefits from the algorithm improvements being
done here.


The other question is whether the new readahead code takes the latency
of completing read requests into account when determining the size of
the readahead window? Lustre generally runs with very fast disk and
network systems so the size of the readahead window has to be very large
in order to keep the pipeline full to avoid stalling on the client.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://members.shaw.ca/adilger/ http://members.shaw.ca/golinux/


Attachments:
(No filename) (1.97 kB)
(No filename) (189.00 B)
Download all attachments

2004-09-30 01:12:51

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On Wed, 29 Sep 2004, Steven Pratt wrote:

> Ram Pai wrote:
>
> snip...
>
> >
> >
> > I have enclosed 5 patches that address each of the issues.
> >
> >1 . Code is obtuse and hard to maintain
> >
> > The best I could do is update the comments to reflect the
> > current code. Hopefully that should help.
> >
> > attached patch 1_comment.patch takes care of that part to
> > some extent.
> >
> >
> I was more concerend with the multiple ways in which the readhead
> changed, and the fact that we were going down the sequential read path
> on random I/O requests. More comments are always good, but that was not
> really my concern.
>
> >
> >2. page cache hits not handled properly.
> >
> > I fixed this by decrementing the size of the next readahead window
> > by the number of pages hit in the page cache. Now it slowly
> > accomodates the page cache hits.
> >
> > attached patch 2_cachehits.patch takes care of this issue.
> >
> >
> I think this will be way too agressive. This means turn off readhead if
> 1 max_readhead I/O is completely in cache. You then will need to do
> multiple 4k I/Os to get it turned back on.

Having a max_readahead pages completely in cache is pretty good reason
to switch-off readahead. Yes it is aggressive, and the downside of
this is multiple 4k I/o. That is the reason we need the slow-read
patch[patch number 5] to speed up the slow-read path.


>
> >3. queue congestion not handled.
> >
> > The fix is: call force_page_cache_readahead() if we are
> > populating pages in the current window.
> > And call do_page_cache_readahead() if we are populating
> > pages in the ahead window. However if do_page_cache_readahead()
> > return with congestion, the readahead window is collapsed back
> > to size zero. This will ensure that the next time ahead window
> > is attempted to populate.
> >
> > attached patch 3_queuecongestion.patch handles this issue.
> >
> >
> Yeah, I had thought about something along these lines. Just not sure if
> it is worth it.

Well does not seem to hurt!

>
> >4. page thrash handled ineffectively.
> >
> > The fix is: on page thrash detection shutdown readahead.
> >
> > attached patch 4_pagethrash.patch handles this issue.
> >
> >
> Same comments as on 2. Way too agressive.

yes. But think about it. Why did a page get thrashed? there
is memory pressure. Hence better stop reading ahead. Again
yes it is agressive. And to compensate the agression we need the
slowread patch that fixes the slow read path.


>
> >5. slow read path is too slow.
> >
> > I could not figure out a way to atleast-read-the-requested-
> > number-of-pages if readahead is shutdown, without incorporating
> > the readsize parameter to page_cache_readahead(). So had
> > to pick some of your code in filemap.c to do that. Thanks!
> >
> > attached patch 5_fixedslowread.patch handles this issue.
> >
> >
> >
> Step in the right direction. Now if I could just get you to pick up the
> rest we would be done :-)

:-) I wish it was my call.


>
> >Apart from this you have noticed other issues
> >
> >6. cache lookup done unneccessrily twice for pagecache_hits.
> >
> > I have not handled this issue currently. But should be doable
> > if I introducing a flag, which notes when readahead is
> > shutdown by pagecahche hits. And hence attempts to lookup
> > the page only once.
> >
> >
> >
> Umm, actually you do (if the code works). When you get too many cache
> hits you turn off readahead. This will disable the multiple lookups
> until you re-enable readhead which will only happen in handle_ra_miss
> which means you are reading pages not in page cache so that is ok.


yes but with the slowread patch, we attempt to read pages
and then do_generic_mapping_read() anyway attempts to check if the
page frame is there. This is a small optimization. You did mention that you
saw .5% decrease in CPU time. Did'nt you?

>
> >And you have other features in your patch which will be the real
> >differentiating factors.
> >
> >7. exponential expand and shrink of window sizes.
> >
> >8. overlapped read of current window and ahead window.
> >
> > ( I think both are desirable feature )
> >
> >
> Glad we agree.
>
We never disagreed. Did we? :) The concern Andrew had was, have you
tried to fix the problem with the current code, and then check
if your patch does better with the fixed-code.

I am trying to help you get a solid case to get Andrew's acceptance.

So with these set of patches, your algorithm and the current algorithm
are on even playing ground to do some fair comparison.


> >I did run some premilinary tests using your patch and the above patches
> >and found
> >
> >your patch was doing slightly better on iozone and sysbench.
> >however the above patch were doing slightly better with DSS workload.
> >
> >
> Care to expand on slightly better? Also these tests don't cover many
> cases. You are only running iozone single threaded which won't show
> much, you don't seem to vary IO requests sizes to see the effect (all
> this based on your readahead web page). Also it would be really helpful
> if you had some sort of machine description, disk, memory etc.

2proc 700Mhz , 2GBmemory. Ultra160 SCSI disk.

> Also for
> the DSS workload I need ot know what type of IO pattern you generate. I
> know it is mostly random IO, but the size of the IOs and the number of
> prefetcher threads and the number of disks make a huge difference.

256k iosize with 50 prefetchers.

>
> Also what is the units in the iozone results? I thought it was in
> kbytes which would make your throughputs in the 350-400MB/sec range
> which is way more than you could do on a single adapter. So unless I am
> really off base here, your IOzone results appear to be mostly cache
> reads and thus not really testing readahead. I must have missed something.

correct observation. My filesize is 1.2GB and memory size is 2GB so it
will end up being entirely cache based. Good observation. Rerunning
with a larger file.

>
> >But my setup is rather tiny compared to your setup, so my comparison
> >is rather incomplete.
> >
> >
> Not really. I made sure that I ran this on machine from single cpu ide
> up through really big machines. It is important to run well across a
> variety of platforms which I tried to ensure my code does.
>
>
> I'll try to run this through my test suite tomorrow

sure. that will help. And once results are in place, its Andrew's call.
No, these patches are not in competition with yours.
Its just helping you to convince others.

> , but is there
> something you don't like about the new code? You seem to be moving in
> that direction. Is there any reason to not make the complete jump and
> help out on the (hopefully) simpler code base?

Thought I was helping you out, No?

BTW: the last patch 5_fixedslowread.patch accidently did not contain all
the changes. Enclosed the remaining changes.

RP
>
> Steve
>


Attachments:
6_fixedslowread.patch (3.14 kB)

2004-09-30 02:27:04

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On Wed, 2004-09-29 at 16:13, Andreas Dilger wrote:
> On a readahead-related note, I'm wondering how hard it would be to have
> some tunables and/or hooks from the readahead state manchine made
> available to the filesystem? With the 2.4 readahead code it was basically
> impossible for the filesystem to disable the readahead, I haven't looked
> at the 2.6 readahead enough to determine whether we need that or not.
>
The best way currently to shutoff readahead is to poke into the file
descriptors' readahead structure and set ra_pages to 0.


> The real issue (reason for turning off RA in 2.4) is that within Lustre
> there can be many DLM extent locks for a single file, so client A can
> be writing to one part of the file, and client B can be reading from
> another part of the same file. With the stock readahead it wouldn't
> stay within the lock extent boundaries, and we couldn't turn it off
> easily. Having some sort of FS method that says "don't do RA beyond
> this offset" would be useful here.
>
> The other problem that Lustre had was that the stock readahead would
> send out page reads in small chunks as the window grew instead of
> sending out large requests that could be turned into large, efficient
> network RPCs. So the desire would be to have some sort of tunable in
> the readahead state (per fs or per file) that says "don't submit
> another readahead until the window is growing by X pages".
>
> As it is we've basically had to implement our own readahead code within
> the filesystem in order to get correct behaviour and good performance.
> This is of course not optimal from a code duplication point of view and
> also we don't get any benefits from the algorithm improvements being
> done here.
>
>
> The other question is whether the new readahead code takes the latency
> of completing read requests into account when determining the size of
> the readahead window?

No. it does not currently. But taking that into consideration will be
tricky. If Luster does that it would be nice to know how does it.

RP

2004-09-30 02:40:15

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Ram Pai <[email protected]> wrote:
>
> On Wed, 2004-09-29 at 16:13, Andreas Dilger wrote:
> > On a readahead-related note, I'm wondering how hard it would be to have
> > some tunables and/or hooks from the readahead state manchine made
> > available to the filesystem? With the 2.4 readahead code it was basically
> > impossible for the filesystem to disable the readahead, I haven't looked
> > at the 2.6 readahead enough to determine whether we need that or not.
> >
> The best way currently to shutoff readahead is to poke into the file
> descriptors' readahead structure and set ra_pages to 0.

fadvise(fd, POSIX_FADV_RANDOM).

2004-09-30 20:21:29

by Stephen C. Tweedie

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Hi,

On Thu, 2004-09-30 at 00:13, Andreas Dilger wrote:
> On a readahead-related note, I'm wondering how hard it would be to have
> some tunables and/or hooks from the readahead state manchine made
> available to the filesystem? With the 2.4 readahead code it was basically
> impossible for the filesystem to disable the readahead, I haven't looked
> at the 2.6 readahead enough to determine whether we need that or not.

Well, readahead is still done internally by filemap.c without consulting
the individual fs.

> The real issue (reason for turning off RA in 2.4) is that within Lustre
> there can be many DLM extent locks for a single file, so client A can
> be writing to one part of the file, and client B can be reading from
> another part of the same file. With the stock readahead it wouldn't
> stay within the lock extent boundaries, and we couldn't turn it off
> easily. Having some sort of FS method that says "don't do RA beyond
> this offset" would be useful here.

Do you really want that, though? I'd have thought that grabbing a DLM
lock was potentially a high-latency operation, so you might actually
_benefit_ from notification that you want to start acquiring it early.
If it's going to be prohibitively expensive to acquire, though, you'd
still want the option of not doing so.

But there's not really any way for the fs to tell right now whether a
read is for readahead or not. It can _nearly_ do so: if you implement
a_ops->readpages(), then that only gets called for readahead.

Trouble is, that same readahead code is also used to kick off early
reads when the user actually requested a single large read. If the app
reads 100k at once, then do_generic_mapping_read() first kicks off
readahead, then does individual page-sized read_page()s to get the
data. And the readahead code doesn't really know about this at all; it
can't tell what amount of sequential data the user is guaranteed to
want.

ext3 could make use of this sort of information too, btw. Currently,
mpage_readpages() ends up calling the fs get_block() to map all the
pages for reading, before submitting the IO request; and even though
the final IO is async, the get_block() is still fully synchronous. If
ext3 knew it was a readahead, it could potentially queue the read for
the indirect block and return EAGAIN, stopping the readahead at the
point where we don't yet have mapping information in cache.

But you don't want ext3_get_block() returning EAGAIN for synchronous
reads. :-)

> The other problem that Lustre had was that the stock readahead would
> send out page reads in small chunks as the window grew instead of
> sending out large requests that could be turned into large, efficient
> network RPCs. So the desire would be to have some sort of tunable in
> the readahead state (per fs or per file) that says "don't submit
> another readahead until the window is growing by X pages".

Does the current 2.6 ahead-window readahead still show that problem?

--Stephen


2004-10-01 21:17:38

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

Ram Pai wrote:

snip...

>>>>>To summarize you noticed 3 problems:
>>>>>
>>>>>1. page cache hits not handled properly.
>>>>>2. readahead thrashing not accounted.
>>>>>3. read congestion not accounted.
>>>>>
>>>>>
>
>
>I have enclosed 5 patches that address each of the issues.
>
>1 . Code is obtuse and hard to maintain
>
> The best I could do is update the comments to reflect the
> current code. Hopefully that should help.
>
> attached patch 1_comment.patch takes care of that part to
> some extent.
>
>
>2. page cache hits not handled properly.
>
> I fixed this by decrementing the size of the next readahead window
> by the number of pages hit in the page cache. Now it slowly
> accomodates the page cache hits.
>
> attached patch 2_cachehits.patch takes care of this issue.
>
>3. queue congestion not handled.
>
> The fix is: call force_page_cache_readahead() if we are
> populating pages in the current window.
> And call do_page_cache_readahead() if we are populating
> pages in the ahead window. However if do_page_cache_readahead()
> return with congestion, the readahead window is collapsed back
> to size zero. This will ensure that the next time ahead window
> is attempted to populate.
>
> attached patch 3_queuecongestion.patch handles this issue.
>
>4. page thrash handled ineffectively.
>
> The fix is: on page thrash detection shutdown readahead.
>
> attached patch 4_pagethrash.patch handles this issue.
>
>5. slow read path is too slow.
>
> I could not figure out a way to atleast-read-the-requested-
> number-of-pages if readahead is shutdown, without incorporating
> the readsize parameter to page_cache_readahead(). So had
> to pick some of your code in filemap.c to do that. Thanks!
>
> attached patch 5_fixedslowread.patch handles this issue.
>
>
>Apart from this you have noticed other issues
>
>6. cache lookup done unneccessrily twice for pagecache_hits.
>
> I have not handled this issue currently. But should be doable
> if I introducing a flag, which notes when readahead is
> shutdown by pagecahche hits. And hence attempts to lookup
> the page only once.
>
>
>And you have other features in your patch which will be the real
>differentiating factors.
>
>7. exponential expand and shrink of window sizes.
>
>8. overlapped read of current window and ahead window.
>
> ( I think both are desirable feature )
>
>I did run some premilinary tests using your patch and the above patches
>and found
>
>your patch was doing slightly better on iozone and sysbench.
>however the above patch were doing slightly better with DSS workload.
>
>

Ok, I have re-run the Tiobench tests. On a single cpu ide based system
you new patches have no noticable effect on sequential read performance
(a good thing); but on random I/O things went bad :-(.

Here are the random read results for 16k io with 4GB fileset on 256MB
mem, single cpu IDE

Stock w/ patches

Threads MBs/sec MBs/sec %diff diff
---------- ------------ ------------ -------- ------------
1 1.73 1.72 -0.58 -0.01
4 1.70 1.56 -8.24 -0.14
16 1.66 0.81 -51.20 -0.85
64 1.49 0.68 -54.36 -0.81

As you can see somewhere after 4 threads the new patches cause performance to tank.

With 512k ios the problem kicks in with less than 4 threads.

Stock w/ patches
Threads MBs/sec MBs/sec %diff diff
---------- ------------ ------------ -------- ------------
1 18.50 18.55 0.27 0.05
4 8.55 6.59 -22.92 -1.96
16 8.40 5.18 -38.33 -3.22
64 7.34 4.76 -35.15 -2.58


Unfortunately this is the _good_ news. The bad news is that this is much worse on SCSI.
We lose a few percent on sequential reads for all block sizes and random is just totally screwed.

Here is the same 16k io requests size with 4GB fileset on 1GB memory on 8way system on single scsi disk.

stock w/ patch
Threads MBs/sec MBs/sec %diff diff
---------- ------------ ------------ -------- ------------
1 3.43 3.03 -11.66 -0.40
4 4.51 1.06 -76.50 -3.45
16 5.86 1.43 -75.60 -4.43
64 6.13 1.66 -72.92 -4.47

11% degrade even on 1 thread, 75% degrade for 4 threads and above! This is horribly broken.


Steve







2004-10-05 17:54:55

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH/RFC] Simplified Readahead

On Fri, 2004-10-01 at 14:02, Steven Pratt wrote:
> Ram Pai wrote:
>
> snip...
>
> >>>>>To summarize you noticed 3 problems:
> >>>>>
> >>>>>1. page cache hits not handled properly.
> >>>>>2. readahead thrashing not accounted.
> >>>>>3. read congestion not accounted.
> >>>>>
> >>>>>
> >
> >
> >I have enclosed 5 patches that address each of the issues.
> >
> >1 . Code is obtuse and hard to maintain
> >
> > The best I could do is update the comments to reflect the
> > current code. Hopefully that should help.
> >
> > attached patch 1_comment.patch takes care of that part to
> > some extent.
> >
> >
> >2. page cache hits not handled properly.
> >
> > I fixed this by decrementing the size of the next readahead window
> > by the number of pages hit in the page cache. Now it slowly
> > accomodates the page cache hits.
> >
> > attached patch 2_cachehits.patch takes care of this issue.
> >
> >3. queue congestion not handled.
> >
> > The fix is: call force_page_cache_readahead() if we are
> > populating pages in the current window.
> > And call do_page_cache_readahead() if we are populating
> > pages in the ahead window. However if do_page_cache_readahead()
> > return with congestion, the readahead window is collapsed back
> > to size zero. This will ensure that the next time ahead window
> > is attempted to populate.
> >
> > attached patch 3_queuecongestion.patch handles this issue.
> >
> >4. page thrash handled ineffectively.
> >
> > The fix is: on page thrash detection shutdown readahead.
> >
> > attached patch 4_pagethrash.patch handles this issue.
> >
> >5. slow read path is too slow.
> >
> > I could not figure out a way to atleast-read-the-requested-
> > number-of-pages if readahead is shutdown, without incorporating
> > the readsize parameter to page_cache_readahead(). So had
> > to pick some of your code in filemap.c to do that. Thanks!
> >
> > attached patch 5_fixedslowread.patch handles this issue.
> >
> >
> >Apart from this you have noticed other issues
> >
> >6. cache lookup done unneccessrily twice for pagecache_hits.
> >
> > I have not handled this issue currently. But should be doable
> > if I introducing a flag, which notes when readahead is
> > shutdown by pagecahche hits. And hence attempts to lookup
> > the page only once.
> >
> >
> >And you have other features in your patch which will be the real
> >differentiating factors.
> >
> >7. exponential expand and shrink of window sizes.
> >
> >8. overlapped read of current window and ahead window.
> >
> > ( I think both are desirable feature )
> >
> >I did run some premilinary tests using your patch and the above patches
> >and found
> >
> >your patch was doing slightly better on iozone and sysbench.
> >however the above patch were doing slightly better with DSS workload.
> >
> >
>
> Ok, I have re-run the Tiobench tests. On a single cpu ide based system
> you new patches have no noticable effect on sequential read performance
> (a good thing); but on random I/O things went bad :-(.
>
> Here are the random read results for 16k io with 4GB fileset on 256MB
> mem, single cpu IDE
>
> Stock w/ patches
>
> Threads MBs/sec MBs/sec %diff diff
> ---------- ------------ ------------ -------- ------------
> 1 1.73 1.72 -0.58 -0.01
> 4 1.70 1.56 -8.24 -0.14
> 16 1.66 0.81 -51.20 -0.85
> 64 1.49 0.68 -54.36 -0.81
>
> As you can see somewhere after 4 threads the new patches cause performance to tank.
>
> With 512k ios the problem kicks in with less than 4 threads.
>
> Stock w/ patches
> Threads MBs/sec MBs/sec %diff diff
> ---------- ------------ ------------ -------- ------------
> 1 18.50 18.55 0.27 0.05
> 4 8.55 6.59 -22.92 -1.96
> 16 8.40 5.18 -38.33 -3.22
> 64 7.34 4.76 -35.15 -2.58
>
>
> Unfortunately this is the _good_ news. The bad news is that this is much worse on SCSI.
> We lose a few percent on sequential reads for all block sizes and random is just totally screwed.
>
> Here is the same 16k io requests size with 4GB fileset on 1GB memory on 8way system on single scsi disk.
>
> stock w/ patch
> Threads MBs/sec MBs/sec %diff diff
> ---------- ------------ ------------ -------- ------------
> 1 3.43 3.03 -11.66 -0.40
> 4 4.51 1.06 -76.50 -3.45
> 16 5.86 1.43 -75.60 -4.43
> 64 6.13 1.66 -72.92 -4.47
>
> 11% degrade even on 1 thread, 75% degrade for 4 threads and above! This is horribly broken.
>
>
Sorry for the late response. Was out yesterday.

Yes something is broken horribly. Will look into what is broken.

RP