2005-01-29 10:09:47

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 3/4] readahead: factor out duplicated code

This patch introduces make_ahead_window() function for
simplification of page_cache_readahead.

Signed-off-by: Oleg Nesterov <[email protected]>

--- 2.6.11-rc2/mm/readahead.c~ 2005-01-27 22:14:39.000000000 +0300
+++ 2.6.11-rc2/mm/readahead.c 2005-01-29 15:51:04.000000000 +0300
@@ -406,6 +406,37 @@ blockable_page_cache_readahead(struct ad
return check_ra_success(ra, nr_to_read, actual);
}

+static int make_ahead_window(struct address_space *mapping, struct file *filp,
+ struct file_ra_state *ra, int force)
+{
+ int block, ret;
+
+ ra->ahead_size = get_next_ra_size(ra);
+ ra->ahead_start = ra->start + ra->size;
+
+ block = force || (ra->prev_page >= ra->ahead_start);
+ ret = blockable_page_cache_readahead(mapping, filp,
+ ra->ahead_start, ra->ahead_size, ra, block);
+
+ if (!ret && !force) {
+ /* A read failure in blocking mode, implies pages are
+ * all cached. So we can safely assume we have taken
+ * care of all the pages requested in this call.
+ * A read failure in non-blocking mode, implies we are
+ * reading more pages than requested in this call. So
+ * we safely assume we have taken care of all the pages
+ * requested in this call.
+ *
+ * Just reset the ahead window in case we failed due to
+ * congestion. The ahead window will any way be closed
+ * in case we failed due to excessive page cache hits.
+ */
+ ra->ahead_start = 0;
+ ra->ahead_size = 0;
+ }
+
+ return ret;
+}
/*
* page_cache_readahead is the main function. If performs the adaptive
* readahead window size management and submits the readahead I/O.
@@ -415,9 +446,8 @@ page_cache_readahead(struct address_spac
struct file *filp, unsigned long offset,
unsigned long req_size)
{
- unsigned long max;
- unsigned long newsize = req_size;
- unsigned long block;
+ unsigned long max, newsize = req_size;
+ int sequential = (offset == ra->prev_page + 1);

/*
* Here we detect the case where the application is performing
@@ -428,6 +458,7 @@ page_cache_readahead(struct address_spac
if (offset == ra->prev_page && req_size == 1 && ra->size != 0)
goto out;

+ ra->prev_page = offset;
max = get_max_readahead(ra);
newsize = min(req_size, max);

@@ -435,13 +466,16 @@ page_cache_readahead(struct address_spac
newsize = 1;
goto out; /* No readahead or file already in cache */
}
+
+ ra->prev_page += newsize - 1;
+
/*
* Special case - first read. We'll assume it's a whole-file read if
* at start of file, and grow the window fast. Or detect first
* sequential access
*/
if ((ra->size == 0 && offset == 0) /* first io and start of file */
- || (ra->size == -1 && ra->prev_page == offset - 1)) {
+ || (ra->size == -1 && sequential)) {
/* First sequential */
ra->size = get_init_ra_size(newsize, max);
ra->start = offset;
@@ -457,12 +491,9 @@ page_cache_readahead(struct address_spac
* IOs,* thus preventing stalls. so issue the ahead window
* immediately.
*/
- if (req_size >= max) {
- ra->ahead_size = get_next_ra_size(ra);
- ra->ahead_start = ra->start + ra->size;
- blockable_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size, ra, 1);
- }
+ if (req_size >= max)
+ make_ahead_window(mapping, filp, ra, 1);
+
goto out;
}

@@ -471,7 +502,7 @@ page_cache_readahead(struct address_spac
* partial page reads and first access were handled above,
* so this must be the next page otherwise it is random
*/
- if ((offset != (ra->prev_page+1) || (ra->size == 0))) {
+ if (!sequential || (ra->size == 0)) {
ra_off(ra);
blockable_page_cache_readahead(mapping, filp, offset,
newsize, ra, 1);
@@ -484,27 +515,8 @@ page_cache_readahead(struct address_spac
*/

if (ra->ahead_start == 0) { /* no ahead window yet */
- ra->ahead_size = get_next_ra_size(ra);
- ra->ahead_start = ra->start + ra->size;
- block = ((offset + newsize -1) >= ra->ahead_start);
- if (!blockable_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size, ra, block)) {
- /* A read failure in blocking mode, implies pages are
- * all cached. So we can safely assume we have taken
- * care of all the pages requested in this call. A read
- * failure in non-blocking mode, implies we are reading
- * more pages than requested in this call. So we safely
- * assume we have taken care of all the pages requested
- * in this call.
- *
- * Just reset the ahead window in case we failed due to
- * congestion. The ahead window will any way be closed
- * in case we failed due to exessive page cache hits.
- */
- ra->ahead_start = 0;
- ra->ahead_size = 0;
+ if (!make_ahead_window(mapping, filp, ra, 0))
goto out;
- }
}
/*
* Already have an ahead window, check if we crossed into it.
@@ -513,33 +525,13 @@ page_cache_readahead(struct address_spac
* we get called back on the first page of the ahead window which
* will allow us to submit more IO.
*/
- if ((offset + newsize - 1) >= ra->ahead_start) {
+ if (ra->prev_page >= ra->ahead_start) {
ra->start = ra->ahead_start;
ra->size = ra->ahead_size;
- ra->ahead_start = ra->start + ra->size;
- ra->ahead_size = get_next_ra_size(ra);
- block = ((offset + newsize - 1) >= ra->ahead_start);
- if (!blockable_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size, ra, block)) {
- /* A read failure in blocking mode, implies pages are
- * all cached. So we can safely assume we have taken
- * care of all the pages requested in this call.
- * A read failure in non-blocking mode, implies we are
- * reading more pages than requested in this call. So
- * we safely assume we have taken care of all the pages
- * requested in this call.
- *
- * Just reset the ahead window in case we failed due to
- * congestion. The ahead window will any way be closed
- * in case we failed due to excessive page cache hits.
- */
- ra->ahead_start = 0;
- ra->ahead_size = 0;
- }
+ make_ahead_window(mapping, filp, ra, 0);
}

out:
- ra->prev_page = offset + newsize - 1;
return newsize;
}


2005-01-29 10:32:02

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 3/4] readahead: factor out duplicated code

> This patch introduces make_ahead_window() function for
> simplification of page_cache_readahead.

If you will count this patch acceptable, I'll rediff it against
next mm iteration.

For your convenience here is the code with the patch applied.

int make_ahead_window(mapping, filp, ra, int force)
{
int block, ret;

ra->ahead_size = get_next_ra_size(ra);
ra->ahead_start = ra->start + ra->size;

block = force || (ra->prev_page >= ra->ahead_start);
ret = blockable_page_cache_readahead(mapping, filp,
ra->ahead_start, ra->ahead_size, ra, block);

if (!ret && !force) {
ra->ahead_start = 0;
ra->ahead_size = 0;
}

return ret;
}

unsigned long page_cache_readahead(mapping, ra, filp, offset, req_size)
{
unsigned long max, newsize = req_size;
int sequential = (offset == ra->prev_page + 1);

if (offset == ra->prev_page && req_size == 1 && ra->size != 0)
goto out;

ra->prev_page = offset;
max = get_max_readahead(ra);
newsize = min(req_size, max);

if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) {
newsize = 1;
goto out;
}

ra->prev_page += newsize - 1;

if ((ra->size == 0 && offset == 0) ||
(ra->size == -1 && sequential)) {
ra->size = get_init_ra_size(newsize, max);
ra->start = offset;
if (!blockable_page_cache_readahead(mapping, filp, offset, ra->size, ra, 1))
goto out;

if (req_size >= max)
make_ahead_window(mapping, filp, ra, 1);

goto out;
}

if (!sequential || (ra->size == 0)) {
ra_off(ra);
blockable_page_cache_readahead(mapping, filp, offset, newsize, ra, 1);
goto out;
}


if (ra->ahead_start == 0) {
if (!make_ahead_window(mapping, filp, ra, 0))
goto out;
}

if (ra->prev_page >= ra->ahead_start) {
ra->start = ra->ahead_start;
ra->size = ra->ahead_size;
make_ahead_window(mapping, filp, ra, 0);
}
out:
return newsize;
}

2005-02-02 21:41:56

by Steven Pratt

[permalink] [raw]
Subject: Re: [PATCH 3/4] readahead: factor out duplicated code

Oleg Nesterov wrote:

>This patch introduces make_ahead_window() function for
>simplification of page_cache_readahead.
>
>Signed-off-by: Oleg Nesterov <[email protected]>
>
>--- 2.6.11-rc2/mm/readahead.c~ 2005-01-27 22:14:39.000000000 +0300
>+++ 2.6.11-rc2/mm/readahead.c 2005-01-29 15:51:04.000000000 +0300
>@@ -406,6 +406,37 @@ blockable_page_cache_readahead(struct ad
> return check_ra_success(ra, nr_to_read, actual);
> }
>
>+static int make_ahead_window(struct address_space *mapping, struct file *filp,
>+ struct file_ra_state *ra, int force)
>+{
>+ int block, ret;
>+
>+ ra->ahead_size = get_next_ra_size(ra);
>+ ra->ahead_start = ra->start + ra->size;
>+
>+ block = force || (ra->prev_page >= ra->ahead_start);
>+ ret = blockable_page_cache_readahead(mapping, filp,
>+ ra->ahead_start, ra->ahead_size, ra, block);
>+
>+ if (!ret && !force) {
>
>
This really needs to be

if (!ret && !block) {

otherwise we can have an aheadwindow which was never populated which
will cause slow reads which we want to avoid in all cases.

>+ /* A read failure in blocking mode, implies pages are
>+ * all cached. So we can safely assume we have taken
>+ * care of all the pages requested in this call.
>+ * A read failure in non-blocking mode, implies we are
>+ * reading more pages than requested in this call. So
>+ * we safely assume we have taken care of all the pages
>+ * requested in this call.
>+ *
>+ * Just reset the ahead window in case we failed due to
>+ * congestion. The ahead window will any way be closed
>+ * in case we failed due to excessive page cache hits.
>+ */
>
>

Hmmm.. I realize that this piece is not really new, but I don't like
the fact that we can zero out the ahead window just because of a
complete cache hit. This will screw up logic for closing the window on
repeated cache hits by possibly repeatedly getting hits on the same
pages over and over again.

I guess this raises the question in my mind, which is "Is all of the
congestion code really worth it?" It is pretty much a corner case that
we don't want to block, plus that fact that we might block anyway even
with the congestion code (timing hole), and more than likely since we
are in sequential read pattern already and the device is really slow(ie
congested) we will most likely block soon anyway when we come back for
these pages. Just a thought.

>+ ra->ahead_start = 0;
>+ ra->ahead_size = 0;
>+ }
>+
>+ return ret;
>+}
> /*
> * page_cache_readahead is the main function. If performs the adaptive
> * readahead window size management and submits the readahead I/O.
>@@ -415,9 +446,8 @@ page_cache_readahead(struct address_spac
> struct file *filp, unsigned long offset,
> unsigned long req_size)
> {
>- unsigned long max;
>- unsigned long newsize = req_size;
>- unsigned long block;
>+ unsigned long max, newsize = req_size;
>+ int sequential = (offset == ra->prev_page + 1);
>
> /*
> * Here we detect the case where the application is performing
>@@ -428,6 +458,7 @@ page_cache_readahead(struct address_spac
> if (offset == ra->prev_page && req_size == 1 && ra->size != 0)
> goto out;
>
>+ ra->prev_page = offset;
> max = get_max_readahead(ra);
> newsize = min(req_size, max);
>
>@@ -435,13 +466,16 @@ page_cache_readahead(struct address_spac
> newsize = 1;
> goto out; /* No readahead or file already in cache */
> }
>+
>+ ra->prev_page += newsize - 1;
>+
> /*
> * Special case - first read. We'll assume it's a whole-file read if
> * at start of file, and grow the window fast. Or detect first
> * sequential access
> */
> if ((ra->size == 0 && offset == 0) /* first io and start of file */
>- || (ra->size == -1 && ra->prev_page == offset - 1)) {
>+ || (ra->size == -1 && sequential)) {
> /* First sequential */
> ra->size = get_init_ra_size(newsize, max);
> ra->start = offset;
>@@ -457,12 +491,9 @@ page_cache_readahead(struct address_spac
> * IOs,* thus preventing stalls. so issue the ahead window
> * immediately.
> */
>- if (req_size >= max) {
>- ra->ahead_size = get_next_ra_size(ra);
>- ra->ahead_start = ra->start + ra->size;
>- blockable_page_cache_readahead(mapping, filp,
>- ra->ahead_start, ra->ahead_size, ra, 1);
>- }
>+ if (req_size >= max)
>+ make_ahead_window(mapping, filp, ra, 1);
>+
> goto out;
> }
>
>@@ -471,7 +502,7 @@ page_cache_readahead(struct address_spac
> * partial page reads and first access were handled above,
> * so this must be the next page otherwise it is random
> */
>- if ((offset != (ra->prev_page+1) || (ra->size == 0))) {
>+ if (!sequential || (ra->size == 0)) {
> ra_off(ra);
> blockable_page_cache_readahead(mapping, filp, offset,
> newsize, ra, 1);
>@@ -484,27 +515,8 @@ page_cache_readahead(struct address_spac
> */
>
> if (ra->ahead_start == 0) { /* no ahead window yet */
>- ra->ahead_size = get_next_ra_size(ra);
>- ra->ahead_start = ra->start + ra->size;
>- block = ((offset + newsize -1) >= ra->ahead_start);
>- if (!blockable_page_cache_readahead(mapping, filp,
>- ra->ahead_start, ra->ahead_size, ra, block)) {
>- /* A read failure in blocking mode, implies pages are
>- * all cached. So we can safely assume we have taken
>- * care of all the pages requested in this call. A read
>- * failure in non-blocking mode, implies we are reading
>- * more pages than requested in this call. So we safely
>- * assume we have taken care of all the pages requested
>- * in this call.
>- *
>- * Just reset the ahead window in case we failed due to
>- * congestion. The ahead window will any way be closed
>- * in case we failed due to exessive page cache hits.
>- */
>- ra->ahead_start = 0;
>- ra->ahead_size = 0;
>+ if (!make_ahead_window(mapping, filp, ra, 0))
> goto out;
>- }
> }
> /*
> * Already have an ahead window, check if we crossed into it.
>@@ -513,33 +525,13 @@ page_cache_readahead(struct address_spac
> * we get called back on the first page of the ahead window which
> * will allow us to submit more IO.
> */
>- if ((offset + newsize - 1) >= ra->ahead_start) {
>+ if (ra->prev_page >= ra->ahead_start) {
> ra->start = ra->ahead_start;
> ra->size = ra->ahead_size;
>- ra->ahead_start = ra->start + ra->size;
>- ra->ahead_size = get_next_ra_size(ra);
>- block = ((offset + newsize - 1) >= ra->ahead_start);
>- if (!blockable_page_cache_readahead(mapping, filp,
>- ra->ahead_start, ra->ahead_size, ra, block)) {
>- /* A read failure in blocking mode, implies pages are
>- * all cached. So we can safely assume we have taken
>- * care of all the pages requested in this call.
>- * A read failure in non-blocking mode, implies we are
>- * reading more pages than requested in this call. So
>- * we safely assume we have taken care of all the pages
>- * requested in this call.
>- *
>- * Just reset the ahead window in case we failed due to
>- * congestion. The ahead window will any way be closed
>- * in case we failed due to excessive page cache hits.
>- */
>- ra->ahead_start = 0;
>- ra->ahead_size = 0;
>- }
>+ make_ahead_window(mapping, filp, ra, 0);
> }
>
> out:
>- ra->prev_page = offset + newsize - 1;
> return newsize;
> }
>
>
The rest of this loks ok, but I have not convinced myself that it
doesn't change the behavior somewhere. I need to actually do some
testing on this just to be certain.

Steve

2005-02-03 02:46:02

by Ram Pai

[permalink] [raw]
Subject: Re: [PATCH 3/4] readahead: factor out duplicated code

On Sat, 2005-01-29 at 03:35, Oleg Nesterov wrote:
> > This patch introduces make_ahead_window() function for
> > simplification of page_cache_readahead.
>
> If you will count this patch acceptable, I'll rediff it against
> next mm iteration.
>
> For your convenience here is the code with the patch applied.
>
> int make_ahead_window(mapping, filp, ra, int force)
> {
> int block, ret;
>
> ra->ahead_size = get_next_ra_size(ra);
> ra->ahead_start = ra->start + ra->size;
>
> block = force || (ra->prev_page >= ra->ahead_start);
> ret = blockable_page_cache_readahead(mapping, filp,
> ra->ahead_start, ra->ahead_size, ra, block);
>
> if (!ret && !force) {

As steve pointed out this should be :
if ( !ret && ! block ) {

> ra->ahead_start = 0;
> ra->ahead_size = 0;
> }
>
> return ret;
> }
>
> unsigned long page_cache_readahead(mapping, ra, filp, offset, req_size)
> {
> unsigned long max, newsize = req_size;
> int sequential = (offset == ra->prev_page + 1);
>
> if (offset == ra->prev_page && req_size == 1 && ra->size != 0)
> goto out;
> ra->prev_page = offset;
> max = get_max_readahead(ra);
> newsize = min(req_size, max);
>
> if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) {
> newsize = 1;

At this point prev_page has to be updated:
ra->prev_page = offset;

> goto out;
> }
>
> ra->prev_page += newsize - 1;
>
> if ((ra->size == 0 && offset == 0) ||
> (ra->size == -1 && sequential)) {
> ra->size = get_init_ra_size(newsize, max);
> ra->start = offset;
> if (!blockable_page_cache_readahead(mapping, filp, offset, ra->size, ra, 1))
> goto out;
>
> if (req_size >= max)
> make_ahead_window(mapping, filp, ra, 1);
>
> goto out;
> }
>
> if (!sequential || (ra->size == 0)) {
> ra_off(ra);
> blockable_page_cache_readahead(mapping, filp, offset, newsize, ra, 1);
> goto out;
> }
>
>
> if (ra->ahead_start == 0) {
> if (!make_ahead_window(mapping, filp, ra, 0))
> goto out;
> }
>
> if (ra->prev_page >= ra->ahead_start) {
> ra->start = ra->ahead_start;
> ra->size = ra->ahead_size;
> make_ahead_window(mapping, filp, ra, 0);
> }
> out:
> return newsize;
> }
Otherwise this code looks much cleaner and correct. Can you send me a
updated patch. I will run it through my test harness.

RP

2005-02-03 10:29:04

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 3/4] readahead: factor out duplicated code

Steven Pratt wrote:
>
> >+static int make_ahead_window(struct address_space *mapping, struct file *filp,
> >+ struct file_ra_state *ra, int force)
> >+{
> >+ int block, ret;
> >+
> >+ block = force || (ra->prev_page >= ra->ahead_start);
> >+ ret = blockable_page_cache_readahead(mapping, filp,
> >+ ra->ahead_start, ra->ahead_size, ra, block);
> >+
> >+ if (!ret && !force) {
> >
> This really needs to be
>
> if (!ret && !block) {
>

Current code:

block = offset + newsize-1 >= ra->ahead_start;
if (!blockable_page_cache_readahead(..., block) {
ra->ahead_start = 0;
ra->ahead_size = 0;
}

Patched code:
make_ahead_window(..., 0); // force == 0

So i think the patch is correct.

> otherwise we can have an aheadwindow which was never populated which
> will cause slow reads which we want to avoid in all cases.

may be, but this patch tries not to change the current behavior.

Oleg.

2005-02-03 10:38:48

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 3/4] readahead: factor out duplicated code

Ram wrote:
> > unsigned long page_cache_readahead(mapping, ra, filp, offset, req_size)
> > {
> > unsigned long max, newsize = req_size;
> > int sequential = (offset == ra->prev_page + 1);
> >
> > if (offset == ra->prev_page && req_size == 1 && ra->size != 0)
> > goto out;
> > ra->prev_page = offset; <============== PLEASE LOOK HERE :)
> > max = get_max_readahead(ra);
> > newsize = min(req_size, max);
> >
> > if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) {
> > newsize = 1;
>
> At this point prev_page has to be updated:
> ra->prev_page = offset;

Yes, it is already updated, before "max = get_max_readahead(ra);"

> Otherwise this code looks much cleaner and correct. Can you send me a
> updated patch. I will run it through my test harness.

Well, currently I do not know, what should be changed :)

Oleg.