2008-10-10 18:03:28

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH updated] ext4: Fix file fragmentation during large file write.

The range_cyclic writeback mode use the address_space
writeback_index as the start index for writeback. With
delayed allocation we were updating writeback_index
wrongly resulting in highly fragmented file. Number of
extents reduced from 4000 to 27 for a 3GB file with
the below patch.

The patch also removes the range_cont writeback mode
added for ext4 delayed allocation. Instead we add
two new flags in writeback_control which control
the behaviour of write_cache_pages.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 110 +++++++++++++++++++++++++-------------------
include/linux/writeback.h | 5 ++-
mm/page-writeback.c | 11 +++--
3 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c2820e..f8890b9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
+ long pages_skipped;

BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0);
@@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
end = mpd->next_page - 1;

while (index <= end) {
- /* XXX: optimize tail */
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ /*
+ * We can use PAGECACHE_TAG_DIRTY lookup here because
+ * even though we have cleared the dirty flag on the page
+ * We still keep the page in the radix tree with tag
+ * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+ * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+ * which is called via the below writepage callback.
+ */
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index,
+ (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];

- index = page->index;
- if (index > end)
- break;
- index++;
-
+ pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
- if (!err)
+ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ /*
+ * have successfully written the page
+ * without skipping the same
+ */
mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
- long to_write;
int ret;

if (!mpd->get_block)
@@ -2119,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd->pages_written = 0;
mpd->retval = 0;

- to_write = wbc->nr_to_write;
-
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
/*
* Handle last extent of pages
*/
@@ -2131,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping,
mpage_da_submit_io(mpd);
}

- wbc->nr_to_write = to_write - mpd->pages_written;
+ wbc->nr_to_write -= mpd->pages_written;
return ret;
}

@@ -2360,12 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
- loff_t range_start = 0;
+ long pages_written = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
+ int no_nrwrite_update, no_index_update;
int needed_blocks, ret = 0, nr_to_writebump = 0;
- long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

/*
@@ -2385,23 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping,
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;

- if (!wbc->range_cyclic)
- /*
- * If range_cyclic is not set force range_cont
- * and save the old writeback_index
- */
- wbc->range_cont = 1;
-
- range_start = wbc->range_start;
- pages_skipped = wbc->pages_skipped;
+ if (wbc->range_cyclic)
+ index = mapping->writeback_index;
+ else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;

mpd.wbc = wbc;
mpd.inode = mapping->host;

-restart_loop:
- to_write = wbc->nr_to_write;
- while (!ret && to_write > 0) {
+ /*
+ * we don't want write_cache_pages to update
+ * nr_to_write and writeback_index
+ */
+ no_nrwrite_update = wbc->no_nrwrite_update;
+ wbc->no_nrwrite_update = 1;
+ no_index_update = wbc->no_index_update;
+ wbc->no_index_update = 1;
+
+ while (!ret && wbc->nr_to_write > 0) {

/*
* we insert one extent at a time. So we need
@@ -2422,48 +2435,49 @@ static int ext4_da_writepages(struct address_space *mapping,
dump_stack();
goto out_writepages;
}
- to_write -= wbc->nr_to_write;
-
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);

ext4_journal_stop(handle);

- if (mpd.retval == -ENOSPC)
+ if (mpd.retval == -ENOSPC) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
jbd2_journal_force_commit_nested(sbi->s_journal);
-
- /* reset the retry count */
- if (ret == MPAGE_DA_EXTENT_TAIL) {
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
- to_write += wbc->nr_to_write;
+ pages_written += mpd.pages_written;
ret = 0;
- } else if (wbc->nr_to_write) {
+ } else if (wbc->nr_to_write)
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
* and we found the device congested
*/
- to_write += wbc->nr_to_write;
break;
- }
- wbc->nr_to_write = to_write;
}

- if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
- /* We skipped pages in this loop */
- wbc->range_start = range_start;
- wbc->nr_to_write = to_write +
- wbc->pages_skipped - pages_skipped;
- wbc->pages_skipped = pages_skipped;
- goto restart_loop;
- }
+ /* Update index */
+ index += pages_written;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = index;

out_writepages:
- wbc->nr_to_write = to_write - nr_to_writebump;
- wbc->range_start = range_start;
+ if (!no_nrwrite_update)
+ wbc->no_nrwrite_update = 0;
+ if (!no_index_update)
+ wbc->no_index_update = 0;
+ wbc->nr_to_write -= nr_to_writebump;
return ret;
}

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 12b15c5..b04287e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,7 +63,10 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
- unsigned range_cont:1;
+
+ /* write_cache_pages() control */
+ unsigned no_nrwrite_update:1; /* don't update nr_to_write */
+ unsigned no_index_update:1; /* don't update writeback_index */
};

/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 24de8b6..a85930c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t end; /* Inclusive */
int scanned = 0;
int range_whole = 0;
+ long nr_to_write = wbc->nr_to_write;

if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -939,7 +940,7 @@ int write_cache_pages(struct address_space *mapping,
unlock_page(page);
ret = 0;
}
- if (ret || (--(wbc->nr_to_write) <= 0))
+ if (ret || (--nr_to_write <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -958,11 +959,13 @@ int write_cache_pages(struct address_space *mapping,
index = 0;
goto retry;
}
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ if (!wbc->no_index_update &&
+ (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))) {
mapping->writeback_index = index;
+ }
+ if (!wbc->no_nrwrite_update)
+ wbc->nr_to_write = nr_to_write;

- if (wbc->range_cont)
- wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);
--
1.6.0.1.285.g1070



2008-10-10 18:13:38

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Fri, Oct 10, 2008 at 11:32:56PM +0530, Aneesh Kumar K.V wrote:
> The range_cyclic writeback mode use the address_space
> writeback_index as the start index for writeback. With
> delayed allocation we were updating writeback_index
> wrongly resulting in highly fragmented file. Number of
> extents reduced from 4000 to 27 for a 3GB file with
> the below patch.
>
> The patch also removes the range_cont writeback mode
> added for ext4 delayed allocation. Instead we add
> two new flags in writeback_control which control
> the behaviour of write_cache_pages.
>

Need the below update. Will send the updated patch to ext4 list.

[2.6.27-rc9-1-working@linux-review-ext]$ git diff
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a85930c..4f359f4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -960,7 +960,7 @@ int write_cache_pages(struct address_space *mapping,
goto retry;
}
if (!wbc->no_index_update &&
- (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))) {
+ (wbc->range_cyclic || (range_whole && nr_to_write > 0))) {
mapping->writeback_index = index;
}
if (!wbc->no_nrwrite_update)

2008-10-10 19:15:18

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Fri, 2008-10-10 at 23:32 +0530, Aneesh Kumar K.V wrote:
> The range_cyclic writeback mode use the address_space
> writeback_index as the start index for writeback. With
> delayed allocation we were updating writeback_index
> wrongly resulting in highly fragmented file. Number of
> extents reduced from 4000 to 27 for a 3GB file with
> the below patch.
>
> The patch also removes the range_cont writeback mode
> added for ext4 delayed allocation. Instead we add
> two new flags in writeback_control which control
> the behaviour of write_cache_pages.
>

I'm sorry, but I won't be able to test this until next wednesday. In
general, I like the structure of it, and I can see this being useful for
other filesystems too.

-chris


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>

2008-10-11 10:52:08

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Fri, Oct 10, 2008 at 11:32:56PM +0530, Aneesh Kumar K.V wrote:
> The range_cyclic writeback mode use the address_space
> writeback_index as the start index for writeback. With
> delayed allocation we were updating writeback_index
> wrongly resulting in highly fragmented file. Number of
> extents reduced from 4000 to 27 for a 3GB file with
> the below patch.
>
> The patch also removes the range_cont writeback mode
> added for ext4 delayed allocation. Instead we add
> two new flags in writeback_control which control
> the behaviour of write_cache_pages.

The mm/page-writeback.c changes look OK, although it loks like you've
got rid of range_cont? Should we do a patch to get rid of it entirely
from the tree first?

I don't mind rediffing my patchset on top of this, but this seems smaller
and not strictly a bugfix so I would prefer to go the other way if you
agree.

Seems like it could be broken up into several patches (eg. pagevec_lookup).

The results look very nice.

>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 110 +++++++++++++++++++++++++-------------------
> include/linux/writeback.h | 5 ++-
> mm/page-writeback.c | 11 +++--
> 3 files changed, 73 insertions(+), 53 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 7c2820e..f8890b9 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> int ret = 0, err, nr_pages, i;
> unsigned long index, end;
> struct pagevec pvec;
> + long pages_skipped;
>
> BUG_ON(mpd->next_page <= mpd->first_page);
> pagevec_init(&pvec, 0);
> @@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> end = mpd->next_page - 1;
>
> while (index <= end) {
> - /* XXX: optimize tail */
> - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
> + /*
> + * We can use PAGECACHE_TAG_DIRTY lookup here because
> + * even though we have cleared the dirty flag on the page
> + * We still keep the page in the radix tree with tag
> + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
> + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
> + * which is called via the below writepage callback.
> + */
> + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> + PAGECACHE_TAG_DIRTY,
> + min(end - index,
> + (pgoff_t)PAGEVEC_SIZE-1) + 1);
> if (nr_pages == 0)
> break;
> for (i = 0; i < nr_pages; i++) {
> struct page *page = pvec.pages[i];
>
> - index = page->index;
> - if (index > end)
> - break;
> - index++;
> -
> + pages_skipped = mpd->wbc->pages_skipped;
> err = mapping->a_ops->writepage(page, mpd->wbc);
> - if (!err)
> + if (!err && (pages_skipped == mpd->wbc->pages_skipped))
> + /*
> + * have successfully written the page
> + * without skipping the same
> + */
> mpd->pages_written++;
> /*
> * In error case, we have to continue because
> @@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
> struct writeback_control *wbc,
> struct mpage_da_data *mpd)
> {
> - long to_write;
> int ret;
>
> if (!mpd->get_block)
> @@ -2119,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping,
> mpd->pages_written = 0;
> mpd->retval = 0;
>
> - to_write = wbc->nr_to_write;
> -
> ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
> -
> /*
> * Handle last extent of pages
> */
> @@ -2131,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping,
> mpage_da_submit_io(mpd);
> }
>
> - wbc->nr_to_write = to_write - mpd->pages_written;
> + wbc->nr_to_write -= mpd->pages_written;
> return ret;
> }
>
> @@ -2360,12 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
> static int ext4_da_writepages(struct address_space *mapping,
> struct writeback_control *wbc)
> {
> + pgoff_t index;
> + int range_whole = 0;
> handle_t *handle = NULL;
> - loff_t range_start = 0;
> + long pages_written = 0;
> struct mpage_da_data mpd;
> struct inode *inode = mapping->host;
> + int no_nrwrite_update, no_index_update;
> int needed_blocks, ret = 0, nr_to_writebump = 0;
> - long to_write, pages_skipped = 0;
> struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
>
> /*
> @@ -2385,23 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping,
> nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
> wbc->nr_to_write = sbi->s_mb_stream_request;
> }
> + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> + range_whole = 1;
>
> - if (!wbc->range_cyclic)
> - /*
> - * If range_cyclic is not set force range_cont
> - * and save the old writeback_index
> - */
> - wbc->range_cont = 1;
> -
> - range_start = wbc->range_start;
> - pages_skipped = wbc->pages_skipped;
> + if (wbc->range_cyclic)
> + index = mapping->writeback_index;
> + else
> + index = wbc->range_start >> PAGE_CACHE_SHIFT;
>
> mpd.wbc = wbc;
> mpd.inode = mapping->host;
>
> -restart_loop:
> - to_write = wbc->nr_to_write;
> - while (!ret && to_write > 0) {
> + /*
> + * we don't want write_cache_pages to update
> + * nr_to_write and writeback_index
> + */
> + no_nrwrite_update = wbc->no_nrwrite_update;
> + wbc->no_nrwrite_update = 1;
> + no_index_update = wbc->no_index_update;
> + wbc->no_index_update = 1;
> +
> + while (!ret && wbc->nr_to_write > 0) {
>
> /*
> * we insert one extent at a time. So we need
> @@ -2422,48 +2435,49 @@ static int ext4_da_writepages(struct address_space *mapping,
> dump_stack();
> goto out_writepages;
> }
> - to_write -= wbc->nr_to_write;
> -
> mpd.get_block = ext4_da_get_block_write;
> ret = mpage_da_writepages(mapping, wbc, &mpd);
>
> ext4_journal_stop(handle);
>
> - if (mpd.retval == -ENOSPC)
> + if (mpd.retval == -ENOSPC) {
> + /* commit the transaction which would
> + * free blocks released in the transaction
> + * and try again
> + */
> jbd2_journal_force_commit_nested(sbi->s_journal);
> -
> - /* reset the retry count */
> - if (ret == MPAGE_DA_EXTENT_TAIL) {
> + ret = 0;
> + } else if (ret == MPAGE_DA_EXTENT_TAIL) {
> /*
> * got one extent now try with
> * rest of the pages
> */
> - to_write += wbc->nr_to_write;
> + pages_written += mpd.pages_written;
> ret = 0;
> - } else if (wbc->nr_to_write) {
> + } else if (wbc->nr_to_write)
> /*
> * There is no more writeout needed
> * or we requested for a noblocking writeout
> * and we found the device congested
> */
> - to_write += wbc->nr_to_write;
> break;
> - }
> - wbc->nr_to_write = to_write;
> }
>
> - if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
> - /* We skipped pages in this loop */
> - wbc->range_start = range_start;
> - wbc->nr_to_write = to_write +
> - wbc->pages_skipped - pages_skipped;
> - wbc->pages_skipped = pages_skipped;
> - goto restart_loop;
> - }
> + /* Update index */
> + index += pages_written;
> + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> + /*
> + * set the writeback_index so that range_cyclic
> + * mode will write it back later
> + */
> + mapping->writeback_index = index;
>
> out_writepages:
> - wbc->nr_to_write = to_write - nr_to_writebump;
> - wbc->range_start = range_start;
> + if (!no_nrwrite_update)
> + wbc->no_nrwrite_update = 0;
> + if (!no_index_update)
> + wbc->no_index_update = 0;
> + wbc->nr_to_write -= nr_to_writebump;
> return ret;
> }
>
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 12b15c5..b04287e 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -63,7 +63,10 @@ struct writeback_control {
> unsigned for_writepages:1; /* This is a writepages() call */
> unsigned range_cyclic:1; /* range_start is cyclic */
> unsigned more_io:1; /* more io to be dispatched */
> - unsigned range_cont:1;
> +
> + /* write_cache_pages() control */
> + unsigned no_nrwrite_update:1; /* don't update nr_to_write */
> + unsigned no_index_update:1; /* don't update writeback_index */
> };
>
> /*
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 24de8b6..a85930c 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping,
> pgoff_t end; /* Inclusive */
> int scanned = 0;
> int range_whole = 0;
> + long nr_to_write = wbc->nr_to_write;
>
> if (wbc->nonblocking && bdi_write_congested(bdi)) {
> wbc->encountered_congestion = 1;
> @@ -939,7 +940,7 @@ int write_cache_pages(struct address_space *mapping,
> unlock_page(page);
> ret = 0;
> }
> - if (ret || (--(wbc->nr_to_write) <= 0))
> + if (ret || (--nr_to_write <= 0))
> done = 1;
> if (wbc->nonblocking && bdi_write_congested(bdi)) {
> wbc->encountered_congestion = 1;
> @@ -958,11 +959,13 @@ int write_cache_pages(struct address_space *mapping,
> index = 0;
> goto retry;
> }
> - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> + if (!wbc->no_index_update &&
> + (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))) {
> mapping->writeback_index = index;
> + }
> + if (!wbc->no_nrwrite_update)
> + wbc->nr_to_write = nr_to_write;
>
> - if (wbc->range_cont)
> - wbc->range_start = index << PAGE_CACHE_SHIFT;
> return ret;
> }
> EXPORT_SYMBOL(write_cache_pages);
> --
> 1.6.0.1.285.g1070

2008-10-11 18:14:52

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Sat, Oct 11, 2008 at 12:51:52PM +0200, Nick Piggin wrote:
> On Fri, Oct 10, 2008 at 11:32:56PM +0530, Aneesh Kumar K.V wrote:
> > The range_cyclic writeback mode use the address_space
> > writeback_index as the start index for writeback. With
> > delayed allocation we were updating writeback_index
> > wrongly resulting in highly fragmented file. Number of
> > extents reduced from 4000 to 27 for a 3GB file with
> > the below patch.
> >
> > The patch also removes the range_cont writeback mode
> > added for ext4 delayed allocation. Instead we add
> > two new flags in writeback_control which control
> > the behaviour of write_cache_pages.
>
> The mm/page-writeback.c changes look OK, although it loks like you've
> got rid of range_cont? Should we do a patch to get rid of it entirely
> from the tree first?
>
> I don't mind rediffing my patchset on top of this, but this seems smaller
> and not strictly a bugfix so I would prefer to go the other way if you
> agree.
>
> Seems like it could be broken up into several patches (eg. pagevec_lookup).
>
> The results look very nice.

I actually tried to do that. But to do that and also achieve a working
bisect kernel, I will have to do the patches in below way

a) Introduce ext4_write_cache_pages
b) remove range_cont from write_cache_pages
c) Introduce the new flags to writeback_control
d) switch ext4 to use write_cache_pages.

I thought that involved lot of code which are later getting removed.
So i went for a single patch.

-aneesh

2008-10-11 19:07:32

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Sat, Oct 11, 2008 at 11:44:26PM +0530, Aneesh Kumar K.V wrote:
> On Sat, Oct 11, 2008 at 12:51:52PM +0200, Nick Piggin wrote:
> > On Fri, Oct 10, 2008 at 11:32:56PM +0530, Aneesh Kumar K.V wrote:
> > > The range_cyclic writeback mode use the address_space
> > > writeback_index as the start index for writeback. With
> > > delayed allocation we were updating writeback_index
> > > wrongly resulting in highly fragmented file. Number of
> > > extents reduced from 4000 to 27 for a 3GB file with
> > > the below patch.
> > >
> > > The patch also removes the range_cont writeback mode
> > > added for ext4 delayed allocation. Instead we add
> > > two new flags in writeback_control which control
> > > the behaviour of write_cache_pages.
> >
> > The mm/page-writeback.c changes look OK, although it loks like you've
> > got rid of range_cont? Should we do a patch to get rid of it entirely
> > from the tree first?
> >
> > I don't mind rediffing my patchset on top of this, but this seems smaller
> > and not strictly a bugfix so I would prefer to go the other way if you
> > agree.
> >
> > Seems like it could be broken up into several patches (eg. pagevec_lookup).
> >
> > The results look very nice.
>
> I actually tried to do that. But to do that and also achieve a working
> bisect kernel, I will have to do the patches in below way
>
> a) Introduce ext4_write_cache_pages
> b) remove range_cont from write_cache_pages
> c) Introduce the new flags to writeback_control
> d) switch ext4 to use write_cache_pages.
>
> I thought that involved lot of code which are later getting removed.
> So i went for a single patch.
>

Ok I did the split as below.

a) ext4: Use tag dirty lookup during mpage_da_submit_io
b) vfs: Remove the range_cont writeback mode.
c) vfs: Add no_nrwrite_update and no_index_update writeback control flags
d) ext4: Fix file fragmentation during large file write.

I have sent the updated patches to ext4 list and also to you. Let me
know what you think. The final change is same as the old patch. So
no new changes added

-aneesh


2008-10-15 20:41:00

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Fri, 2008-10-10 at 23:32 +0530, Aneesh Kumar K.V wrote:
> The range_cyclic writeback mode use the address_space
> writeback_index as the start index for writeback. With
> delayed allocation we were updating writeback_index
> wrongly resulting in highly fragmented file. Number of
> extents reduced from 4000 to 27 for a 3GB file with
> the below patch.
>

I tested the ext4 patch queue from today on top of 2.6.27, and this
includes Aneesh's latest patches.

Things are going at disk speed for streaming writes, with the number of
extents generated for a 32GB file down to 27. So, this is definitely an
improvement for ext4.

-chris


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>

2008-10-15 23:51:32

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Wed, 2008-10-15 at 16:41 -0400, Chris Mason wrote:
> On Fri, 2008-10-10 at 23:32 +0530, Aneesh Kumar K.V wrote:
> > The range_cyclic writeback mode use the address_space
> > writeback_index as the start index for writeback. With
> > delayed allocation we were updating writeback_index
> > wrongly resulting in highly fragmented file. Number of
> > extents reduced from 4000 to 27 for a 3GB file with
> > the below patch.
> >
>
> I tested the ext4 patch queue from today on top of 2.6.27, and this
> includes Aneesh's latest patches.
>
> Things are going at disk speed for streaming writes, with the number of
> extents generated for a 32GB file down to 27. So, this is definitely an
> improvement for ext4.

Just FYI, I ran this with compilebench -i 20 --makej and my log is full
of these:

ext4_da_writepages: jbd2_start: 1024 pages, ino 520417; err -30
Pid: 4072, comm: pdflush Not tainted 2.6.27 #2

Call Trace:
[<ffffffffa0048493>] ext4_da_writepages+0x171/0x2d3 [ext4]
[<ffffffff802336be>] ? pick_next_task_fair+0x80/0x91
[<ffffffff80228fa8>] ? source_load+0x2a/0x58
[<ffffffff8038e499>] ? __next_cpu+0x19/0x26
[<ffffffff8026748f>] do_writepages+0x28/0x37
[<ffffffff802a6b39>] __writeback_single_inode+0x14f/0x26d
[<ffffffff802a6fb7>] generic_sync_sb_inodes+0x1c1/0x2a2
[<ffffffff802a70a1>] sync_sb_inodes+0x9/0xb
[<ffffffff802a73dc>] writeback_inodes+0x64/0xad
[<ffffffff802675db>] wb_kupdate+0x9a/0x10c
[<ffffffff80267fd1>] ? pdflush+0x0/0x1e9
[<ffffffff80267fd1>] ? pdflush+0x0/0x1e9
[<ffffffff8026810e>] pdflush+0x13d/0x1e9
[<ffffffff80267541>] ? wb_kupdate+0x0/0x10c
[<ffffffff80248222>] kthread+0x49/0x77
[<ffffffff8020c5e9>] child_rip+0xa/0x11
[<ffffffff802481d9>] ? kthread+0x0/0x77
[<ffffffff8020c5df>] ? child_rip+0x0/0x11


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>

2008-10-16 02:43:24

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Wed, Oct 15, 2008 at 07:51:32PM -0400, Chris Mason wrote:
>
> Just FYI, I ran this with compilebench -i 20 --makej and my log is full
> of these:
>
> ext4_da_writepages: jbd2_start: 1024 pages, ino 520417; err -30
> Pid: 4072, comm: pdflush Not tainted 2.6.27 #2

That's from ext4_journal_start_sb:

if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);

What was the very first error in your log? It looks like ext4 somehow
flagged some kind of filesystem error or aborted the journal due to
some failure, and log gets filled these messages. We should probably
should throttle these messages by simply putting a

if (sb->s_flags & MS_RDONLY)
return -ERNOFS;

at the beginning of ext4_da_writepages() so we don't fill the logs
with extraneous messages that obscure more important error messages.

- Ted

2008-10-16 02:44:33

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Oct 15, 2008 19:51 -0400, Chris Mason wrote:
> Just FYI, I ran this with compilebench -i 20 --makej and my log is full
> of these:
>
> ext4_da_writepages: jbd2_start: 1024 pages, ino 520417; err -30
> Pid: 4072, comm: pdflush Not tainted 2.6.27 #2

-30 is -EROFS... Was the filesystem remounted read-only because of
an error?

> Call Trace:
> [<ffffffffa0048493>] ext4_da_writepages+0x171/0x2d3 [ext4]
> [<ffffffff802336be>] ? pick_next_task_fair+0x80/0x91
> [<ffffffff80228fa8>] ? source_load+0x2a/0x58
> [<ffffffff8038e499>] ? __next_cpu+0x19/0x26
> [<ffffffff8026748f>] do_writepages+0x28/0x37
> [<ffffffff802a6b39>] __writeback_single_inode+0x14f/0x26d
> [<ffffffff802a6fb7>] generic_sync_sb_inodes+0x1c1/0x2a2
> [<ffffffff802a70a1>] sync_sb_inodes+0x9/0xb
> [<ffffffff802a73dc>] writeback_inodes+0x64/0xad
> [<ffffffff802675db>] wb_kupdate+0x9a/0x10c
> [<ffffffff80267fd1>] ? pdflush+0x0/0x1e9
> [<ffffffff80267fd1>] ? pdflush+0x0/0x1e9
> [<ffffffff8026810e>] pdflush+0x13d/0x1e9
> [<ffffffff80267541>] ? wb_kupdate+0x0/0x10c
> [<ffffffff80248222>] kthread+0x49/0x77
> [<ffffffff8020c5e9>] child_rip+0xa/0x11
> [<ffffffff802481d9>] ? kthread+0x0/0x77
> [<ffffffff8020c5df>] ? child_rip+0x0/0x11
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2008-10-16 07:50:54

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Wed, Oct 15, 2008 at 07:51:32PM -0400, Chris Mason wrote:
> On Wed, 2008-10-15 at 16:41 -0400, Chris Mason wrote:
> > On Fri, 2008-10-10 at 23:32 +0530, Aneesh Kumar K.V wrote:
> > > The range_cyclic writeback mode use the address_space
> > > writeback_index as the start index for writeback. With
> > > delayed allocation we were updating writeback_index
> > > wrongly resulting in highly fragmented file. Number of
> > > extents reduced from 4000 to 27 for a 3GB file with
> > > the below patch.
> > >
> >
> > I tested the ext4 patch queue from today on top of 2.6.27, and this
> > includes Aneesh's latest patches.
> >
> > Things are going at disk speed for streaming writes, with the number of
> > extents generated for a 32GB file down to 27. So, this is definitely an
> > improvement for ext4.
>
> Just FYI, I ran this with compilebench -i 20 --makej and my log is full
> of these:
>
> ext4_da_writepages: jbd2_start: 1024 pages, ino 520417; err -30
> Pid: 4072, comm: pdflush Not tainted 2.6.27 #2
>
> Call Trace:
> [<ffffffffa0048493>] ext4_da_writepages+0x171/0x2d3 [ext4]
> [<ffffffff802336be>] ? pick_next_task_fair+0x80/0x91
> [<ffffffff80228fa8>] ? source_load+0x2a/0x58
> [<ffffffff8038e499>] ? __next_cpu+0x19/0x26
> [<ffffffff8026748f>] do_writepages+0x28/0x37
> [<ffffffff802a6b39>] __writeback_single_inode+0x14f/0x26d
> [<ffffffff802a6fb7>] generic_sync_sb_inodes+0x1c1/0x2a2
> [<ffffffff802a70a1>] sync_sb_inodes+0x9/0xb
> [<ffffffff802a73dc>] writeback_inodes+0x64/0xad
> [<ffffffff802675db>] wb_kupdate+0x9a/0x10c
> [<ffffffff80267fd1>] ? pdflush+0x0/0x1e9
> [<ffffffff80267fd1>] ? pdflush+0x0/0x1e9
> [<ffffffff8026810e>] pdflush+0x13d/0x1e9
> [<ffffffff80267541>] ? wb_kupdate+0x0/0x10c
> [<ffffffff80248222>] kthread+0x49/0x77
> [<ffffffff8020c5e9>] child_rip+0xa/0x11
> [<ffffffff802481d9>] ? kthread+0x0/0x77
> [<ffffffff8020c5df>] ? child_rip+0x0/0x11

I actually did the mount -o remount,ro test before sending out the
patches to see if we are skipping some pages during writeback. I
didn't see the error at that time. Today i tried again on a larger
file system and i am able to reproduce the above stack with -o
remount,ro. The patch below fix it for me. The VFS writeback looks
at pages_skipped and make some decision if the value increase
during call back(__fynsc_super -> generic_sync_sb_inodes).
So we need to update pages_skipped also. This may not apply on
top of what patches are in patchqueue. I also have other changes
to use single variable no_nrwite_index_update as per Christoph
suggestion. I will send out the patches for patchqueue after
I look at the compile bench numbers you reported.

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3c4b9b4..88ce29e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2370,10 +2370,10 @@ static int ext4_da_writepages(struct address_space *mapping,
pgoff_t index;
int range_whole = 0;
handle_t *handle = NULL;
- long pages_written = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
int no_nrwrite_index_update;
+ long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

@@ -2411,6 +2411,7 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
no_nrwrite_index_update = wbc->no_nrwrite_index_update;
wbc->no_nrwrite_index_update = 1;
+ pages_skipped = wbc->pages_skipped;

while (!ret && wbc->nr_to_write > 0) {

@@ -2444,6 +2445,7 @@ static int ext4_da_writepages(struct address_space *mapping,
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
+ wbc->pages_skipped = pages_skipeed;
ret = 0;
} else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
@@ -2451,6 +2453,7 @@ static int ext4_da_writepages(struct address_space *mapping,
* rest of the pages
*/
pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipeed;
ret = 0;
} else if (wbc->nr_to_write)
/*

2008-10-16 09:10:15

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Wed, Oct 15, 2008 at 07:51:32PM -0400, Chris Mason wrote:
> On Wed, 2008-10-15 at 16:41 -0400, Chris Mason wrote:
> > On Fri, 2008-10-10 at 23:32 +0530, Aneesh Kumar K.V wrote:
> > > The range_cyclic writeback mode use the address_space
> > > writeback_index as the start index for writeback. With
> > > delayed allocation we were updating writeback_index
> > > wrongly resulting in highly fragmented file. Number of
> > > extents reduced from 4000 to 27 for a 3GB file with
> > > the below patch.
> > >
> >
> > I tested the ext4 patch queue from today on top of 2.6.27, and this
> > includes Aneesh's latest patches.
> >
> > Things are going at disk speed for streaming writes, with the number of
> > extents generated for a 32GB file down to 27. So, this is definitely an
> > improvement for ext4.
>
> Just FYI, I ran this with compilebench -i 20 --makej and my log is full
> of these:
>
> ext4_da_writepages: jbd2_start: 1024 pages, ino 520417; err -30
> Pid: 4072, comm: pdflush Not tainted 2.6.27 #2
>

last patch I sent didn't solve the problem. The below patch
fixes it for me. The problem is not introduced by the file fragmentation
fixes. Full diff below. The first hunk is the important change. I also
kept the rest of the changes because I think they are fine. Also I added
a printk to figure out whether we ever exit from ext4_da_writepages with
increasing pages_skipped value

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3c4b9b4..b6768eb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2136,6 +2136,9 @@ static int mpage_da_writepages(struct address_space *mapping,
if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd);
+
+ mpd->io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
}

wbc->nr_to_write -= mpd->pages_written;
@@ -2370,10 +2373,10 @@ static int ext4_da_writepages(struct address_space *mapping,
pgoff_t index;
int range_whole = 0;
handle_t *handle = NULL;
- long pages_written = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
int no_nrwrite_index_update;
+ long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

@@ -2411,6 +2414,7 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
no_nrwrite_index_update = wbc->no_nrwrite_index_update;
wbc->no_nrwrite_index_update = 1;
+ pages_skipped = wbc->pages_skipped;

while (!ret && wbc->nr_to_write > 0) {

@@ -2444,6 +2448,7 @@ static int ext4_da_writepages(struct address_space *mapping,
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
+ wbc->pages_skipped = pages_skipped;
ret = 0;
} else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
@@ -2451,6 +2456,7 @@ static int ext4_da_writepages(struct address_space *mapping,
* rest of the pages
*/
pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipped;
ret = 0;
} else if (wbc->nr_to_write)
/*
@@ -2460,6 +2466,11 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
break;
}
+ if (pages_skipped != wbc->pages_skipped)
+ printk(KERN_EMERG "This should not happen leaving %s "
+ "with nr_to_write %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
+
/* Update index */
index += pages_written;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))


2008-10-16 11:13:53

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH updated] ext4: Fix file fragmentation during large file write.

On Thu, Oct 16, 2008 at 02:40:15PM +0530, Aneesh Kumar K.V wrote:
> On Wed, Oct 15, 2008 at 07:51:32PM -0400, Chris Mason wrote:
> > On Wed, 2008-10-15 at 16:41 -0400, Chris Mason wrote:
> > > On Fri, 2008-10-10 at 23:32 +0530, Aneesh Kumar K.V wrote:
> > > > The range_cyclic writeback mode use the address_space
> > > > writeback_index as the start index for writeback. With
> > > > delayed allocation we were updating writeback_index
> > > > wrongly resulting in highly fragmented file. Number of
> > > > extents reduced from 4000 to 27 for a 3GB file with
> > > > the below patch.
> > > >
> > >
> > > I tested the ext4 patch queue from today on top of 2.6.27, and this
> > > includes Aneesh's latest patches.
> > >
> > > Things are going at disk speed for streaming writes, with the number of
> > > extents generated for a 32GB file down to 27. So, this is definitely an
> > > improvement for ext4.
> >
> > Just FYI, I ran this with compilebench -i 20 --makej and my log is full
> > of these:
> >
> > ext4_da_writepages: jbd2_start: 1024 pages, ino 520417; err -30
> > Pid: 4072, comm: pdflush Not tainted 2.6.27 #2
> >
>

compilebench numbers

ext4
==========================================================================
intial create total runs 20 avg 30.25 MB/s (user 0.74s sys 2.40s)
no runs for create
no runs for patch
compile total runs 20 avg 39.79 MB/s (user 0.17s sys 2.55s)
no runs for clean
no runs for read tree
read compiled tree total runs 3 avg 19.83 MB/s (user 0.97s sys 4.08s)
no runs for delete tree
delete compiled tree total runs 20 avg 4.42 seconds (user 0.58s sys 1.79s)
no runs for stat tree

ext3
======================
intial create total runs 20 avg 27.96 MB/s (user 0.73s sys 2.57s)
no runs for create
no runs for patch
compile total runs 20 avg 28.84 MB/s (user 0.17s sys 4.03s)
no runs for clean
no runs for read tree
read compiled tree total runs 3 avg 19.46 MB/s (user 0.97s sys 4.13s)
no runs for delete tree
delete compiled tree total runs 20 avg 18.09 seconds (user 0.58s sys 1.61s)
no runs for stat tree
no runs for stat compiled tree

-aneesh