2008-10-16 11:20:25

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V3] vfs: Add no_nrwrite_index_update writeback control flags

If no_nrwrite_index_update is set we don't update nr_to_write
and address space writeback_index in write_cache_pages.
This change enable a file system to skip these updates
in write_cache_pages and do them in the writepages()
callback. This patch will be followed by an ext4 patch
that make use of these new flags.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
CC: [email protected]
---
include/linux/writeback.h | 9 +++++++++
mm/page-writeback.c | 10 +++++++---
2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index bd91987..4de3941 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,15 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ /*
+ * write_cache_pages won't update wbc->nr_to_write and
+ * mapping->writeback_index if no_nrwrite_index_update
+ * is set. write_cache_pages may write more than we
+ * requested and we want to make sure nr_to_write and
+ * writeback_index are updated in a consistent manner
+ * so we use a single control to update them
+ */
+ unsigned no_nrwrite_index_update:1;
};

/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 718efa6..b6d66d3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t end; /* Inclusive */
int scanned = 0;
int range_whole = 0;
+ long nr_to_write = wbc->nr_to_write;

if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -939,7 +940,7 @@ int write_cache_pages(struct address_space *mapping,
unlock_page(page);
ret = 0;
}
- if (ret || (--(wbc->nr_to_write) <= 0))
+ if (ret || (--nr_to_write <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -958,8 +959,11 @@ int write_cache_pages(struct address_space *mapping,
index = 0;
goto retry;
}
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
+ if (!wbc->no_nrwrite_index_update) {
+ if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
+ mapping->writeback_index = index;
+ wbc->nr_to_write = nr_to_write;
+ }

return ret;
}
--
1.6.0.2.526.g5c283



2008-10-16 11:20:40

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V3] ext4: Fix file fragmentation during large file write.

The range_cyclic writeback mode uses the address_space writeback_index
as the start index for writeback. With delayed allocation we were
updating writeback_index wrongly resulting in highly fragmented file.
Number of extents reduced from 4000 to 27 for a 3GB file with the below
patch.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/inode.c | 91 ++++++++++++++++++++++++++++++++++--------------------
1 files changed, 57 insertions(+), 34 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cba7960..46ffd0a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
+ long pages_skipped;

BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0);
@@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
end = mpd->next_page - 1;

while (index <= end) {
- /* XXX: optimize tail */
/*
* We can use PAGECACHE_TAG_DIRTY lookup here because
* even though we have cleared the dirty flag on the page
@@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];

+ pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
- if (!err)
+ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ /*
+ * have successfully written the page
+ * without skipping the same
+ */
mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
- long to_write;
int ret;

if (!mpd->get_block)
@@ -2125,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd->pages_written = 0;
mpd->retval = 0;

- to_write = wbc->nr_to_write;
-
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
/*
* Handle last extent of pages
*/
if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd);
- }

- wbc->nr_to_write = to_write - mpd->pages_written;
+ mpd->io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd->pages_written;
return ret;
}

@@ -2366,11 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
+ int no_nrwrite_index_update;
+ long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
- long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

/*
@@ -2390,16 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;

-
- pages_skipped = wbc->pages_skipped;
+ if (wbc->range_cyclic)
+ index = mapping->writeback_index;
+ else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;

mpd.wbc = wbc;
mpd.inode = mapping->host;

-restart_loop:
- to_write = wbc->nr_to_write;
- while (!ret && to_write > 0) {
+ /*
+ * we don't want write_cache_pages to update
+ * nr_to_write and writeback_index
+ */
+ no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+ wbc->no_nrwrite_index_update = 1;
+ pages_skipped = wbc->pages_skipped;
+
+ while (!ret && wbc->nr_to_write > 0) {

/*
* we insert one extent at a time. So we need
@@ -2420,46 +2436,53 @@ static int ext4_da_writepages(struct address_space *mapping,
dump_stack();
goto out_writepages;
}
- to_write -= wbc->nr_to_write;
-
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);

ext4_journal_stop(handle);

- if (mpd.retval == -ENOSPC)
+ if (mpd.retval == -ENOSPC) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
jbd2_journal_force_commit_nested(sbi->s_journal);
-
- /* reset the retry count */
- if (ret == MPAGE_DA_EXTENT_TAIL) {
+ wbc->pages_skipped = pages_skipped;
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
- to_write += wbc->nr_to_write;
+ pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipped;
ret = 0;
- } else if (wbc->nr_to_write) {
+ } else if (wbc->nr_to_write)
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
* and we found the device congested
*/
- to_write += wbc->nr_to_write;
break;
- }
- wbc->nr_to_write = to_write;
- }
-
- if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
- /* We skipped pages in this loop */
- wbc->nr_to_write = to_write +
- wbc->pages_skipped - pages_skipped;
- wbc->pages_skipped = pages_skipped;
- goto restart_loop;
}
+ if (pages_skipped != wbc->pages_skipped)
+ printk(KERN_EMERG "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
+
+ /* Update index */
+ index += pages_written;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = index;

out_writepages:
- wbc->nr_to_write = to_write - nr_to_writebump;
+ if (!no_nrwrite_index_update)
+ wbc->no_nrwrite_index_update = 0;
+ wbc->nr_to_write -= nr_to_writebump;
return ret;
}

--
1.6.0.2.526.g5c283