2008-10-11 19:04:48

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH 1/4] ext4: Use tag dirty lookup during mpage_da_submit_io

This enables us to drop the range_cont writeback mode
use from ext4_da_writepages.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 30 +++++++++++++-----------------
1 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c2820e..cba7960 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1656,17 +1656,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)

while (index <= end) {
/* XXX: optimize tail */
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ /*
+ * We can use PAGECACHE_TAG_DIRTY lookup here because
+ * even though we have cleared the dirty flag on the page
+ * We still keep the page in the radix tree with tag
+ * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+ * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+ * which is called via the below writepage callback.
+ */
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index,
+ (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];

- index = page->index;
- if (index > end)
- break;
- index++;
-
err = mapping->a_ops->writepage(page, mpd->wbc);
if (!err)
mpd->pages_written++;
@@ -2361,7 +2367,6 @@ static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
handle_t *handle = NULL;
- loff_t range_start = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
int needed_blocks, ret = 0, nr_to_writebump = 0;
@@ -2386,14 +2391,7 @@ static int ext4_da_writepages(struct address_space *mapping,
wbc->nr_to_write = sbi->s_mb_stream_request;
}

- if (!wbc->range_cyclic)
- /*
- * If range_cyclic is not set force range_cont
- * and save the old writeback_index
- */
- wbc->range_cont = 1;

- range_start = wbc->range_start;
pages_skipped = wbc->pages_skipped;

mpd.wbc = wbc;
@@ -2452,9 +2450,8 @@ static int ext4_da_writepages(struct address_space *mapping,
wbc->nr_to_write = to_write;
}

- if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
+ if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
/* We skipped pages in this loop */
- wbc->range_start = range_start;
wbc->nr_to_write = to_write +
wbc->pages_skipped - pages_skipped;
wbc->pages_skipped = pages_skipped;
@@ -2463,7 +2460,6 @@ static int ext4_da_writepages(struct address_space *mapping,

out_writepages:
wbc->nr_to_write = to_write - nr_to_writebump;
- wbc->range_start = range_start;
return ret;
}

--
1.6.0.2.514.g23abd3



2008-10-11 19:04:54

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH 2/4] vfs: Remove the range_cont writeback mode.

Ext4 was the only user of range_cont writeback mode
and ext4 switched to a different method. So remove
the range_cont mode which is not used in the kernel.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
include/linux/writeback.h | 1 -
mm/page-writeback.c | 2 --
2 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 12b15c5..bd91987 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,7 +63,6 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
- unsigned range_cont:1;
};

/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 24de8b6..718efa6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -961,8 +961,6 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;

- if (wbc->range_cont)
- wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);
--
1.6.0.2.514.g23abd3


2008-10-11 19:05:01

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH 4/4] ext4: Fix file fragmentation during large file write.

The range_cyclic writeback mode uses the address_space writeback_index
as the start index for writeback. With delayed allocation we were
updating writeback_index wrongly resulting in highly fragmented file.
Number of extents reduced from 4000 to 27 for a 3GB file with the below
patch.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/inode.c | 83 +++++++++++++++++++++++++++++++++----------------------
1 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cba7960..844c136 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
+ long pages_skipped;

BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0);
@@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
end = mpd->next_page - 1;

while (index <= end) {
- /* XXX: optimize tail */
/*
* We can use PAGECACHE_TAG_DIRTY lookup here because
* even though we have cleared the dirty flag on the page
@@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];

+ pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
- if (!err)
+ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ /*
+ * have successfully written the page
+ * without skipping the same
+ */
mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
- long to_write;
int ret;

if (!mpd->get_block)
@@ -2125,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd->pages_written = 0;
mpd->retval = 0;

- to_write = wbc->nr_to_write;
-
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
/*
* Handle last extent of pages
*/
@@ -2137,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping,
mpage_da_submit_io(mpd);
}

- wbc->nr_to_write = to_write - mpd->pages_written;
+ wbc->nr_to_write -= mpd->pages_written;
return ret;
}

@@ -2366,11 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
+ long pages_written = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
+ int no_nrwrite_update, no_index_update;
int needed_blocks, ret = 0, nr_to_writebump = 0;
- long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

/*
@@ -2390,16 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping,
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;

-
- pages_skipped = wbc->pages_skipped;
+ if (wbc->range_cyclic)
+ index = mapping->writeback_index;
+ else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;

mpd.wbc = wbc;
mpd.inode = mapping->host;

-restart_loop:
- to_write = wbc->nr_to_write;
- while (!ret && to_write > 0) {
+ /*
+ * we don't want write_cache_pages to update
+ * nr_to_write and writeback_index
+ */
+ no_nrwrite_update = wbc->no_nrwrite_update;
+ wbc->no_nrwrite_update = 1;
+ no_index_update = wbc->no_index_update;
+ wbc->no_index_update = 1;
+
+ while (!ret && wbc->nr_to_write > 0) {

/*
* we insert one extent at a time. So we need
@@ -2420,46 +2435,48 @@ static int ext4_da_writepages(struct address_space *mapping,
dump_stack();
goto out_writepages;
}
- to_write -= wbc->nr_to_write;
-
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);

ext4_journal_stop(handle);

- if (mpd.retval == -ENOSPC)
+ if (mpd.retval == -ENOSPC) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
jbd2_journal_force_commit_nested(sbi->s_journal);
-
- /* reset the retry count */
- if (ret == MPAGE_DA_EXTENT_TAIL) {
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
- to_write += wbc->nr_to_write;
+ pages_written += mpd.pages_written;
ret = 0;
- } else if (wbc->nr_to_write) {
+ } else if (wbc->nr_to_write)
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
* and we found the device congested
*/
- to_write += wbc->nr_to_write;
break;
- }
- wbc->nr_to_write = to_write;
- }
-
- if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
- /* We skipped pages in this loop */
- wbc->nr_to_write = to_write +
- wbc->pages_skipped - pages_skipped;
- wbc->pages_skipped = pages_skipped;
- goto restart_loop;
}
+ /* Update index */
+ index += pages_written;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = index;

out_writepages:
- wbc->nr_to_write = to_write - nr_to_writebump;
+ if (!no_nrwrite_update)
+ wbc->no_nrwrite_update = 0;
+ if (!no_index_update)
+ wbc->no_index_update = 0;
+ wbc->nr_to_write -= nr_to_writebump;
return ret;
}

--
1.6.0.2.514.g23abd3


2008-10-11 19:04:56

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH 3/4] vfs: Add no_nrwrite_update and no_index_update writeback control flags

If no_nrwrite_update is set we don't update nr_to_write in
write_cache_pages. Similarly if no_index_update is we don't
update address space writeback_index. These changes enable a
file system to skip these updates in write_cache_pages and do
them in the writepages() callback. This patch will be followed
by an ext4 patch that make use of these new flags.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
include/linux/writeback.h | 4 ++++
mm/page-writeback.c | 9 +++++++--
2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index bd91987..b04287e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,10 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+
+ /* write_cache_pages() control */
+ unsigned no_nrwrite_update:1; /* don't update nr_to_write */
+ unsigned no_index_update:1; /* don't update writeback_index */
};

/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 718efa6..4f359f4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t end; /* Inclusive */
int scanned = 0;
int range_whole = 0;
+ long nr_to_write = wbc->nr_to_write;

if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -939,7 +940,7 @@ int write_cache_pages(struct address_space *mapping,
unlock_page(page);
ret = 0;
}
- if (ret || (--(wbc->nr_to_write) <= 0))
+ if (ret || (--nr_to_write <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -958,8 +959,12 @@ int write_cache_pages(struct address_space *mapping,
index = 0;
goto retry;
}
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ if (!wbc->no_index_update &&
+ (wbc->range_cyclic || (range_whole && nr_to_write > 0))) {
mapping->writeback_index = index;
+ }
+ if (!wbc->no_nrwrite_update)
+ wbc->nr_to_write = nr_to_write;

return ret;
}
--
1.6.0.2.514.g23abd3


2008-10-12 20:33:47

by Dmitry Monakhov

[permalink] [raw]
Subject: Re: [PATCH 4/4] ext4: Fix file fragmentation during large file write.

"Aneesh Kumar K.V" <[email protected]> writes:

> The range_cyclic writeback mode uses the address_space writeback_index
> as the start index for writeback. With delayed allocation we were
> updating writeback_index wrongly resulting in highly fragmented file.
> Number of extents reduced from 4000 to 27 for a 3GB file with the below
> patch.
Hi i've played with fragmentation patches with following result:
I've had several crash and deadlocks
for example objects wasn't freed on umount:
EXT4-fs: mballoc: 12800 blocks 13 reqs (6 success)
EXT4-fs: mballoc: 7 extents scanned, 12 goal hits, 1 2^N hits, 0 breaks, 0 lost
EXT4-fs: mballoc: 1 generated and it took 3024
EXT4-fs: mballoc: 7608 preallocated, 1536 discarded
slab error in kmem_cache_destroy(): cache `ext4_prealloc_space': Can't free all objects
Pid: 7703, comm: rmmod Not tainted 2.6.27-rc8 #3

Call Trace:
[<ffffffff8028b011>] kmem_cache_destroy+0x7d/0xc0
[<ffffffffa03ca057>] exit_ext4_mballoc+0x10/0x1e [ext4dev]
[<ffffffffa03d35b3>] exit_ext4_fs+0x1f/0x2f [ext4dev]
[<ffffffff80250dff>] sys_delete_module+0x199/0x1f3
[<ffffffff8025d06e>] audit_syscall_entry+0x12d/0x160
[<ffffffff8020be0b>] system_call_fastpath+0x16/0x1b
Some times sync has stuck.
I'm not shure is it because of current patch set, but where is at least one
brand new bug. See later.
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> Signed-off-by: Theodore Ts'o <[email protected]>
> ---
> fs/ext4/inode.c | 83 +++++++++++++++++++++++++++++++++----------------------
> 1 files changed, 50 insertions(+), 33 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index cba7960..844c136 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> int ret = 0, err, nr_pages, i;
> unsigned long index, end;
> struct pagevec pvec;
> + long pages_skipped;
>
> BUG_ON(mpd->next_page <= mpd->first_page);
> pagevec_init(&pvec, 0);
> @@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> end = mpd->next_page - 1;
[1] In fact mpd->next_page may be bigger whan (index + wbc->nr_to_write)
this result in incorrect math while exit from mpage_da_writepages()
Probably we have bound end_index here.
end = min(mpd->next_page, index +wbc->nr_to_write) -1;
>
> while (index <= end) {
> - /* XXX: optimize tail */
> /*
> * We can use PAGECACHE_TAG_DIRTY lookup here because
> * even though we have cleared the dirty flag on the page
> @@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> for (i = 0; i < nr_pages; i++) {
> struct page *page = pvec.pages[i];
>
> + pages_skipped = mpd->wbc->pages_skipped;
> err = mapping->a_ops->writepage(page, mpd->wbc);
> - if (!err)
> + if (!err && (pages_skipped == mpd->wbc->pages_skipped))
> + /*
> + * have successfully written the page
> + * without skipping the same
> + */
> mpd->pages_written++;
> /*
> * In error case, we have to continue because
> @@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
> struct writeback_control *wbc,
> struct mpage_da_data *mpd)
> {
> - long to_write;
> int ret;
>
> if (!mpd->get_block)
> @@ -2125,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping,
> mpd->pages_written = 0;
> mpd->retval = 0;
>
> - to_write = wbc->nr_to_write;
> -
> ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
> -
> /*
> * Handle last extent of pages
> */
> @@ -2137,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping,
> mpage_da_submit_io(mpd);
> }
>
> - wbc->nr_to_write = to_write - mpd->pages_written;
> + wbc->nr_to_write -= mpd->pages_written;
due to [1] wbc->nr_write becomes negative here
> return ret;
> }
>
> @@ -2366,11 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
> static int ext4_da_writepages(struct address_space *mapping,
> struct writeback_control *wbc)
> {
> + pgoff_t index;
> + int range_whole = 0;
> handle_t *handle = NULL;
> + long pages_written = 0;
> struct mpage_da_data mpd;
> struct inode *inode = mapping->host;
> + int no_nrwrite_update, no_index_update;
> int needed_blocks, ret = 0, nr_to_writebump = 0;
> - long to_write, pages_skipped = 0;
> struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
>
> /*
> @@ -2390,16 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping,
> nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
> wbc->nr_to_write = sbi->s_mb_stream_request;
> }
> + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> + range_whole = 1;
>
> -
> - pages_skipped = wbc->pages_skipped;
> + if (wbc->range_cyclic)
> + index = mapping->writeback_index;
> + else
> + index = wbc->range_start >> PAGE_CACHE_SHIFT;
>
> mpd.wbc = wbc;
> mpd.inode = mapping->host;
>
> -restart_loop:
> - to_write = wbc->nr_to_write;
> - while (!ret && to_write > 0) {
> + /*
> + * we don't want write_cache_pages to update
> + * nr_to_write and writeback_index
> + */
> + no_nrwrite_update = wbc->no_nrwrite_update;
> + wbc->no_nrwrite_update = 1;
> + no_index_update = wbc->no_index_update;
> + wbc->no_index_update = 1;
> +
> + while (!ret && wbc->nr_to_write > 0) {
>
> /*
> * we insert one extent at a time. So we need
> @@ -2420,46 +2435,48 @@ static int ext4_da_writepages(struct address_space *mapping,
> dump_stack();
> goto out_writepages;
> }
> - to_write -= wbc->nr_to_write;
> -
> mpd.get_block = ext4_da_get_block_write;
> ret = mpage_da_writepages(mapping, wbc, &mpd);
>
> ext4_journal_stop(handle);
>
> - if (mpd.retval == -ENOSPC)
> + if (mpd.retval == -ENOSPC) {
> + /* commit the transaction which would
> + * free blocks released in the transaction
> + * and try again
> + */
> jbd2_journal_force_commit_nested(sbi->s_journal);
> -
> - /* reset the retry count */
> - if (ret == MPAGE_DA_EXTENT_TAIL) {
> + ret = 0;
> + } else if (ret == MPAGE_DA_EXTENT_TAIL) {
> /*
> * got one extent now try with
> * rest of the pages
> */
> - to_write += wbc->nr_to_write;
> + pages_written += mpd.pages_written;
> ret = 0;
> - } else if (wbc->nr_to_write) {
> + } else if (wbc->nr_to_write)
> /*
> * There is no more writeout needed
> * or we requested for a noblocking writeout
> * and we found the device congested
> */
> - to_write += wbc->nr_to_write;
> break;
> - }
> - wbc->nr_to_write = to_write;
> - }
> -
> - if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
> - /* We skipped pages in this loop */
> - wbc->nr_to_write = to_write +
> - wbc->pages_skipped - pages_skipped;
> - wbc->pages_skipped = pages_skipped;
> - goto restart_loop;
> }
> + /* Update index */
> + index += pages_written;
> + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> + /*
> + * set the writeback_index so that range_cyclic
> + * mode will write it back later
> + */
> + mapping->writeback_index = index;
>
> out_writepages:
> - wbc->nr_to_write = to_write - nr_to_writebump;
> + if (!no_nrwrite_update)
> + wbc->no_nrwrite_update = 0;
> + if (!no_index_update)
> + wbc->no_index_update = 0;
> + wbc->nr_to_write -= nr_to_writebump;
> return ret;
> }
BTW: please add following cleanup fix.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6efa4ca..c248cbc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2603,7 +2603,7 @@ static int ext4_da_write_end(struct file *file,
handle_t *handle = ext4_journal_current_handle();
loff_t new_i_size;
unsigned long start, end;
- int write_mode = (int)fsdata;
+ int write_mode = (int)(unsigned long)fsdata;

if (write_mode == FALL_BACK_TO_NONDELALLOC) {
if (ext4_should_order_data(inode)) {


2008-10-13 09:53:03

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH 4/4] ext4: Fix file fragmentation during large file write.

On Mon, Oct 13, 2008 at 12:31:43AM +0400, Dmitri Monakhov wrote:
> "Aneesh Kumar K.V" <[email protected]> writes:
>
> > The range_cyclic writeback mode uses the address_space writeback_index
> > as the start index for writeback. With delayed allocation we were
> > updating writeback_index wrongly resulting in highly fragmented file.
> > Number of extents reduced from 4000 to 27 for a 3GB file with the below
> > patch.
> Hi i've played with fragmentation patches with following result:
> I've had several crash and deadlocks
> for example objects wasn't freed on umount:
> EXT4-fs: mballoc: 12800 blocks 13 reqs (6 success)
> EXT4-fs: mballoc: 7 extents scanned, 12 goal hits, 1 2^N hits, 0 breaks, 0 lost
> EXT4-fs: mballoc: 1 generated and it took 3024
> EXT4-fs: mballoc: 7608 preallocated, 1536 discarded
> slab error in kmem_cache_destroy(): cache `ext4_prealloc_space': Can't free all objects
> Pid: 7703, comm: rmmod Not tainted 2.6.27-rc8 #3
>
> Call Trace:
> [<ffffffff8028b011>] kmem_cache_destroy+0x7d/0xc0
> [<ffffffffa03ca057>] exit_ext4_mballoc+0x10/0x1e [ext4dev]
> [<ffffffffa03d35b3>] exit_ext4_fs+0x1f/0x2f [ext4dev]
> [<ffffffff80250dff>] sys_delete_module+0x199/0x1f3
> [<ffffffff8025d06e>] audit_syscall_entry+0x12d/0x160
> [<ffffffff8020be0b>] system_call_fastpath+0x16/0x1b
> Some times sync has stuck.
> I'm not shure is it because of current patch set, but where is at least one
> brand new bug. See later.


I can't reproduce this. I build ext as a module and tried to unload the
module. Actually the umount should have released all the objects in
slab. Can you get the /proc/slabinfo output when this happens ?


> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > Signed-off-by: Theodore Ts'o <[email protected]>
> > ---
> > fs/ext4/inode.c | 83 +++++++++++++++++++++++++++++++++----------------------
> > 1 files changed, 50 insertions(+), 33 deletions(-)
> >
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index cba7960..844c136 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> > int ret = 0, err, nr_pages, i;
> > unsigned long index, end;
> > struct pagevec pvec;
> > + long pages_skipped;
> >
> > BUG_ON(mpd->next_page <= mpd->first_page);
> > pagevec_init(&pvec, 0);
> > @@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> > end = mpd->next_page - 1;
> [1] In fact mpd->next_page may be bigger whan (index + wbc->nr_to_write)
> this result in incorrect math while exit from mpage_da_writepages()
> Probably we have bound end_index here.

write_cache_pages also will write more than requested. The
pagevec_lookup_tag can return more than nr_to_write page which
implies wbc->nr_to_write can go negative.


> end = min(mpd->next_page, index +wbc->nr_to_write) -1;
> >
> > while (index <= end) {
> > - /* XXX: optimize tail */
> > /*
> > * We can use PAGECACHE_TAG_DIRTY lookup here because
> > * even though we have cleared the dirty flag on the page
> > @@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> > for (i = 0; i < nr_pages; i++) {
> > struct page *page = pvec.pages[i];
> >
> > + pages_skipped = mpd->wbc->pages_skipped;
> > err = mapping->a_ops->writepage(page, mpd->wbc);
> > - if (!err)
> > + if (!err && (pages_skipped == mpd->wbc->pages_skipped))
> > + /*
> > + * have successfully written the page
> > + * without skipping the same
> > + */
> > mpd->pages_written++;
> > /*
> > * In error case, we have to continue because
> > @@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
> > struct writeback_control *wbc,
> > struct mpage_da_data *mpd)
> > {
> > - long to_write;
> > int ret;
> >
> > if (!mpd->get_block)
> > @@ -2125,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping,
> > mpd->pages_written = 0;
> > mpd->retval = 0;
> >
> > - to_write = wbc->nr_to_write;
> > -
> > ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
> > -
> > /*
> > * Handle last extent of pages
> > */
> > @@ -2137,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping,
> > mpage_da_submit_io(mpd);
> > }
> >
> > - wbc->nr_to_write = to_write - mpd->pages_written;
> > + wbc->nr_to_write -= mpd->pages_written;
> due to [1] wbc->nr_write becomes negative here


wbc->nr_ti_write can go negative right ?



> > return ret;
> > }
> >
> > @@ -2366,11 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
> > static int ext4_da_writepages(struct address_space *mapping,
> > struct writeback_control *wbc)
> > {
> > + pgoff_t index;
> > + int range_whole = 0;
> > handle_t *handle = NULL;
> > + long pages_written = 0;
> > struct mpage_da_data mpd;
> > struct inode *inode = mapping->host;
> > + int no_nrwrite_update, no_index_update;
> > int needed_blocks, ret = 0, nr_to_writebump = 0;
> > - long to_write, pages_skipped = 0;
> > struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
> >
> > /*
> > @@ -2390,16 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping,
> > nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
> > wbc->nr_to_write = sbi->s_mb_stream_request;
> > }
> > + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> > + range_whole = 1;
> >
> > -
> > - pages_skipped = wbc->pages_skipped;
> > + if (wbc->range_cyclic)
> > + index = mapping->writeback_index;
> > + else
> > + index = wbc->range_start >> PAGE_CACHE_SHIFT;
> >
> > mpd.wbc = wbc;
> > mpd.inode = mapping->host;
> >
> > -restart_loop:
> > - to_write = wbc->nr_to_write;
> > - while (!ret && to_write > 0) {
> > + /*
> > + * we don't want write_cache_pages to update
> > + * nr_to_write and writeback_index
> > + */
> > + no_nrwrite_update = wbc->no_nrwrite_update;
> > + wbc->no_nrwrite_update = 1;
> > + no_index_update = wbc->no_index_update;
> > + wbc->no_index_update = 1;
> > +
> > + while (!ret && wbc->nr_to_write > 0) {
> >
> > /*
> > * we insert one extent at a time. So we need
> > @@ -2420,46 +2435,48 @@ static int ext4_da_writepages(struct address_space *mapping,
> > dump_stack();
> > goto out_writepages;
> > }
> > - to_write -= wbc->nr_to_write;
> > -
> > mpd.get_block = ext4_da_get_block_write;
> > ret = mpage_da_writepages(mapping, wbc, &mpd);
> >
> > ext4_journal_stop(handle);
> >
> > - if (mpd.retval == -ENOSPC)
> > + if (mpd.retval == -ENOSPC) {
> > + /* commit the transaction which would
> > + * free blocks released in the transaction
> > + * and try again
> > + */
> > jbd2_journal_force_commit_nested(sbi->s_journal);
> > -
> > - /* reset the retry count */
> > - if (ret == MPAGE_DA_EXTENT_TAIL) {
> > + ret = 0;
> > + } else if (ret == MPAGE_DA_EXTENT_TAIL) {
> > /*
> > * got one extent now try with
> > * rest of the pages
> > */
> > - to_write += wbc->nr_to_write;
> > + pages_written += mpd.pages_written;
> > ret = 0;
> > - } else if (wbc->nr_to_write) {
> > + } else if (wbc->nr_to_write)
> > /*
> > * There is no more writeout needed
> > * or we requested for a noblocking writeout
> > * and we found the device congested
> > */
> > - to_write += wbc->nr_to_write;
> > break;
> > - }
> > - wbc->nr_to_write = to_write;
> > - }
> > -
> > - if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
> > - /* We skipped pages in this loop */
> > - wbc->nr_to_write = to_write +
> > - wbc->pages_skipped - pages_skipped;
> > - wbc->pages_skipped = pages_skipped;
> > - goto restart_loop;
> > }
> > + /* Update index */
> > + index += pages_written;
> > + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> > + /*
> > + * set the writeback_index so that range_cyclic
> > + * mode will write it back later
> > + */
> > + mapping->writeback_index = index;
> >
> > out_writepages:
> > - wbc->nr_to_write = to_write - nr_to_writebump;
> > + if (!no_nrwrite_update)
> > + wbc->no_nrwrite_update = 0;
> > + if (!no_index_update)
> > + wbc->no_index_update = 0;
> > + wbc->nr_to_write -= nr_to_writebump;
> > return ret;
> > }
> BTW: please add following cleanup fix.
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 6efa4ca..c248cbc 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2603,7 +2603,7 @@ static int ext4_da_write_end(struct file *file,
> handle_t *handle = ext4_journal_current_handle();
> loff_t new_i_size;
> unsigned long start, end;
> - int write_mode = (int)fsdata;
> + int write_mode = (int)(unsigned long)fsdata;
>
> if (write_mode == FALL_BACK_TO_NONDELALLOC) {
> if (ext4_should_order_data(inode)) {
>

Eric Sandeen already fixed it in the pathqueue.I can also
see mainline having the fix. Which kernel are you trying ?

-aneesh

2008-10-13 13:34:52

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH 4/4] ext4: Fix file fragmentation during large file write.

On Mon, Oct 13, 2008 at 12:31:43AM +0400, Dmitri Monakhov wrote:
> "Aneesh Kumar K.V" <[email protected]> writes:
>
> > The range_cyclic writeback mode uses the address_space writeback_index
> > as the start index for writeback. With delayed allocation we were
> > updating writeback_index wrongly resulting in highly fragmented file.
> > Number of extents reduced from 4000 to 27 for a 3GB file with the below
> > patch.
> Hi i've played with fragmentation patches with following result:
> I've had several crash and deadlocks
> for example objects wasn't freed on umount:
> EXT4-fs: mballoc: 12800 blocks 13 reqs (6 success)
> EXT4-fs: mballoc: 7 extents scanned, 12 goal hits, 1 2^N hits, 0 breaks, 0 lost
> EXT4-fs: mballoc: 1 generated and it took 3024
> EXT4-fs: mballoc: 7608 preallocated, 1536 discarded
> slab error in kmem_cache_destroy(): cache `ext4_prealloc_space': Can't free all objects
> Pid: 7703, comm: rmmod Not tainted 2.6.27-rc8 #3
>
> Call Trace:
> [<ffffffff8028b011>] kmem_cache_destroy+0x7d/0xc0
> [<ffffffffa03ca057>] exit_ext4_mballoc+0x10/0x1e [ext4dev]
> [<ffffffffa03d35b3>] exit_ext4_fs+0x1f/0x2f [ext4dev]
> [<ffffffff80250dff>] sys_delete_module+0x199/0x1f3
> [<ffffffff8025d06e>] audit_syscall_entry+0x12d/0x160
> [<ffffffff8020be0b>] system_call_fastpath+0x16/0x1b

Looking at the code i found this. I haven't test the change yet.

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2f38754..acf6a32 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2569,7 +2569,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
list_del(&pa->pa_group_list);
count++;
- kfree(pa);
+ kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
mb_debug("mballoc: %u PAs left\n", count);

2008-10-13 15:33:27

by Dmitry Monakhov

[permalink] [raw]
Subject: Re: [PATCH 4/4] ext4: Fix file fragmentation during large file write.

"Aneesh Kumar K.V" <[email protected]> writes:

> On Mon, Oct 13, 2008 at 12:31:43AM +0400, Dmitri Monakhov wrote:
>> "Aneesh Kumar K.V" <[email protected]> writes:
>>
>> > The range_cyclic writeback mode uses the address_space writeback_index
>> > as the start index for writeback. With delayed allocation we were
>> > updating writeback_index wrongly resulting in highly fragmented file.
>> > Number of extents reduced from 4000 to 27 for a 3GB file with the below
>> > patch.
>> Hi i've played with fragmentation patches with following result:
>> I've had several crash and deadlocks
>> for example objects wasn't freed on umount:
>> EXT4-fs: mballoc: 12800 blocks 13 reqs (6 success)
>> EXT4-fs: mballoc: 7 extents scanned, 12 goal hits, 1 2^N hits, 0 breaks, 0 lost
>> EXT4-fs: mballoc: 1 generated and it took 3024
>> EXT4-fs: mballoc: 7608 preallocated, 1536 discarded
>> slab error in kmem_cache_destroy(): cache `ext4_prealloc_space': Can't free all objects
>> Pid: 7703, comm: rmmod Not tainted 2.6.27-rc8 #3
>>
>> Call Trace:
>> [<ffffffff8028b011>] kmem_cache_destroy+0x7d/0xc0
>> [<ffffffffa03ca057>] exit_ext4_mballoc+0x10/0x1e [ext4dev]
>> [<ffffffffa03d35b3>] exit_ext4_fs+0x1f/0x2f [ext4dev]
>> [<ffffffff80250dff>] sys_delete_module+0x199/0x1f3
>> [<ffffffff8025d06e>] audit_syscall_entry+0x12d/0x160
>> [<ffffffff8020be0b>] system_call_fastpath+0x16/0x1b
>> Some times sync has stuck.
>> I'm not shure is it because of current patch set, but where is at least one
>> brand new bug. See later.
>
>
> I can't reproduce this. I build ext as a module and tried to unload the
> module. Actually the umount should have released all the objects in
> slab. Can you get the /proc/slabinfo output when this happens ?
this result in invalid pointer dereference.
Your kmem_cache patch has fixed the issue. At least for writes w/o fallocate.
>
>
>> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
>> > Signed-off-by: Theodore Ts'o <[email protected]>
>> > ---
>> > fs/ext4/inode.c | 83 +++++++++++++++++++++++++++++++++----------------------
>> > 1 files changed, 50 insertions(+), 33 deletions(-)
>> >
>> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> > index cba7960..844c136 100644
>> > --- a/fs/ext4/inode.c
>> > +++ b/fs/ext4/inode.c
>> > @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
>> > int ret = 0, err, nr_pages, i;
>> > unsigned long index, end;
>> > struct pagevec pvec;
>> > + long pages_skipped;
>> >
>> > BUG_ON(mpd->next_page <= mpd->first_page);
>> > pagevec_init(&pvec, 0);
>> > @@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
>> > end = mpd->next_page - 1;
>> [1] In fact mpd->next_page may be bigger whan (index + wbc->nr_to_write)
>> this result in incorrect math while exit from mpage_da_writepages()
>> Probably we have bound end_index here.
>
> write_cache_pages also will write more than requested. The
> pagevec_lookup_tag can return more than nr_to_write page which
> implies wbc->nr_to_write can go negative.
>
>
>> end = min(mpd->next_page, index +wbc->nr_to_write) -1;
>> >
>> > while (index <= end) {
>> > - /* XXX: optimize tail */
>> > /*
>> > * We can use PAGECACHE_TAG_DIRTY lookup here because
>> > * even though we have cleared the dirty flag on the page
>> > @@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
>> > for (i = 0; i < nr_pages; i++) {
>> > struct page *page = pvec.pages[i];
>> >
>> > + pages_skipped = mpd->wbc->pages_skipped;
>> > err = mapping->a_ops->writepage(page, mpd->wbc);
>> > - if (!err)
>> > + if (!err && (pages_skipped == mpd->wbc->pages_skipped))
>> > + /*
>> > + * have successfully written the page
>> > + * without skipping the same
>> > + */
>> > mpd->pages_written++;
>> > /*
>> > * In error case, we have to continue because
>> > @@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
>> > struct writeback_control *wbc,
>> > struct mpage_da_data *mpd)
>> > {
>> > - long to_write;
>> > int ret;
>> >
>> > if (!mpd->get_block)
>> > @@ -2125,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping,
>> > mpd->pages_written = 0;
>> > mpd->retval = 0;
>> >
>> > - to_write = wbc->nr_to_write;
>> > -
>> > ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
>> > -
>> > /*
>> > * Handle last extent of pages
>> > */
>> > @@ -2137,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping,
>> > mpage_da_submit_io(mpd);
>> > }
>> >
>> > - wbc->nr_to_write = to_write - mpd->pages_written;
>> > + wbc->nr_to_write -= mpd->pages_written;
>> due to [1] wbc->nr_write becomes negative here
>
>
> wbc->nr_ti_write can go negative right ?
>
>
>
>> > return ret;
>> > }
>> >
>> > @@ -2366,11 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
>> > static int ext4_da_writepages(struct address_space *mapping,
>> > struct writeback_control *wbc)
>> > {
>> > + pgoff_t index;
>> > + int range_whole = 0;
>> > handle_t *handle = NULL;
>> > + long pages_written = 0;
>> > struct mpage_da_data mpd;
>> > struct inode *inode = mapping->host;
>> > + int no_nrwrite_update, no_index_update;
>> > int needed_blocks, ret = 0, nr_to_writebump = 0;
>> > - long to_write, pages_skipped = 0;
>> > struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
>> >
>> > /*
>> > @@ -2390,16 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping,
>> > nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
>> > wbc->nr_to_write = sbi->s_mb_stream_request;
>> > }
>> > + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
>> > + range_whole = 1;
>> >
>> > -
>> > - pages_skipped = wbc->pages_skipped;
>> > + if (wbc->range_cyclic)
>> > + index = mapping->writeback_index;
>> > + else
>> > + index = wbc->range_start >> PAGE_CACHE_SHIFT;
>> >
>> > mpd.wbc = wbc;
>> > mpd.inode = mapping->host;
>> >
>> > -restart_loop:
>> > - to_write = wbc->nr_to_write;
>> > - while (!ret && to_write > 0) {
>> > + /*
>> > + * we don't want write_cache_pages to update
>> > + * nr_to_write and writeback_index
>> > + */
>> > + no_nrwrite_update = wbc->no_nrwrite_update;
>> > + wbc->no_nrwrite_update = 1;
>> > + no_index_update = wbc->no_index_update;
>> > + wbc->no_index_update = 1;
>> > +
>> > + while (!ret && wbc->nr_to_write > 0) {
>> >
>> > /*
>> > * we insert one extent at a time. So we need
>> > @@ -2420,46 +2435,48 @@ static int ext4_da_writepages(struct address_space *mapping,
>> > dump_stack();
>> > goto out_writepages;
>> > }
>> > - to_write -= wbc->nr_to_write;
>> > -
>> > mpd.get_block = ext4_da_get_block_write;
>> > ret = mpage_da_writepages(mapping, wbc, &mpd);
>> >
>> > ext4_journal_stop(handle);
>> >
>> > - if (mpd.retval == -ENOSPC)
>> > + if (mpd.retval == -ENOSPC) {
>> > + /* commit the transaction which would
>> > + * free blocks released in the transaction
>> > + * and try again
>> > + */
>> > jbd2_journal_force_commit_nested(sbi->s_journal);
>> > -
>> > - /* reset the retry count */
>> > - if (ret == MPAGE_DA_EXTENT_TAIL) {
>> > + ret = 0;
>> > + } else if (ret == MPAGE_DA_EXTENT_TAIL) {
>> > /*
>> > * got one extent now try with
>> > * rest of the pages
>> > */
>> > - to_write += wbc->nr_to_write;
>> > + pages_written += mpd.pages_written;
>> > ret = 0;
>> > - } else if (wbc->nr_to_write) {
>> > + } else if (wbc->nr_to_write)
>> > /*
>> > * There is no more writeout needed
>> > * or we requested for a noblocking writeout
>> > * and we found the device congested
>> > */
>> > - to_write += wbc->nr_to_write;
>> > break;
>> > - }
>> > - wbc->nr_to_write = to_write;
>> > - }
>> > -
>> > - if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
>> > - /* We skipped pages in this loop */
>> > - wbc->nr_to_write = to_write +
>> > - wbc->pages_skipped - pages_skipped;
>> > - wbc->pages_skipped = pages_skipped;
>> > - goto restart_loop;
>> > }
>> > + /* Update index */
>> > + index += pages_written;
>> > + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
>> > + /*
>> > + * set the writeback_index so that range_cyclic
>> > + * mode will write it back later
>> > + */
>> > + mapping->writeback_index = index;
>> >
>> > out_writepages:
>> > - wbc->nr_to_write = to_write - nr_to_writebump;
>> > + if (!no_nrwrite_update)
>> > + wbc->no_nrwrite_update = 0;
>> > + if (!no_index_update)
>> > + wbc->no_index_update = 0;
>> > + wbc->nr_to_write -= nr_to_writebump;
>> > return ret;
>> > }
>> BTW: please add following cleanup fix.
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 6efa4ca..c248cbc 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -2603,7 +2603,7 @@ static int ext4_da_write_end(struct file *file,
>> handle_t *handle = ext4_journal_current_handle();
>> loff_t new_i_size;
>> unsigned long start, end;
>> - int write_mode = (int)fsdata;
>> + int write_mode = (int)(unsigned long)fsdata;
>>
>> if (write_mode == FALL_BACK_TO_NONDELALLOC) {
>> if (ext4_should_order_data(inode)) {
>>
>
> Eric Sandeen already fixed it in the pathqueue.I can also
> see mainline having the fix. Which kernel are you trying ?
Ops.. i've missed it.
>
> -aneesh