Greetings,
This is a second try at sending ext4 fixes to 2.6.27.y, including an
important deadlock fix from kernel bugzilla #12579. The patches were
dropped the first time because they depended on a vfs patch which was
recently found to be broken on mainline. This time, the vfs patch is not
included in this set; only the ext4 patches are.
Here are the three patches I'm proposing for inclusion in 2.6.27.y:
Theodore Ts'o (1):
ext4: Use our own write_cache_pages()
Aneesh Kumar K.V (2):
ext4: Fix file fragmentation during large file write.
ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages
They will be posted as replies to this message.
I have made some changes to the first two patches to adapt them to the
current 2.6.27.47 tree. The changes are detailed in brackets above my
Signed-off-by line for each patch.
Jayson
From: Aneesh Kumar K.V <[email protected]>
Date: Thu Oct 16 10:10:36 2008 -0400
Subject: ext4: Fix file fragmentation during large file write.
commit 22208dedbd7626e5fc4339c417f8d24cc21f79d7 upstream.
The range_cyclic writeback mode uses the address_space writeback_index
as the start index for writeback. With delayed allocation we were
updating writeback_index wrongly resulting in highly fragmented file.
This patch reduces the number of extents reduced from 4000 to 27 for a
3GB file.
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
[[email protected]: Some changed lines from the original version of this patch were dropped, since they were rolled up with another cherry-picked patch applied to 2.6.27.y earlier.]
[[email protected]: Use of wbc->no_nrwrite_index_update was dropped, since write_cache_pages_da() implies it.]
Signed-off-by: Jayson R. King <[email protected]>
---
fs/ext4/inode.c | 77 ++++++++++++++++++++++++++--------------------
1 file changed, 45 insertions(+), 32 deletions(-)
diff -udrNp linux-2.6.27.orig/fs/ext4/inode.c linux-2.6.27/fs/ext4/inode.c
--- linux-2.6.27.orig/fs/ext4/inode.c 2010-05-28 12:52:08.450963139 -0500
+++ linux-2.6.27/fs/ext4/inode.c 2010-05-28 12:52:18.258962978 -0500
@@ -1721,7 +1721,11 @@ static int mpage_da_submit_io(struct mpa
pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
- if (!err)
+ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ /*
+ * have successfully written the page
+ * without skipping the same
+ */
mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -2295,7 +2299,6 @@ static int mpage_da_writepages(struct ad
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
- long to_write;
int ret;
if (!mpd->get_block)
@@ -2310,19 +2313,18 @@ static int mpage_da_writepages(struct ad
mpd->pages_written = 0;
mpd->retval = 0;
- to_write = wbc->nr_to_write;
-
ret = write_cache_pages_da(mapping, wbc, mpd);
-
/*
* Handle last extent of pages
*/
if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd);
- }
- wbc->nr_to_write = to_write - mpd->pages_written;
+ mpd->io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd->pages_written;
return ret;
}
@@ -2567,11 +2569,13 @@ static int ext4_da_writepages_trans_bloc
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
+ long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
- long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
/*
@@ -2605,16 +2609,20 @@ static int ext4_da_writepages(struct add
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
-
- pages_skipped = wbc->pages_skipped;
+ if (wbc->range_cyclic)
+ index = mapping->writeback_index;
+ else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
mpd.inode = mapping->host;
-restart_loop:
- to_write = wbc->nr_to_write;
- while (!ret && to_write > 0) {
+ pages_skipped = wbc->pages_skipped;
+
+ while (!ret && wbc->nr_to_write > 0) {
/*
* we insert one extent at a time. So we need
@@ -2647,46 +2655,51 @@ restart_loop:
goto out_writepages;
}
}
- to_write -= wbc->nr_to_write;
-
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);
ext4_journal_stop(handle);
- if (mpd.retval == -ENOSPC)
+ if (mpd.retval == -ENOSPC) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
jbd2_journal_force_commit_nested(sbi->s_journal);
-
- /* reset the retry count */
- if (ret == MPAGE_DA_EXTENT_TAIL) {
+ wbc->pages_skipped = pages_skipped;
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
- to_write += wbc->nr_to_write;
+ pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipped;
ret = 0;
- } else if (wbc->nr_to_write) {
+ } else if (wbc->nr_to_write)
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
* and we found the device congested
*/
- to_write += wbc->nr_to_write;
break;
- }
- wbc->nr_to_write = to_write;
}
+ if (pages_skipped != wbc->pages_skipped)
+ printk(KERN_EMERG "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
- if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
- /* We skipped pages in this loop */
- wbc->nr_to_write = to_write +
- wbc->pages_skipped - pages_skipped;
- wbc->pages_skipped = pages_skipped;
- goto restart_loop;
- }
+ /* Update index */
+ index += pages_written;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = index;
out_writepages:
- wbc->nr_to_write = to_write - nr_to_writebump;
+ wbc->nr_to_write -= nr_to_writebump;
return ret;
}
From: Aneesh Kumar K.V <[email protected]>
Date: Sat Feb 14 10:42:58 2009 -0500
Subject: ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages
commit 2acf2c261b823d9d9ed954f348b97620297a36b5 upstream.
With delayed allocation we lock the page in write_cache_pages() and
try to build an in memory extent of contiguous blocks. This is needed
so that we can get large contiguous blocks request. If range_cyclic
mode is enabled, write_cache_pages() will loop back to the 0 index if
no I/O has been done yet, and try to start writing from the beginning
of the range. That causes an attempt to take the page lock of lower
index page while holding the page lock of higher index page, which can
cause a dead lock with another writeback thread.
The solution is to implement the range_cyclic behavior in
ext4_da_writepages() instead.
http://bugzilla.kernel.org/show_bug.cgi?id=12579
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
Signed-off-by: Jayson R. King <[email protected]>
---
fs/ext4/inode.c | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
diff -udrNp linux-2.6.27.orig/fs/ext4/inode.c linux-2.6.27/fs/ext4/inode.c
--- linux-2.6.27.orig/fs/ext4/inode.c 2010-05-28 12:53:40.603963323 -0500
+++ linux-2.6.27/fs/ext4/inode.c 2010-05-28 12:53:45.204963256 -0500
@@ -2575,6 +2575,7 @@ static int ext4_da_writepages(struct add
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
long pages_written = 0, pages_skipped;
+ int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0, nr_to_writebump = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2612,9 +2613,15 @@ static int ext4_da_writepages(struct add
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- if (wbc->range_cyclic)
+ range_cyclic = wbc->range_cyclic;
+ if (wbc->range_cyclic) {
index = mapping->writeback_index;
- else
+ if (index)
+ cycled = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = LLONG_MAX;
+ wbc->range_cyclic = 0;
+ } else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
@@ -2622,6 +2629,7 @@ static int ext4_da_writepages(struct add
pages_skipped = wbc->pages_skipped;
+retry:
while (!ret && wbc->nr_to_write > 0) {
/*
@@ -2676,6 +2684,7 @@ static int ext4_da_writepages(struct add
pages_written += mpd.pages_written;
wbc->pages_skipped = pages_skipped;
ret = 0;
+ io_done = 1;
} else if (wbc->nr_to_write)
/*
* There is no more writeout needed
@@ -2684,6 +2693,13 @@ static int ext4_da_writepages(struct add
*/
break;
}
+ if (!io_done && !cycled) {
+ cycled = 1;
+ index = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = mapping->writeback_index - 1;
+ goto retry;
+ }
if (pages_skipped != wbc->pages_skipped)
printk(KERN_EMERG "This should not happen leaving %s "
"with nr_to_write = %ld ret = %d\n",
@@ -2691,6 +2707,7 @@ static int ext4_da_writepages(struct add
/* Update index */
index += pages_written;
+ wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
* set the writeback_index so that range_cyclic
From: Theodore Ts'o <[email protected]>
Date: Sun May 16 18:00:00 2010 -0400
Subject: ext4: Use our own write_cache_pages()
commit 8e48dcfbd7c0892b4cfd064d682cc4c95a29df32 upstream.
Make a copy of write_cache_pages() for the benefit of
ext4_da_writepages(). This allows us to simplify the code some, and
will allow us to further customize the code in future patches.
There are some nasty hacks in write_cache_pages(), which Linus has
(correctly) characterized as vile. I've just copied it into
write_cache_pages_da(), without trying to clean those bits up lest I
break something in the ext4's delalloc implementation, which is a bit
fragile right now. This will allow Dave Chinner to clean up
write_cache_pages() in mm/page-writeback.c, without worrying about
breaking ext4. Eventually write_cache_pages_da() will go away when I
rewrite ext4's delayed allocation and create a general
ext4_writepages() which is used for all of ext4's writeback. Until
now this is the lowest risk way to clean up the core
write_cache_pages() function.
Signed-off-by: "Theodore Ts'o" <[email protected]>
Cc: Dave Chinner <[email protected]>
[[email protected]: Dropped the hunks which reverted the use of no_nrwrite_index_update, since those lines weren't ever created on 2.6.27.y]
[[email protected]: Copied from 2.6.27.y's version of write_cache_pages(), plus the changes to it from patch "vfs: Add no_nrwrite_index_update writeback control flag"]
Signed-off-by: Jayson R. King <[email protected]>
---
fs/ext4/inode.c | 144 ++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 132 insertions(+), 12 deletions(-)
diff -udrNp linux-2.6.27.orig/fs/ext4/inode.c linux-2.6.27/fs/ext4/inode.c
--- linux-2.6.27.orig/fs/ext4/inode.c 2010-05-28 12:50:17.376962920 -0500
+++ linux-2.6.27/fs/ext4/inode.c 2010-05-28 12:50:33.361963378 -0500
@@ -2059,17 +2059,6 @@ static int __mpage_da_writepage(struct p
struct buffer_head *bh, *head, fake;
sector_t logical;
- if (mpd->io_done) {
- /*
- * Rest of the page in the page_vec
- * redirty then and skip then. We will
- * try to to write them again after
- * starting a new transaction
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return MPAGE_DA_EXTENT_TAIL;
- }
/*
* Can we merge this page to current extent?
*/
@@ -2160,6 +2149,137 @@ static int __mpage_da_writepage(struct p
}
/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call the callback function (which usually writes
+ * the pages).
+ *
+ * This is a forked version of write_cache_pages(). Differences:
+ * Range cyclic is ignored.
+ * no_nrwrite_index_update is always presumed true
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct mpage_da_data *mpd)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ long nr_to_write = wbc->nr_to_write;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ return 0;
+ }
+
+ pagevec_init(&pvec, 0);
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end) {
+ done = 1;
+ break;
+ }
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+ * real expectation of this data interity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = __mpage_da_writepage(page, wbc, mpd);
+
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ done = 1;
+ break;
+ }
+ }
+
+ if (nr_to_write > 0) {
+ nr_to_write--;
+ if (nr_to_write == 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ /*
+ * We stop writing back only if we are
+ * not doing integrity sync. In case of
+ * integrity sync we have to keep going
+ * because someone may be concurrently
+ * dirtying pages, and we might have
+ * synced a lot of newly appeared dirty
+ * pages, but have not synced all of the
+ * old dirty pages.
+ */
+ done = 1;
+ break;
+ }
+ }
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return ret;
+}
+
+
+/*
* mpage_da_writepages - walk the list of dirty pages of the given
* address space, allocates non-allocated blocks, maps newly-allocated
* blocks to existing bhs and issue IO them
@@ -2192,7 +2312,7 @@ static int mpage_da_writepages(struct ad
to_write = wbc->nr_to_write;
- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
+ ret = write_cache_pages_da(mapping, wbc, mpd);
/*
* Handle last extent of pages
On Fri, May 28, 2010 at 02:26:25PM -0500, Jayson R. King wrote:
> From: Theodore Ts'o <[email protected]>
> Date: Sun May 16 18:00:00 2010 -0400
> Subject: ext4: Use our own write_cache_pages()
>
> commit 8e48dcfbd7c0892b4cfd064d682cc4c95a29df32 upstream.
>
> Make a copy of write_cache_pages() for the benefit of
> ext4_da_writepages(). This allows us to simplify the code some, and
> will allow us to further customize the code in future patches.
>
> There are some nasty hacks in write_cache_pages(), which Linus has
> (correctly) characterized as vile. I've just copied it into
> write_cache_pages_da(), without trying to clean those bits up lest I
> break something in the ext4's delalloc implementation, which is a bit
> fragile right now. This will allow Dave Chinner to clean up
> write_cache_pages() in mm/page-writeback.c, without worrying about
> breaking ext4. Eventually write_cache_pages_da() will go away when I
> rewrite ext4's delayed allocation and create a general
> ext4_writepages() which is used for all of ext4's writeback. Until
> now this is the lowest risk way to clean up the core
> write_cache_pages() function.
>
> Signed-off-by: "Theodore Ts'o" <[email protected]>
This doesn't fix a bug; it's to make it easy for Dave Chinner to make
some changes to fix XFS's performance and to undo some ext4-specific
changes to write_cache_pages(). I'm not sure there's a good reason to
backport this to 2.6.27.y....
- Ted
On Fri, May 28, 2010 at 02:26:57PM -0500, Jayson R. King wrote:
> From: Aneesh Kumar K.V <[email protected]>
> Date: Thu Oct 16 10:10:36 2008 -0400
> Subject: ext4: Fix file fragmentation during large file write.
>
> commit 22208dedbd7626e5fc4339c417f8d24cc21f79d7 upstream.
>
> The range_cyclic writeback mode uses the address_space writeback_index
> as the start index for writeback. With delayed allocation we were
> updating writeback_index wrongly resulting in highly fragmented file.
> This patch reduces the number of extents reduced from 4000 to 27 for a
> 3GB file.
This isn't a critical bug fix either. I don't really care a whole
lot, since I don't plan to support ext4 with all of these patches but
if you haven't been doing a full set of testing with these patches,
I'd be very concerned about whether ext4 would be stable after
applying this patch series.
What sort of testing _have_ you done?
- Ted
On 05/28/2010 07:49 PM, [email protected] wrote:
> This doesn't fix a bug; it's to make it easy for Dave Chinner to make
> some changes to fix XFS's performance and to undo some ext4-specific
> changes to write_cache_pages(). I'm not sure there's a good reason to
> backport this to 2.6.27.y....
The difference is that, 2.6.27's write_cache_pages() in page-writeback.c
still updates wbc->nr_to_write, since the patch which changed that
behavior was dropped from .27-rc2 due to the XFS regression it causes on
mainline. ext4 appears to want the behavior of write_cache_pages which
does not update wbc->nr_to_write. This write_cache_pages_da() does what
ext4 wants, without introducing the XFS regression. So I believe it is
needed.
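To make that concrete, here is a rough userspace model of the accounting
(only a sketch I put together for illustration -- struct wb_model,
flush_extent(), model_writepages() and the page counts are invented
stand-ins for the wbc/mpd bookkeeping, not kernel code):

/*
 * Rough userspace model of the nr_to_write accounting, NOT kernel code.
 * "wb_model" stands in for writeback_control plus mpage_da_data, and
 * flush_extent() stands in for one pass of the page walker handing up
 * to 16 pages to ->writepage.
 */
#include <stdio.h>

struct wb_model {
	long nr_to_write;	/* like wbc->nr_to_write */
	long pages_written;	/* like mpd.pages_written */
};

static void flush_extent(struct wb_model *m, int walker_decrements)
{
	int i;

	m->pages_written = 0;
	for (i = 0; i < 16; i++) {
		if (walker_decrements && m->nr_to_write <= 0)
			break;			/* stock walker stops on its own */
		m->pages_written++;		/* one page written */
		if (walker_decrements)
			m->nr_to_write--;	/* what the stock walker does */
	}
}

static long model_writepages(int walker_decrements)
{
	struct wb_model m = { .nr_to_write = 64, .pages_written = 0 };
	long total = 0;

	/* outer loop shaped like ext4_da_writepages() */
	while (m.nr_to_write > 0) {
		flush_extent(&m, walker_decrements);
		if (m.pages_written == 0)
			break;
		m.nr_to_write -= m.pages_written;	/* caller-side accounting */
		total += m.pages_written;
	}
	return total;
}

int main(void)
{
	printf("walker leaves nr_to_write alone: %ld of 64 pages written\n",
	       model_writepages(0));
	printf("walker decrements it as well:    %ld of 64 pages written\n",
	       model_writepages(1));
	return 0;
}

With a walker that leaves nr_to_write alone, the caller-side
"nr_to_write -= pages_written" accounting writes exactly what was asked
for; if the walker decrements it too, every page is charged twice and
the outer loop gives up early.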
Did I mis-judge?
Rgds,
Jayson
On 05/28/2010 08:06 PM, [email protected] wrote:
> On Fri, May 28, 2010 at 02:26:57PM -0500, Jayson R. King wrote:
>> From: Aneesh Kumar K.V<[email protected]>
>> Date: Thu Oct 16 10:10:36 2008 -0400
>> Subject: ext4: Fix file fragmentation during large file write.
>>
>> commit 22208dedbd7626e5fc4339c417f8d24cc21f79d7 upstream.
>>
>> The range_cyclic writeback mode uses the address_space writeback_index
>> as the start index for writeback. With delayed allocation we were
>> updating writeback_index wrongly resulting in highly fragmented file.
>> This patch reduces the number of extents reduced from 4000 to 27 for a
>> 3GB file.
>
> This isn't a critical bug fix either. I don't really care a whole
> lot, since I don't plan to support ext4 with all of these patches but
> if you haven't been doing a full set of testing with these patches,
> I'd be very concerned about whether ext4 would be stable after
> applying this patch series.
>
> What sort of testing _have_ you done?
I've run dbench for hours on an ext4 volume, followed by fsck on the
volume. Without the patches (particularly, just the last patch, "ext4:
Implement range_cyclic..."), a typical, sustained moderate to high ext4
fs load on .27 would often lead to a deadlock. A good demonstration is
to run "dbench 500" which will usually cause a deadlock in a couple of
minutes.
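To picture the hang: it is the lock-order inversion described in the
range_cyclic patch's commit message. A tiny userspace model -- pthread
mutexes standing in for page locks, with the thread names invented for
this sketch; it is not ext4 code -- wedges the same way:

/*
 * Userspace illustration only -- not ext4 code.  Two mutexes stand in
 * for the page locks of a lower- and a higher-index page.  One thread
 * takes them in ascending index order; the other models the
 * range_cyclic wrap-around, holding the higher-index lock and then
 * going back for the lower one.  Opposite ordering = ABBA deadlock.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t page_lo = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_hi = PTHREAD_MUTEX_INITIALIZER;

static void *writer_ascending(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&page_lo);	/* lower-index page first */
	usleep(1000);			/* widen the race window */
	pthread_mutex_lock(&page_hi);	/* then wait on the higher one */
	pthread_mutex_unlock(&page_hi);
	pthread_mutex_unlock(&page_lo);
	return NULL;
}

static void *writer_wrapping(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&page_hi);	/* already holds a higher-index page */
	usleep(1000);
	pthread_mutex_lock(&page_lo);	/* ...then wraps back toward index 0 */
	pthread_mutex_unlock(&page_lo);
	pthread_mutex_unlock(&page_hi);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, writer_ascending, NULL);
	pthread_create(&b, NULL, writer_wrapping, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("finished (this run got lucky)");
	return 0;
}

Build with "gcc -pthread"; it typically hangs immediately. The real fix
avoids the wrap-around inside the page walker entirely.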
I wasn't aware of a way to apply the deadlock fix in "ext4: Implement
range_cyclic..." without also introducing this patch, since some of the
code blocks it touches are created by this patch.
Thanks for looking.
Rgds,
Jayson
On 05/28/2010 08:41 PM, Jayson R. King wrote:
> On 05/28/2010 07:49 PM, [email protected] wrote:
>> This doesn't fix a bug; it's to make it easy for Dave Chinner to make
>> some changes to fix XFS's performance and to undo some ext4-specific
>> changes to write_cache_pages(). I'm not sure there's a good reason to
>> backport this to 2.6.27.y....
>
> The difference is that, 2.6.27's write_cache_pages() in page-writeback.c
> still updates wbc->nr_to_write, since the patch which changed that
> behavior was dropped from .27-rc2 due to the XFS regression it causes on
> mainline.
I meant, it was dropped from .27.47-rc2 stable. Sorry for any confusion.
> ext4 appears to want the behavior of write_cache_pages which
> does not update wbc->nr_to_write. This write_cache_pages_da() does what
> ext4 wants, without introducing the XFS regression. So I believe it is
> needed.
>
> Did I mis-judge?
>
> Rgds,
>
> Jayson
>
On Fri, May 28, 2010 at 08:41:44PM -0500, Jayson R. King wrote:
>
> The difference is that, 2.6.27's write_cache_pages() in
> page-writeback.c still updates wbc->nr_to_write, since the patch
> which changed that behavior was dropped from .27-rc2 due to the XFS
> regression it causes on mainline. ext4 appears to want the behavior
> of write_cache_pages which does not update wbc->nr_to_write. This
> write_cache_pages_da() does what ext4 wants, without introducing the
> XFS regression. So I believe it is needed.
Ah, OK. So I understand the motivation now, and that's a valid
concern. The question is now: how much is the goal of the 2.6.27 stable
branch to fix bugs, and how much is it to get the best possible
performance, at least with respect to ext4? It's going to be harder
and harder to backport fixes to 2.6.27, and I can speak from
experience that it's very easy to introduce regressions while trying
to do backports, since sometimes an individual upstream commit can end
up introducing a regression, and while we do try to document
regression fixes in later commits, sometimes the documentation isn't
complete.
I just spent the better part of a day trying to fix up a backport
series for 2.6.32. When I was engaged in this particular exercise, it
turns out a particular commit to fix a quota deadlock introduced a
regression, and the fix to that introduced yet another, and there were
three or four patches that all needed to be pulled in at once. Except
initially I missed one, and that caused an i_blocks corruption issue
when using fallocate() that took me several hours and a reverse
git-bisection to find. (And this is one set of fixes that will
probably never be able to go into 2.6.27.y, since these changes also
interlock with probably a dozen or so quota changes that have also
gone in over the last couple of kernel releases.)
I'll also add that simply testing using dbench, as you said you used
in another e-mail message, really isn't good enough to find all
possible regressions (it wouldn't have found the i_blocks corruption
problem in my initial set of 2.6.32 ext4 backports patches, for
example, since dbench only tests a very limited set of fs operations,
which doesn't include fallocate, or quotas, or mmap for that matter.)
What I would recommend is using the XFSQA (also sometimes known as
xfstests) test suite to make sure that your changes are sound. Dbench
will sometimes find issues, yes, but in my experience it's not the
best tool. The fsstress program, which is called in a number of
different configurations by xfstests, has found all sorts of problems
that other things haven't been able to find. Run it on at least a
2-core system, or preferably a 4-core or 8-core system if you have it.
I generally run tests using both 4k and 1k blocksize file systems to
make sure there aren't problems where the fs blocksize is less than
the pagesize.
If you are willing to take on the support burden of ext4 for 2.6.27,
and do a lot of testing, I at least wouldn't have any objection to
these patches. It's really a question of risk vs. reward for the
users of the 2.6.27 stable tree, plus a question of someone willing to
take on the support/debugging burden, and how much testing is done to
appropriately tilt the risk/reward balance.
Regards,
- Ted
On 30.05.2010 23:25, [email protected] wrote:
> On Fri, May 28, 2010 at 08:41:44PM -0500, Jayson R. King wrote:
>>
>> The difference is that, 2.6.27's write_cache_pages() in
>> page-writeback.c still updates wbc->nr_to_write, since the patch
>> which changed that behavior was dropped from .27-rc2 due to the XFS
>> regression it causes on mainline. ext4 appears to want the behavior
>> of write_cache_pages which does not update wbc->nr_to_write. This
>> write_cache_pages_da() does what ext4 wants, without introducing the
>> XFS regression. So I believe it is needed.
>
> Ah, OK. So I understand the motivation now, and that's a valid
> concern. The question is now: how much is the goal of the 2.6.27 stable
> branch to fix bugs, and how much is it to get the best possible
> performance, at least with respect to ext4? It's going to be harder
> and harder to backport fixes to 2.6.27, and I can speak from
> experience that it's very easy to introduce regressions while trying
> to do backports, since sometimes an individual upstream commit can end
> up introducing a regression, and while we do try to document
> regression fixes in later commits, sometimes the documentation isn't
> complete.
>
> I just spent the better part of a day trying to fix up a backport
> series for 2.6.32. When I was engaged in this particular exercise, it
> turns out a particular commit to fix a quota deadlock introduced a
> regression, and the fix to that introduced yet another, and there were
> three or four patches that all needed to be pulled in at once. Except
> initially I missed one, and that caused an i_blocks corruption issue
> when using fallocate() that took me several hours and a reverse
> git-bisection to find. (And this is one set of fixes that will
> probably never be able to go into 2.6.27.y, since these changes also
> interlock with probably a dozen or so quota changes that have also
> gone in over the last couple of kernel releases.)
>
> I'll also add that simply testing using dbench, as you said you used
> in another e-mail message, really isn't good enough to find all
> possible regressions (it wouldn't have found the i_blocks corruption
> problem in my initial set of 2.6.32 ext4 backports patches, for
> example, since dbench only tests a very limited set of fs operations,
> which doesn't include fallocate, or quotas, or mmap for that matter.)
>
> What I would recommend is using the XFSQA (also sometimes known as
> xfstests) test suite to make sure that your changes are sound. Dbench
> will sometimes find issues, yes, but in my experience it's not the
> best tool. The fsstress program, which is called in a number of
> different configurations by xfstests, has found all sorts of problems
> that other things haven't been able to find. Run it on at least a
> 2-core system, or preferably a 4-core or 8-core system if you have it.
> I generally run tests using both 4k and 1k blocksize file systems to
> make sure there aren't problems where the fs blocksize is less than
> the pagesize.
>
> If you are willing to take on the support burden of ext4 for 2.6.27,
> and do a lot of testing, I at least wouldn't have any objection to
> these patches. It's really a question of risk vs. reward for the
> users of the 2.6.27 stable tree, plus a question of someone willing to
> take on the support/debugging burden, and how much testing is done to
> appropriately tilt the risk/reward balance.
>
> Regards,
>
> - Ted
For what it's worth: my 2.6.27.45 fileservers deadlock reproducibly
after 1 to 2 minutes of heavy NFS load, when using ext4 (never had a
problem with ext3). Jayson King's patch series (posted Feb 27) fixed
this, and I've been running it since May 1 without problems.
From my experience, I'd say that the ext4 deadlock needs to be fixed;
otherwise ext4 in 2.6.27 should not be called stable.
best wishes,
Kay
On Mon, May 31, 2010 at 2:35 AM, Kay Diederichs
<[email protected]> wrote:
> On 30.05.2010 23:25, [email protected] wrote:
>>
>> On Fri, May 28, 2010 at 08:41:44PM -0500, Jayson R. King wrote:
>>>
>>> The difference is that, 2.6.27's write_cache_pages() in
>>> page-writeback.c still updates wbc->nr_to_write, since the patch
>>> which changed that behavior was dropped from .27-rc2 due to the XFS
>>> regression it causes on mainline. ext4 appears to want the behavior
>>> of write_cache_pages which does not update wbc->nr_to_write. This
>>> write_cache_pages_da() does what ext4 wants, without introducing the
>>> XFS regression. So I believe it is needed.
>>
>> Ah, OK. So I understand the motivation now, and that's a valid
>> concern. The question is now: how much is the goal of the 2.6.27 stable
>> branch to fix bugs, and how much is it to get the best possible
>> performance, at least with respect to ext4? It's going to be harder
>> and harder to backport fixes to 2.6.27, and I can speak from
>> experience that it's very easy to introduce regressions while trying
>> to do backports, since sometimes an individual upstream commit can end
>> up introducing a regression, and while we do try to document
>> regression fixes in later commits, sometimes the documentation isn't
>> complete.
>>
>> I just spent the better part of a day trying to fix up a backport
>> series for 2.6.32. When I was engaged in this particular exercise, it
>> turns out a particular commit to fix a quota deadlock introduced a
>> regression, and the fix to that introduced yet another, and there were
>> three or four patches that all needed to be pulled in at once. Except
>> initially I missed one, and that caused an i_blocks corruption issue
>> when using fallocate() that took me several hours and a reverse
>> git-bisection to find. (And this is one set of fixes that will
>> probably never be able to go into 2.6.27.y, since these changes also
>> interlock with probably a dozen or so quota changes that have also
>> gone in over the last couple of kernel releases.)
>>
>> I'll also add that simply testing using dbench, as you said you used
>> in another e-mail message, really isn't good enough to find all
>> possible regressions (it wouldn't have found the i_blocks corruption
>> problem in my initial set of 2.6.32 ext4 backports patches, for
>> example, since dbench only tests a very limited set of fs operations,
>> which doesn't include fallocate, or quotas, or mmap for that matter.)
>>
>> What I would recommend is using the XFSQA (also sometimes known as
>> xfstests) test suite to make sure that your changes are sound. Dbench
>> will sometimes find issues, yes, but in my experience it's not the
>> best tool. The fsstress program, which is called in a number of
>> different configurations by xfstests, has found all sorts of problems
>> that other things haven't been able to find. Run it on at least a
>> 2-core system, or preferably a 4-core or 8-core system if you have it.
>> I generally run tests using both 4k and 1k blocksize file systems to
>> make sure there aren't problems where the fs blocksize is less than
>> the pagesize.
>>
>> If you are willing to take on the support burden of ext4 for 2.6.27,
>> and do a lot of testing, I at least wouldn't have any objection to
>> these patches. It's really a question of risk vs. reward for the
>> users of the 2.6.27 stable tree, plus a question of someone willing to
>> take on the support/debugging burden, and how much testing is done to
>> appropriately tilt the risk/reward balance.
>>
>> Regards,
>>
>> - Ted
>
> For what it's worth: my 2.6.27.45 fileservers deadlock reproducibly after 1
> to 2 minutes of heavy NFS load, when using ext4 (never had a problem with
> ext3). Jayson King's patch series (posted Feb 27) fixed this, and I've been
> running it since May 1 without problems.
>
> From my experience, I'd say that the ext4 deadlock needs to be fixed;
> otherwise ext4 in 2.6.27 should not be called stable.
>
> best wishes,
> Kay
It has always been marked experimental in 2.6.27, not stable, so I'm
totally lost about this effort.
See http://lxr.linux.no/#linux+v2.6.27.47/fs/Kconfig
config EXT4DEV_FS
	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
	depends on EXPERIMENTAL
	select JBD2
	select CRC16
	help
	  Ext4dev is a predecessor filesystem of the next generation
	  extended fs ext4, based on ext3 filesystem code. It will be
	  renamed ext4 fs later, once ext4dev is mature and stabilized.
	...
	  If unsure, say N.
Greg
On Jun 1, 2010, at 9:54 AM, Greg Freemyer wrote:
>
> It has always been marked experimental in 2.6.27, not stable so I'm
> totally lost about this effort.
>
> See http://lxr.linux.no/#linux+v2.6.27.47/fs/Kconfig
This is one of the things that confuses me, actually. Why is it that there are a number of people who want to use ext4 on 2.6.27? Even the enterprise distro's have moved on; SLES 11 SP1 upgraded their users from 2.6.27 to 2.6.32, for example. I wonder if it's time to start a new "stable anchor point" around 2.6.32, given that Ubuntu's latest Long-Term Stable (Lucid LTS) is based on 2.6.32, as is SLES 11 SP1. The RHEL 6 beta is also based on 2.6.32. (And I just spent quite a bit of time over the past week backporting a lot of ext4 bug fixes to 2.6.32.y :-)
If there are people who want to work on trying to backport more ext4 fixes to 2.6.27, they're of course free to do so. I am really curious as to *why*, though.
Regards,
-- Ted
Theodore Tso wrote:
> On Jun 1, 2010, at 9:54 AM, Greg Freemyer wrote:
>> It has always been marked experimental in 2.6.27, not stable so I'm
>> totally lost about this effort.
>>
>> See http://lxr.linux.no/#linux+v2.6.27.47/fs/Kconfig
>
> This is one of the things that confuses me, actually. Why is it that there are a number of people who want to use ext4 on 2.6.27? Even the enterprise distro's have moved on; SLES 11 SP1 upgraded their users from 2.6.27 to 2.6.32, for example. I wonder if it's time to start a new "stable anchor point" around 2.6.32, given that Ubuntu's latest Long-Term Stable (Lucid LTS) is based on 2.6.32, as is SLES 11 SP1. The RHEL 6 beta is also based on 2.6.32. (And I just spent quite a bit of time over the past week backporting a lot of ext4 bug fixes to 2.6.32.y :-)
>
> If there are people who want to work on trying to backport more ext4 fixes to 2.6.27, they're of course free to do so. I am really curious as to *why*, though.
>
> Regards,
>
> -- Ted
>
The answer is: because 2.6.27.y is supposed to be a _stable_ kernel. If
it were e.g. 2.6.28 or 2.6.29, nobody would care. But as long as there
is a flow of backported fixes (and there have been quite a few ext4
fixes in 2.6.27) I have the expectation that known bugs get fixed sooner
or later.
If a subsystem maintainer says "I'm not going to support this old stable
thing any longer" then things change. But I hear this from you for the
first time - I may have missed earlier announcements to this effect, though.
best,
Kay
On 06/01/2010 09:49 AM, Theodore Tso wrote:
> This is one of the things that confuses me, actually. Why is it that there are a number of people who want to use ext4 on 2.6.27? Even the enterprise distro's have moved on; SLES 11 SP1 upgraded their users from 2.6.27 to 2.6.32, for example. I wonder if it's time to start a new "stable anchor point" around 2.6.32, given that Ubuntu's latest Long-Term Stable (Lucid LTS) is based on 2.6.32, as is SLES 11 SP1. The RHEL 6 beta is also based on 2.6.32. (And I just spent quite a bit of time over the past week backporting a lot of ext4 bug fixes to 2.6.32.y :-)
>
> If there are people who want to work on trying to backport more ext4 fixes to 2.6.27, they're of course free to do so. I am really curious as to *why*, though.
2.6.27 is still a good kernel and ext4 is a good filesystem, IMO
(existing deadlock notwithstanding).
Like Kay Diederichs mentioned, .27 has received ext4 updates in the
past, even as recently as April this year ("ext4: Avoid null pointer
dereference..."). Though this of course does not imply that .27 should
receive ext4 fixes (or other fixes) forever, but it is nice to fix the
most serious, show-stopping problems if it is feasable.
(maybe OT?: When I made an attempt to switch to kernel .31 or .32
earlier, the kernel would not boot for me. Surely, I can do some
investigating and get it to boot some day, but I wasn't motivated to
solve it at the time and stuck with .27 instead.)
Thanks for the comments.
Rgds,
Jayson
On 05/30/2010 04:25 PM, [email protected] wrote:
> Ah, OK. So I understand the motivation now, and that's a valid
> concern. The question is now: how much is the goal of the 2.6.27 stable
> branch to fix bugs, and how much is it to get the best possible
> performance, at least with respect to ext4? It's going to be harder
> and harder to backport fixes to 2.6.27, and I can speak from
> experience that it's very easy to introduce regressions while trying
> to do backports, since sometimes an individual upstream commit can end
> up introducing a regression, and while we do try to document
> regression fixes in later commits, sometimes the documentation isn't
> complete.
Apologies for not making the motivation for these patches more clear.
The first two of these three patches really only exist as support for
the last one, which fixes the deadlock that I was really concerned
about. That was the same motivation for the earlier 11 patches I sent.
While the other patches may fix some real bugs on .27, that wasn't my
goal. So I won't be offering to send any more ext4-related fixes to
.27.y either unless it is for something really serious.
I considered carefully whether to send the patches as they are now, but
the alternative would be to re-work the fix in the last patch, which I
didn't want to tackle, for fear of getting it wrong.
> I just spent the better part of a day trying to fix up a backport
> series for 2.6.32. When I was engaged in this particular exercise, it
> turns out a particular commit to fix a quota deadlock introduced a
> regression, and the fix to that introduced yet another, and there were
> three or four patches that all needed to be pulled in at once. Except
> initially I missed one, and that caused an i_blocks corruption issue
> when using fallocate() that took me several hours and a reverse
> git-bisection to find. (And this is one set of fixes that will
> probably never be able to go into 2.6.27.y, since these changes also
> interlock with probably a dozen or so quota changes that have also
> gone in over the last couple of kernel releases.)
The concern is understandable, and I agree that .27.y is no longer a
good candidate for receiving noncritical ext4 fixes.
> I'll also add that simply testing using dbench, as you said you used
> in another e-mail message, really isn't good enough to find all
> possible regressions (it wouldn't have found the i_blocks corruption
> problem in my initial set of 2.6.32 ext4 backports patches, for
> example, since dbench only tests a very limited set of fs operations,
> which doesn't include fallocate, or quotas, or mmap for that matter.)
>
> What I would recommend is using the XFSQA (also sometimes known as
> xfstests) test suite to make sure that your changes are sound. Dbench
> will sometimes find issues, yes, but in my experience it's not the
> best tool. The fsstress program, which is called in a number of
> different configurations by xfstests, has found all sorts of problems
> that other things haven't been able to find. Run it on at least a
> 2-core system, or preferably a 4-core or 8-core system if you have it.
> I generally run tests using both 4k and 1k blocksize file systems to
> make sure there aren't problems where the fs blocksize is less than
> the pagesize.
>
> If you are willing to take on the support burden of ext4 for 2.6.27,
> and do a lot of testing, I at least wouldn't have any objection to
> these patches. It's really a question of risk vs. reward for the
> users of the 2.6.27 stable tree, plus a question of someone willing to
> take on the support/debugging burden, and how much testing is done to
> appropriately tilt the risk/reward balance.
Thanks for the tip. I will get xfstests running and report the results
in a few days.
Jayson
On Tue, Jun 01, 2010 at 03:06:37PM -0500, Jayson R. King wrote:
> Like Kay Diederichs mentioned, .27 has received ext4 updates in the
> past, even as recently as April this year ("ext4: Avoid null pointer
> dereference..."). Though this of course does not imply that .27
> should receive ext4 fixes (or other fixes) forever, but it is nice
> to fix the most serious, show-stopping problems if it is feasable.
Some people have, but I had stopped doing wholesale attempts at
backports about 6-9 months ago, due to lack of time and because 2.6.27
was just getting too hard to backport to. So what has been getting
backported has been a bit scattershot. In retrospect, if
I had known people would want to keep it going for this long, I
would have been more aggressive about backporting patches which might
not (strictly speaking) meet the "critical bugfix" category, but which
enable backporting of future important fixes without having to go to
some pretty extreme lengths to make the backport work.
(And past a certain point, we end up needing to manually regen each
patch, and because quota and i_blocks updates are so closely tied
together, and quota received a bunch of "clean up patches", we either
need to merge in all of the quota "clean ups", or we need to regen the
patch pretty much from scratch as part of the stable backport.)
> (maybe OT?: When I made an attempt to switch to kernel .31 or .32
> earlier, the kernel would not boot for me. Surely, I can do some
> investigating and get it to boot some day, but I wasn't motivated to
> solve it at the time and stuck with .27 instead.)
You might want to try the latest 2.6.32 stable kernel and see if it
works any better for you. Since all of the major enterprise distros
are using 2.6.32 as a base, it's likely that a lot of problems that may
have been in the original 2.6.32 kernel have been fixed.
And, to sweeten the pot, I've done all of the backporting of the ext4
patches to 2.6.32 already, and will probably keep it up for a while,
just because the enterprise distros are focusing on 2.6.32.
- Ted
This is a note to let you know that I've just added the patch titled
ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages
to the 2.6.27-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch
and it can be found in the queue-2.6.27 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <[email protected]> know about it.
>From [email protected] Fri Jun 25 15:33:41 2010
From: Aneesh Kumar K.V <[email protected]>
Date: Fri, 28 May 2010 14:27:23 -0500
Subject: ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages
To: Stable team <[email protected]>, LKML <[email protected]>, Greg Kroah-Hartman <[email protected]>
Cc: "Theodore Ts'o" <[email protected]>, Dave Chinner <[email protected]>, "Jayson R. King" <[email protected]>, Kay Diederichs <[email protected]>, Ext4 Developers List <[email protected]>, "Aneesh Kumar K.V" <[email protected]>
Message-ID: <[email protected]>
From: Aneesh Kumar K.V <[email protected]>
commit 2acf2c261b823d9d9ed954f348b97620297a36b5 upstream.
With delayed allocation we lock the page in write_cache_pages() and
try to build an in memory extent of contiguous blocks. This is needed
so that we can get large contiguous blocks request. If range_cyclic
mode is enabled, write_cache_pages() will loop back to the 0 index if
no I/O has been done yet, and try to start writing from the beginning
of the range. That causes an attempt to take the page lock of lower
index page while holding the page lock of higher index page, which can
cause a dead lock with another writeback thread.
The solution is to implement the range_cyclic behavior in
ext4_da_writepages() instead.
http://bugzilla.kernel.org/show_bug.cgi?id=12579
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
Signed-off-by: Jayson R. King <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
---
fs/ext4/inode.c | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2575,6 +2575,7 @@ static int ext4_da_writepages(struct add
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
long pages_written = 0, pages_skipped;
+ int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0, nr_to_writebump = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2612,9 +2613,15 @@ static int ext4_da_writepages(struct add
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- if (wbc->range_cyclic)
+ range_cyclic = wbc->range_cyclic;
+ if (wbc->range_cyclic) {
index = mapping->writeback_index;
- else
+ if (index)
+ cycled = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = LLONG_MAX;
+ wbc->range_cyclic = 0;
+ } else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
@@ -2622,6 +2629,7 @@ static int ext4_da_writepages(struct add
pages_skipped = wbc->pages_skipped;
+retry:
while (!ret && wbc->nr_to_write > 0) {
/*
@@ -2676,6 +2684,7 @@ static int ext4_da_writepages(struct add
pages_written += mpd.pages_written;
wbc->pages_skipped = pages_skipped;
ret = 0;
+ io_done = 1;
} else if (wbc->nr_to_write)
/*
* There is no more writeout needed
@@ -2684,6 +2693,13 @@ static int ext4_da_writepages(struct add
*/
break;
}
+ if (!io_done && !cycled) {
+ cycled = 1;
+ index = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = mapping->writeback_index - 1;
+ goto retry;
+ }
if (pages_skipped != wbc->pages_skipped)
printk(KERN_EMERG "This should not happen leaving %s "
"with nr_to_write = %ld ret = %d\n",
@@ -2691,6 +2707,7 @@ static int ext4_da_writepages(struct add
/* Update index */
index += pages_written;
+ wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
* set the writeback_index so that range_cyclic
Patches currently in stable-queue which might be from [email protected] are
queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch
queue-2.6.27/ext4-use-our-own-write_cache_pages.patch
queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch
This is a note to let you know that I've just added the patch titled
ext4: Fix file fragmentation during large file write.
to the 2.6.27-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
ext4-fix-file-fragmentation-during-large-file-write.patch
and it can be found in the queue-2.6.27 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <[email protected]> know about it.
>From [email protected] Fri Jun 25 15:33:09 2010
From: Aneesh Kumar K.V <[email protected]>
Date: Fri, 28 May 2010 14:26:57 -0500
Subject: ext4: Fix file fragmentation during large file write.
Cc: "Jayson R. King" <[email protected]>, Theodore Ts'o <[email protected]>, "Aneesh Kumar K.V" <[email protected]>, Dave Chinner <[email protected]>, Ext4 Developers List <[email protected]>, Kay Diederichs <[email protected]>
Message-ID: <[email protected]>
From: Aneesh Kumar K.V <[email protected]>
commit 22208dedbd7626e5fc4339c417f8d24cc21f79d7 upstream.
The range_cyclic writeback mode uses the address_space writeback_index
as the start index for writeback. With delayed allocation we were
updating writeback_index wrongly resulting in highly fragmented file.
This patch reduces the number of extents reduced from 4000 to 27 for a
3GB file.
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
[[email protected]: Some changed lines from the original version of this patch were dropped, since they were rolled up with another cherry-picked patch applied to 2.6.27.y earlier.]
[[email protected]: Use of wbc->no_nrwrite_index_update was dropped, since write_cache_pages_da() implies it.]
Signed-off-by: Jayson R. King <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
---
fs/ext4/inode.c | 79 ++++++++++++++++++++++++++++++++------------------------
1 file changed, 46 insertions(+), 33 deletions(-)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1721,7 +1721,11 @@ static int mpage_da_submit_io(struct mpa
pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
- if (!err)
+ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ /*
+ * have successfully written the page
+ * without skipping the same
+ */
mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -2295,7 +2299,6 @@ static int mpage_da_writepages(struct ad
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
- long to_write;
int ret;
if (!mpd->get_block)
@@ -2310,19 +2313,18 @@ static int mpage_da_writepages(struct ad
mpd->pages_written = 0;
mpd->retval = 0;
- to_write = wbc->nr_to_write;
-
ret = write_cache_pages_da(mapping, wbc, mpd);
-
/*
* Handle last extent of pages
*/
if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd);
- }
- wbc->nr_to_write = to_write - mpd->pages_written;
+ mpd->io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd->pages_written;
return ret;
}
@@ -2567,11 +2569,13 @@ static int ext4_da_writepages_trans_bloc
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
+ long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
- long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
/*
@@ -2605,16 +2609,20 @@ static int ext4_da_writepages(struct add
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
-
- pages_skipped = wbc->pages_skipped;
+ if (wbc->range_cyclic)
+ index = mapping->writeback_index;
+ else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
mpd.inode = mapping->host;
-restart_loop:
- to_write = wbc->nr_to_write;
- while (!ret && to_write > 0) {
+ pages_skipped = wbc->pages_skipped;
+
+ while (!ret && wbc->nr_to_write > 0) {
/*
* we insert one extent at a time. So we need
@@ -2647,46 +2655,51 @@ restart_loop:
goto out_writepages;
}
}
- to_write -= wbc->nr_to_write;
-
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);
ext4_journal_stop(handle);
- if (mpd.retval == -ENOSPC)
+ if (mpd.retval == -ENOSPC) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
jbd2_journal_force_commit_nested(sbi->s_journal);
-
- /* reset the retry count */
- if (ret == MPAGE_DA_EXTENT_TAIL) {
+ wbc->pages_skipped = pages_skipped;
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
- to_write += wbc->nr_to_write;
+ pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipped;
ret = 0;
- } else if (wbc->nr_to_write) {
+ } else if (wbc->nr_to_write)
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
* and we found the device congested
*/
- to_write += wbc->nr_to_write;
break;
- }
- wbc->nr_to_write = to_write;
- }
-
- if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
- /* We skipped pages in this loop */
- wbc->nr_to_write = to_write +
- wbc->pages_skipped - pages_skipped;
- wbc->pages_skipped = pages_skipped;
- goto restart_loop;
}
+ if (pages_skipped != wbc->pages_skipped)
+ printk(KERN_EMERG "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
+
+ /* Update index */
+ index += pages_written;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = index;
out_writepages:
- wbc->nr_to_write = to_write - nr_to_writebump;
+ wbc->nr_to_write -= nr_to_writebump;
return ret;
}
Patches currently in stable-queue which might be from [email protected] are
queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch
queue-2.6.27/ext4-use-our-own-write_cache_pages.patch
queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch
This is a note to let you know that I've just added the patch titled
ext4: Use our own write_cache_pages()
to the 2.6.27-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
ext4-use-our-own-write_cache_pages.patch
and it can be found in the queue-2.6.27 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <[email protected]> know about it.
>From [email protected] Fri Jun 25 15:32:26 2010
From: Theodore Ts'o <[email protected]>
Date: Fri, 28 May 2010 14:26:25 -0500
Subject: ext4: Use our own write_cache_pages()
Cc: "Theodore Ts'o" <[email protected]>, Dave Chinner <[email protected]>, "Jayson R. King" <[email protected]>, Kay Diederichs <[email protected]>, Ext4 Developers List <[email protected]>, "Aneesh Kumar K.V" <[email protected]>
Message-ID: <[email protected]>
From: Theodore Ts'o <[email protected]>
commit 8e48dcfbd7c0892b4cfd064d682cc4c95a29df32 upstream.
Make a copy of write_cache_pages() for the benefit of
ext4_da_writepages(). This allows us to simplify the code some, and
will allow us to further customize the code in future patches.
There are some nasty hacks in write_cache_pages(), which Linus has
(correctly) characterized as vile. I've just copied it into
write_cache_pages_da(), without trying to clean those bits up lest I
break something in the ext4's delalloc implementation, which is a bit
fragile right now. This will allow Dave Chinner to clean up
write_cache_pages() in mm/page-writeback.c, without worrying about
breaking ext4. Eventually write_cache_pages_da() will go away when I
rewrite ext4's delayed allocation and create a general
ext4_writepages() which is used for all of ext4's writeback. Until
now this is the lowest risk way to clean up the core
write_cache_pages() function.
Signed-off-by: "Theodore Ts'o" <[email protected]>
Cc: Dave Chinner <[email protected]>
[[email protected]: Dropped the hunks which reverted the use of no_nrwrite_index_update, since those lines weren't ever created on 2.6.27.y]
[[email protected]: Copied from 2.6.27.y's version of write_cache_pages(), plus the changes to it from patch "vfs: Add no_nrwrite_index_update writeback control flag"]
Signed-off-by: Jayson R. King <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
---
fs/ext4/inode.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 132 insertions(+), 12 deletions(-)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2059,17 +2059,6 @@ static int __mpage_da_writepage(struct p
struct buffer_head *bh, *head, fake;
sector_t logical;
- if (mpd->io_done) {
- /*
- * Rest of the page in the page_vec
- * redirty then and skip then. We will
- * try to to write them again after
- * starting a new transaction
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return MPAGE_DA_EXTENT_TAIL;
- }
/*
* Can we merge this page to current extent?
*/
@@ -2160,6 +2149,137 @@ static int __mpage_da_writepage(struct p
}
/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call the callback function (which usually writes
+ * the pages).
+ *
+ * This is a forked version of write_cache_pages(). Differences:
+ * Range cyclic is ignored.
+ * no_nrwrite_index_update is always presumed true
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct mpage_da_data *mpd)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ long nr_to_write = wbc->nr_to_write;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ return 0;
+ }
+
+ pagevec_init(&pvec, 0);
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end) {
+ done = 1;
+ break;
+ }
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+ * real expectation of this data interity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = __mpage_da_writepage(page, wbc, mpd);
+
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ done = 1;
+ break;
+ }
+ }
+
+ if (nr_to_write > 0) {
+ nr_to_write--;
+ if (nr_to_write == 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ /*
+ * We stop writing back only if we are
+ * not doing integrity sync. In case of
+ * integrity sync we have to keep going
+ * because someone may be concurrently
+ * dirtying pages, and we might have
+ * synced a lot of newly appeared dirty
+ * pages, but have not synced all of the
+ * old dirty pages.
+ */
+ done = 1;
+ break;
+ }
+ }
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return ret;
+}
+
+
+/*
* mpage_da_writepages - walk the list of dirty pages of the given
* address space, allocates non-allocated blocks, maps newly-allocated
* blocks to existing bhs and issue IO them
@@ -2192,7 +2312,7 @@ static int mpage_da_writepages(struct ad
to_write = wbc->nr_to_write;
- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
+ ret = write_cache_pages_da(mapping, wbc, mpd);
/*
* Handle last extent of pages
Patches currently in stable-queue which might be from [email protected] are
queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch
queue-2.6.27/ext4-use-our-own-write_cache_pages.patch
queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch
queue-2.6.27/ext4-check-s_log_groups_per_flex-in-online-resize-code.patch