2008-05-29 19:30:18

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH] ext4: Fix delalloc sync hang with journal lock inversion

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 102 ++++++++++++++++++++++++++++++++++----------------
fs/mpage.c | 14 +++---
mm/page-writeback.c | 7 +++-
3 files changed, 82 insertions(+), 41 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1a2959d..7aa4903 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1480,50 +1480,76 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
up_write(&EXT4_I(inode)->i_data_sem);

if (EXT4_I(inode)->i_disksize == disksize) {
- if (handle == NULL)
- handle = ext4_journal_start(inode, 1);
- if (!IS_ERR(handle))
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
}
}
-
ret = 0;
}
-
return ret;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ return (!buffer_mapped(bh) || buffer_delay(bh));
+}
/* FIXME!! only support data=writeback mode */
-static int __ext4_da_writepage(struct page *page,
+/*
+ * get called vi ext4_da_writepages after taking page lock
+ * We may end up doing block allocation here in case
+ * mpage_da_map_blocks failed to allocate blocks.
+ */
+static int ext4_da_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ int ret = 0, start_handle = 0;
+ loff_t size;
+ unsigned long len;
handle_t *handle = NULL;
- int ret = 0;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;

handle = ext4_journal_current_handle();
+ if (!handle) {
+ /* This can happen when we aren't called via
+ * ext4_da_writepages() but directly (shrink_page_lis).
+ * We cannot easily start a transaction here so we just skip
+ * writing the page in case we would have to do so.
+ */
+ size = i_size_read(inode);

- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ page_bufs = page_buffers(page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;

- if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
- EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_mark_inode_dirty(handle, inode);
+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ /*
+ * We can't do block allocation under
+ * page lock without a handle . So redirty
+ * the page and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
}

+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
+
return ret;
}
-static int ext4_da_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- if (!ext4_journal_current_handle())
- return __ext4_da_writepage(page, wbc);

- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
-}

/*
* For now just follow the DIO way to estimate the max credits
@@ -1547,6 +1573,7 @@ static int ext4_da_writepages(struct address_space *mapping,
int ret = 0;
unsigned range_cyclic;
long to_write;
+ pgoff_t index;

/*
* Estimate the worse case needed credits to write out
@@ -1557,6 +1584,14 @@ static int ext4_da_writepages(struct address_space *mapping,
to_write = wbc->nr_to_write;
range_cyclic = wbc->range_cyclic;
wbc->range_cyclic = 1;
+ index = mapping->writeback_index;
+ if (!range_cyclic) {
+ /* We force cyclic write out of pages. If the
+ * caller didn't request for range_cyclic update
+ * the writeback_index to what the caller requested.
+ */
+ mapping->writeback_index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ }

while (!ret && to_write) {
/* start a new transaction*/
@@ -1571,16 +1606,22 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
- to_write -= wbc->nr_to_write;

+ to_write -= wbc->nr_to_write;
ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
ext4_journal_stop(handle);
- to_write +=wbc->nr_to_write;
+ if (wbc->nr_to_write) {
+ /* There is no more writeout needed */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
}
-
out_writepages:
wbc->nr_to_write = to_write;
wbc->range_cyclic = range_cyclic;
+ if (!range_cyclic)
+ mapping->writeback_index = index;
return ret;
}

@@ -1718,11 +1759,6 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
return 0;
}

-static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
-{
- return !buffer_mapped(bh);
-}
-
/*
* Note that we don't need to start a transaction unless we're journaling
* data because we should have holes filled from ext4_page_mkwrite(). If
diff --git a/fs/mpage.c b/fs/mpage.c
index cde7f11..c107728 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -849,13 +849,12 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
do {
if (cur_logical >= logical + blocks)
break;
-
if (buffer_delay(bh)) {
bh->b_blocknr = pblock;
clear_buffer_delay(bh);
- } else if (buffer_mapped(bh)) {
+ set_buffer_mapped(bh);
+ } else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
- }

cur_logical++;
pblock++;
@@ -930,10 +929,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
if (buffer_delay(lbh))
mpage_put_bnr_to_bhs(mpd, next, &new);

- /* go for the remaining blocks */
- next += new.b_size >> mpd->inode->i_blkbits;
- remain -= new.b_size;
- }
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
}

#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
@@ -1052,6 +1051,7 @@ static int __mpage_da_writepage(struct page *page,
head = page_buffers(page);
bh = head;
do {
+
BUG_ON(buffer_locked(bh));
if (buffer_dirty(bh))
mpage_add_bh_to_extent(mpd, logical, bh);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6ad..655b8bf 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -881,7 +881,12 @@ int write_cache_pages(struct address_space *mapping,
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
- end = -1;
+ /*
+ * write only till the specified range_end even in cyclic mode
+ */
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (!end)
+ end = -1;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
--
1.5.5.1.357.g1af8b.dirty