From: "Aneesh Kumar K.V" Subject: [RFC PATCH] ext4: Fix delalloc sync hang with journal lock inversion Date: Fri, 30 May 2008 01:00:10 +0530 Message-ID: <1212089410-12048-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com> Cc: linux-ext4@vger.kernel.org, "Aneesh Kumar K.V" To: cmm@us.ibm.com Return-path: Received: from E23SMTP01.au.ibm.com ([202.81.18.162]:51778 "EHLO e23smtp01.au.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752321AbYE2TaS (ORCPT ); Thu, 29 May 2008 15:30:18 -0400 Received: from sd0109e.au.ibm.com (d23rh905.au.ibm.com [202.81.18.225]) by e23smtp01.au.ibm.com (8.13.1/8.13.1) with ESMTP id m4TJUqBs003665 for ; Fri, 30 May 2008 05:30:52 +1000 Received: from d23av04.au.ibm.com (d23av04.au.ibm.com [9.190.235.139]) by sd0109e.au.ibm.com (8.13.8/8.13.8/NCO v8.7) with ESMTP id m4TJYPZ5254014 for ; Fri, 30 May 2008 05:34:25 +1000 Received: from d23av04.au.ibm.com (loopback [127.0.0.1]) by d23av04.au.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id m4TJUEOv006103 for ; Fri, 30 May 2008 05:30:14 +1000 Sender: linux-ext4-owner@vger.kernel.org List-ID: Signed-off-by: Aneesh Kumar K.V --- fs/ext4/inode.c | 102 ++++++++++++++++++++++++++++++++++---------------- fs/mpage.c | 14 +++--- mm/page-writeback.c | 7 +++- 3 files changed, 82 insertions(+), 41 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1a2959d..7aa4903 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1480,50 +1480,76 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, up_write(&EXT4_I(inode)->i_data_sem); if (EXT4_I(inode)->i_disksize == disksize) { - if (handle == NULL) - handle = ext4_journal_start(inode, 1); - if (!IS_ERR(handle)) - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; } } - ret = 0; } - return ret; } + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + return (!buffer_mapped(bh) || buffer_delay(bh)); +} /* FIXME!! only support data=writeback mode */ -static int __ext4_da_writepage(struct page *page, +/* + * get called vi ext4_da_writepages after taking page lock + * We may end up doing block allocation here in case + * mpage_da_map_blocks failed to allocate blocks. + */ +static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + int ret = 0, start_handle = 0; + loff_t size; + unsigned long len; handle_t *handle = NULL; - int ret = 0; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; handle = ext4_journal_current_handle(); + if (!handle) { + /* This can happen when we aren't called via + * ext4_da_writepages() but directly (shrink_page_lis). + * We cannot easily start a transaction here so we just skip + * writing the page in case we would have to do so. + */ + size = i_size_read(inode); - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); - else - ret = block_write_full_page(page, ext4_get_block, wbc); + page_bufs = page_buffers(page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; - if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) { - EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + if (walk_page_buffers(NULL, page_bufs, 0, + len, NULL, ext4_bh_unmapped_or_delay)) { + /* + * We can't do block allocation under + * page lock without a handle . So redirty + * the page and return + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } } + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_da_get_block_write, wbc); + else + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); + return ret; } -static int ext4_da_writepage(struct page *page, - struct writeback_control *wbc) -{ - if (!ext4_journal_current_handle()) - return __ext4_da_writepage(page, wbc); - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} /* * For now just follow the DIO way to estimate the max credits @@ -1547,6 +1573,7 @@ static int ext4_da_writepages(struct address_space *mapping, int ret = 0; unsigned range_cyclic; long to_write; + pgoff_t index; /* * Estimate the worse case needed credits to write out @@ -1557,6 +1584,14 @@ static int ext4_da_writepages(struct address_space *mapping, to_write = wbc->nr_to_write; range_cyclic = wbc->range_cyclic; wbc->range_cyclic = 1; + index = mapping->writeback_index; + if (!range_cyclic) { + /* We force cyclic write out of pages. If the + * caller didn't request for range_cyclic update + * the writeback_index to what the caller requested. + */ + mapping->writeback_index = wbc->range_start >> PAGE_CACHE_SHIFT; + } while (!ret && to_write) { /* start a new transaction*/ @@ -1571,16 +1606,22 @@ static int ext4_da_writepages(struct address_space *mapping, */ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; - to_write -= wbc->nr_to_write; + to_write -= wbc->nr_to_write; ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); ext4_journal_stop(handle); - to_write +=wbc->nr_to_write; + if (wbc->nr_to_write) { + /* There is no more writeout needed */ + to_write += wbc->nr_to_write; + break; + } + wbc->nr_to_write = to_write; } - out_writepages: wbc->nr_to_write = to_write; wbc->range_cyclic = range_cyclic; + if (!range_cyclic) + mapping->writeback_index = index; return ret; } @@ -1718,11 +1759,6 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) return 0; } -static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) -{ - return !buffer_mapped(bh); -} - /* * Note that we don't need to start a transaction unless we're journaling * data because we should have holes filled from ext4_page_mkwrite(). If diff --git a/fs/mpage.c b/fs/mpage.c index cde7f11..c107728 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -849,13 +849,12 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, do { if (cur_logical >= logical + blocks) break; - if (buffer_delay(bh)) { bh->b_blocknr = pblock; clear_buffer_delay(bh); - } else if (buffer_mapped(bh)) { + set_buffer_mapped(bh); + } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); - } cur_logical++; pblock++; @@ -930,10 +929,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) if (buffer_delay(lbh)) mpage_put_bnr_to_bhs(mpd, next, &new); - /* go for the remaining blocks */ - next += new.b_size >> mpd->inode->i_blkbits; - remain -= new.b_size; - } + /* go for the remaining blocks */ + next += new.b_size >> mpd->inode->i_blkbits; + remain -= new.b_size; + } } #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) @@ -1052,6 +1051,7 @@ static int __mpage_da_writepage(struct page *page, head = page_buffers(page); bh = head; do { + BUG_ON(buffer_locked(bh)); if (buffer_dirty(bh)) mpage_add_bh_to_extent(mpd, logical, bh); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 789b6ad..655b8bf 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -881,7 +881,12 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ - end = -1; + /* + * write only till the specified range_end even in cyclic mode + */ + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (!end) + end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; -- 1.5.5.1.357.g1af8b.dirty