2008-05-30 13:39:37

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -v2] delalloc and journal locking order inversion fixes

The below set of patches makes delalloc work with the journal locking
order inversion patches.

The series file looks like

+ ext4-new-defm-options
+ ext4-call-blkdev_issue_flush-on-fsync.patch
0001-ext4-Use-page_mkwrite-vma_operations-to-get-mmap-wr.patch
+ ext4_ialloc-flexbg.patch
0002-ext4-Inverse-locking-order-of-page_lock-and-transac.patch
0003-vfs-Move-mark_inode_dirty-from-under-page-lock-in.patch
0004-ext4-Add-validation-to-jbd-lock-inversion-patch-and.patch
+ delalloc-vfs.patch
+ ext4-fix-fs-corruption-with-delalloc.patch
+ delalloc-ext4.patch
+ delalloc-ext4-release-page-when-write_begin-failed.patch
+ delalloc-ext4-preallocation-handling.patch
...
...
.....
+ vfs-fiemap.patch
+ ext4-add-ext4_ext_walk_space.patch
+ ext4-fiemap.patch
0005-ext4-inverse-locking-ordering-of-page_lock-and-tra.patch
0006-ext4-Fix-delalloc-sync-hang-with-journal-lock-inver.patch

I have pushed the lock inversion patches and related changes to the
top of the patch queue expecting it can be pushed upstream before
delalloc changes.


-aneesh




2008-05-30 13:39:45

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Use page_mkwrite vma_operations to get mmap write notification.

We would like to get notified when we are doing a write on an mmap section.
This is needed with respect to the preallocated area. We split the preallocated
area into an initialized extent and an uninitialized extent in the callback.
This lets us handle ENOSPC better. Otherwise we get ENOSPC in the writepage and
that would result in data loss. The changes are also needed to handle ENOSPC
when writing to an mmap section of files with holes.

Acked-by: Jan Kara <[email protected]>
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Mingming Cao <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 1 +
fs/ext4/file.c | 19 +++++++++++++-
fs/ext4/inode.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 95 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6605076..77cbb28 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1053,6 +1053,7 @@ extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);

/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6..b9510ba 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
return ret;
}

+static struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -133,7 +150,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext4_file_mmap,
.open = generic_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4a7ed29..bc52ef5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3555,3 +3555,79 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)

return err;
}
+
+static int ext4_bh_prepare_fill(handle_t *handle, struct buffer_head *bh)
+{
+ if (!buffer_mapped(bh)) {
+ /*
+ * Mark buffer as dirty so that
+ * block_write_full_page() writes it
+ */
+ set_buffer_dirty(bh);
+ }
+ return 0;
+}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ loff_t size;
+ unsigned long len;
+ int ret = -EINVAL;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = 1 };
+
+ /*
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+ * get i_mutex because we are already holding mmap_sem.
+ */
+ down_read(&inode->i_alloc_sem);
+ size = i_size_read(inode);
+ if (page->mapping != mapping || size <= page_offset(page)
+ || !PageUptodate(page)) {
+ /* page got truncated from under us? */
+ goto out_unlock;
+ }
+ ret = 0;
+ if (PageMappedToDisk(page))
+ goto out_unlock;
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped))
+ goto out_unlock;
+ /*
+ * Now mark all the buffer head dirty so
+ * that writepage can write it
+ */
+ walk_page_buffers(NULL, page_buffers(page), 0, len,
+ NULL, ext4_bh_prepare_fill);
+ }
+ /*
+ * OK, we need to fill the hole... Lock the page and do writepage.
+ * We can't do write_begin and write_end here because we don't
+ * have inode_mutex and that allow parallel write_begin, write_end call.
+ * (lock_page prevent this from happening on the same page though)
+ */
+ lock_page(page);
+ wbc.range_start = page_offset(page);
+ wbc.range_end = page_offset(page) + len;
+ ret = mapping->a_ops->writepage(page, &wbc);
+ /* writepage unlocks the page */
+out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+}
--
1.5.5.1.357.g1af8b.dirty


2008-05-30 13:40:03

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Inverse locking order of page_lock and transaction start.

From: Jan Kara <[email protected]>

Signed-off-by: Jan Kara <[email protected]>
Signed-off-by: Mingming Cao <[email protected]>
Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/ext4.h | 4 +-
fs/ext4/extents.c | 15 +--
fs/ext4/inode.c | 274 ++++++++++++++++++++++++-----------------------------
3 files changed, 132 insertions(+), 161 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 66cdd5c..e7da2cb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1063,7 +1063,7 @@ extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);

@@ -1222,7 +1222,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c58ebd8..1a90a23 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2744,7 +2744,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
return err ? err : allocated;
}

-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
@@ -2757,18 +2757,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
*/
err = ext4_writepage_trans_blocks(inode) + 3;
handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return;
- }

- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);

down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bc52ef5..a96c325 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1239,19 +1239,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
to = from + len;

retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}

+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block);

@@ -1261,8 +1262,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
}

if (ret) {
- ext4_journal_stop(handle);
unlock_page(page);
+ ext4_journal_stop(handle);
page_cache_release(page);
}

@@ -1291,29 +1292,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}

/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
-}
-
-/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
@@ -1326,7 +1304,7 @@ static int ext4_ordered_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
unsigned from, to;
int ret = 0, ret2;

@@ -1347,7 +1325,7 @@ static int ext4_ordered_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1356,8 +1334,6 @@ static int ext4_ordered_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);

return ret ? ret : copied;
}
@@ -1368,7 +1344,7 @@ static int ext4_writeback_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
loff_t new_i_size;

@@ -1376,7 +1352,7 @@ static int ext4_writeback_write_end(struct file *file,
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;

- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1385,8 +1361,6 @@ static int ext4_writeback_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);

return ret ? ret : copied;
}
@@ -1425,10 +1399,10 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}

+ unlock_page(page);
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
page_cache_release(page);

return ret ? ret : copied;
@@ -1506,11 +1480,10 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
}

/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling
+ * data because we should have holes filled from ext4_page_mkwrite(). If
+ * we are journaling data, we cannot start transaction directly because
+ * transaction start ranks above page lock so we have to do some magic...
*
* In all journalling modes block_write_full_page() will start the I/O.
*
@@ -1554,10 +1527,8 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* disastrous. Any write() or metadata operation will sync the fs for
* us.
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_ordered_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
@@ -1566,22 +1537,6 @@ static int ext4_ordered_writepage(struct page *page,
int ret = 0;
int err;

- J_ASSERT(PageLocked(page));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext4_journal_current_handle())
- goto out_fail;
-
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
-
if (!page_has_buffers(page)) {
create_empty_buffers(page, inode->i_sb->s_blocksize,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1605,114 +1560,139 @@ static int ext4_ordered_writepage(struct page *page,
* and generally junk.
*/
if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ handle = ext4_journal_start(inode,
+ ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_put;
+ }
+
+ ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, jbd2_journal_dirty_data_fn);
+ err = ext4_journal_stop(handle);
if (!ret)
ret = err;
}
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
+out_put:
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bput_one);
return ret;
+}
+
+static int ext4_ordered_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ J_ASSERT(PageLocked(page));
+
+ /*
+ * We give up here if we're reentered, because it might be for a
+ * different filesystem.
+ */
+ if (!ext4_journal_current_handle())
+ return __ext4_ordered_writepage(page, wbc);

-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}

-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_writeback_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
+
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page, ext4_get_block, wbc);
+ else
+ return block_write_full_page(page, ext4_get_block, wbc);
+}
+
+
+static int ext4_writeback_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ if (!ext4_journal_current_handle())
+ return __ext4_writeback_writepage(page, wbc);
+
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+}
+
+static int __ext4_journalled_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;

- if (ext4_journal_current_handle())
- goto out_fail;
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
+ if (ret != 0)
+ goto out_unlock;
+
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bget_one);
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_fail;
+ goto out;
}

- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);

+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- return ret;

-out_fail:
- redirty_page_for_writepage(wbc, page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ goto out;
+
+out_unlock:
unlock_page(page);
+out:
return ret;
}

static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
if (ext4_journal_current_handle())
goto no_write;

- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
-
if (!page_has_buffers(page) || PageChecked(page)) {
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
+ return __ext4_journalled_writepage(page, wbc);
} else {
/*
* It may be a page full of checkpoint-mode buffers. We don't
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ return block_write_full_page(page, ext4_get_block, wbc);
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-
no_write:
redirty_page_for_writepage(wbc, page);
-out_unlock:
unlock_page(page);
- goto out;
+ return 0;
}

static int ext4_readpage(struct file *file, struct page *page)
@@ -1922,7 +1902,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1931,8 +1911,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;

+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return -EINVAL;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -2396,7 +2381,6 @@ void ext4_truncate(struct inode *inode)
int n;
ext4_lblk_t last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;

if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)))
@@ -2406,41 +2390,21 @@ void ext4_truncate(struct inode *inode)
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;

- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ ext4_ext_truncate(inode);
return;
}

handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return; /* AKPM: return what? */
- }

last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (blocksize - 1))
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ goto out_stop;

n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
@@ -3577,7 +3541,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
loff_t size;
unsigned long len;
- int ret = -EINVAL;
+ int err, ret = -EINVAL;
+ handle_t *handle;
struct file *file = vma->vm_file;
struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
@@ -3622,11 +3587,24 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
* have inode_mutex and that allow parallel write_begin, write_end call.
* (lock_page prevent this from happening on the same page though)
*/
+ handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_unlock;
+ }
lock_page(page);
wbc.range_start = page_offset(page);
wbc.range_end = page_offset(page) + len;
- ret = mapping->a_ops->writepage(page, &wbc);
- /* writepage unlocks the page */
+ if (ext4_should_writeback_data(inode))
+ ret = __ext4_writeback_writepage(page, &wbc);
+ else if (ext4_should_order_data(inode))
+ ret = __ext4_ordered_writepage(page, &wbc);
+ else
+ ret = __ext4_journalled_writepage(page, &wbc);
+ /* Page got unlocked in writepage */
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
out_unlock:
up_read(&inode->i_alloc_sem);
return ret;
--
1.5.5.1.357.g1af8b.dirty


2008-05-30 13:40:09

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] vfs: Move mark_inode_dirty() from under page lock in generic_write_end()

From: Jan Kara <[email protected]>

There's no need to call mark_inode_dirty() under page lock in
generic_write_end(). It unnecessarily makes hold time of page lock longer
and more importantly it forces locking order of page lock and transaction
start for journaling filesystems.

Signed-off-by: Jan Kara <[email protected]>
Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/buffer.c | 12 +++++++++++-
1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index a073f3f..24116f0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2061,6 +2061,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;

copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

@@ -2073,12 +2074,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}

unlock_page(page);
page_cache_release(page);

+ /*
+ * We don't mark inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
--
1.5.5.1.357.g1af8b.dirty


2008-05-30 13:40:21

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 181 +++++++++++++++++++++++++++++++++++++++++++++++--------
1 files changed, 156 insertions(+), 25 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a96c325..b122425 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1479,6 +1479,11 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
return 0;
}

+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ return (!buffer_mapped(bh) || buffer_delay(bh));
+}
+
/*
* Note that we don't need to start a transaction unless we're journaling
* data because we should have holes filled from ext4_page_mkwrite(). If
@@ -1531,18 +1536,26 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
static int __ext4_ordered_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
+ int ret = 0, err;
+ unsigned long len;
handle_t *handle = NULL;
- int ret = 0;
- int err;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);

- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- }
page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
+ __func__);
+ BUG();
+ }
+ walk_page_buffers(NULL, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bget_one);

ret = block_write_full_page(page, ext4_get_block, wbc);
@@ -1574,8 +1587,8 @@ static int __ext4_ordered_writepage(struct page *page,
ret = err;
}
out_put:
- walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
- bput_one);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
return ret;
}

@@ -1583,7 +1596,7 @@ static int ext4_ordered_writepage(struct page *page,
struct writeback_control *wbc)
{
J_ASSERT(PageLocked(page));
-
+ BUG_ON(!page_has_buffers(page));
/*
* We give up here if we're reentered, because it might be for a
* different filesystem.
@@ -1599,18 +1612,34 @@ static int ext4_ordered_writepage(struct page *page,
static int __ext4_writeback_writepage(struct page *page,
struct writeback_control *wbc)
{
+ unsigned long len;
+ struct buffer_head *page_bufs;
struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);
+
+ page_bufs = page_buffers(page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;

+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
+ __func__);
+ BUG();
+ }
if (test_opt(inode->i_sb, NOBH))
return nobh_writepage(page, ext4_get_block, wbc);
else
return block_write_full_page(page, ext4_get_block, wbc);
}

-
static int ext4_writeback_writepage(struct page *page,
struct writeback_control *wbc)
{
+ BUG_ON(!page_has_buffers(page));
+
if (!ext4_journal_current_handle())
return __ext4_writeback_writepage(page, wbc);

@@ -1622,18 +1651,31 @@ static int ext4_writeback_writepage(struct page *page,
static int __ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
+ int ret = 0, err;
+ unsigned long len;
+ handle_t *handle = NULL;
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ loff_t size = i_size_read(inode);
+
+ page_bufs = page_buffers(page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;

+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
+ __func__);
+ BUG();
+ }
+ /* FIXME!! do we need to call prepare_write for a mapped buffer */
ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
if (ret != 0)
goto out_unlock;

- page_bufs = page_buffers(page);
walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
bget_one);
/* As soon as we unlock the page, it can go away, but we have
@@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
+ BUG_ON(!page_has_buffers(page));
+
if (ext4_journal_current_handle())
goto no_write;

- if (!page_has_buffers(page) || PageChecked(page)) {
- /*
- * It's mmapped pagecache. Add buffers and journal it. There
- * doesn't seem much point in redirtying the page here.
- */
+ if (PageChecked(page)) {
+ /* dirty pages in data=journal mode */
ClearPageChecked(page);
return __ext4_journalled_writepage(page, wbc);
} else {
@@ -3520,6 +3561,96 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}

+static int __ext4_journalled_allocpage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0, err;
+ handle_t *handle = NULL;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
+
+ /* if alloc we are called after statring a journal */
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
+ if (ret != 0)
+ goto out_unlock;
+
+ /* FIXME!! should we do a bget_one */
+ page_bufs = page_buffers(page);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+
+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+
+out_unlock:
+ unlock_page(page);
+ return ret;
+}
+
+static int __ext4_ordered_allocpage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ handle_t *handle = NULL;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ /* if alloc we are called after statring a journal */
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ }
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bget_one);
+
+ ret = block_write_full_page(page, ext4_get_block, wbc);
+
+ /*
+ * The page can become unlocked at any point now, and
+ * truncate can then come in and change things. So we
+ * can't touch *page from now on. But *page_bufs is
+ * safe due to elevated refcount.
+ */
+
+ /*
+ * And attach them to the current transaction. But only if
+ * block_write_full_page() succeeded. Otherwise they are unmapped,
+ * and generally junk.
+ */
+ if (ret == 0) {
+ ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ NULL, jbd2_journal_dirty_data_fn);
+ }
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ return ret;
+}
+
+static int __ext4_writeback_allocpage(struct page *page,
+ struct writeback_control *wbc)
+{
+ handle_t *handle = NULL;
+ struct inode *inode = page->mapping->host;
+ /* if alloc we are called after statring a journal */
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page, ext4_get_block, wbc);
+ else
+ return block_write_full_page(page, ext4_get_block, wbc);
+}
+
static int ext4_bh_prepare_fill(handle_t *handle, struct buffer_head *bh)
{
if (!buffer_mapped(bh)) {
@@ -3596,11 +3727,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
wbc.range_start = page_offset(page);
wbc.range_end = page_offset(page) + len;
if (ext4_should_writeback_data(inode))
- ret = __ext4_writeback_writepage(page, &wbc);
+ ret = __ext4_writeback_allocpage(page, &wbc);
else if (ext4_should_order_data(inode))
- ret = __ext4_ordered_writepage(page, &wbc);
+ ret = __ext4_ordered_allocpage(page, &wbc);
else
- ret = __ext4_journalled_writepage(page, &wbc);
+ ret = __ext4_journalled_allocpage(page, &wbc);
/* Page got unlocked in writepage */
err = ext4_journal_stop(handle);
if (!ret)
--
1.5.5.1.357.g1af8b.dirty


2008-05-30 13:40:37

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: inverse locking ordering of page_lock and transaction start in delalloc

From: Mingming Cao <[email protected]>

Inverse locking ordering of page_lock and transaction start in delalloc

Signed-off-by: Mingming Cao <[email protected]>
Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 96 +++++++++++++++++++++++++++++++++++++++----------------
1 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 62b38b6..65e02a3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1448,18 +1448,14 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+ int ret;
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
loff_t disksize = EXT4_I(inode)->i_disksize;
handle_t *handle = NULL;

- if (create) {
- handle = ext4_journal_start(inode, needed_blocks);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- }
+ handle = ext4_journal_current_handle();
+ BUG_ON(handle == 0);
+ BUG_ON(create == 0);

ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
bh_result, create, 0);
@@ -1494,29 +1490,17 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
ret = 0;
}

-out:
- if (handle && !IS_ERR(handle))
- ext4_journal_stop(handle);
-
return ret;
}
/* FIXME!! only support data=writeback mode */
-static int ext4_da_writepage(struct page *page,
+static int __ext4_da_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
handle_t *handle = NULL;
int ret = 0;
- int err;
-
- if (ext4_journal_current_handle())
- goto out_fail;

- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
+ handle = ext4_journal_current_handle();

if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
ret = nobh_writepage(page, ext4_get_block, wbc);
@@ -1528,21 +1512,77 @@ static int ext4_da_writepage(struct page *page,
ext4_mark_inode_dirty(handle, inode);
}

- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
return ret;
+}
+static int ext4_da_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ if (!ext4_journal_current_handle())
+ return __ext4_da_writepage(page, wbc);

-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}

+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
+
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ unsigned range_cyclic;
+ long to_write;
+
+ /*
+ * Estimate the worst case needed credits to write out
+ * EXT4_MAX_BUF_BLOCKS pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ range_cyclic = wbc->range_cyclic;
+ wbc->range_cyclic = 1;
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ /*
+ * set the max dirty pages that can be written at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+ to_write -= wbc->nr_to_write;
+
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ to_write += wbc->nr_to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ wbc->range_cyclic = range_cyclic;
+ return ret;
}

static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
--
1.5.5.1.357.g1af8b.dirty


2008-05-30 13:40:55

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 99 ++++++++++++++++++++++++++++++++++----------------
fs/mpage.c | 14 ++++----
mm/page-writeback.c | 7 +++-
3 files changed, 80 insertions(+), 40 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 65e02a3..2194aa7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1480,50 +1480,73 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
up_write(&EXT4_I(inode)->i_data_sem);

if (EXT4_I(inode)->i_disksize == disksize) {
- if (handle == NULL)
- handle = ext4_journal_start(inode, 1);
- if (!IS_ERR(handle))
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
}
}
-
ret = 0;
}
-
return ret;
}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ return (!buffer_mapped(bh) || buffer_delay(bh));
+}
+
/* FIXME!! only support data=writeback mode */
-static int __ext4_da_writepage(struct page *page,
+/*
+ * get called via ext4_da_writepages after taking page lock
+ * We may end up doing block allocation here in case
+ * mpage_da_map_blocks failed to allocate blocks.
+ */
+static int ext4_da_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
int ret = 0;
+ loff_t size;
+ unsigned long len;
+ handle_t *handle = NULL;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;

handle = ext4_journal_current_handle();
+ if (!handle) {
+ /*
+ * This can happen when we aren't called via
+ * ext4_da_writepages() but directly (shrink_page_list).
+ * We cannot easily start a transaction here so we just skip
+ * writing the page in case we would have to do so.
+ */
+ size = i_size_read(inode);

- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ page_bufs = page_buffers(page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;

- if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
- EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_mark_inode_dirty(handle, inode);
+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ /*
+ * We can't do block allocation under
+ * page lock without a handle. So redirty
+ * the page and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
}

+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
+
return ret;
}
-static int ext4_da_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- if (!ext4_journal_current_handle())
- return __ext4_da_writepage(page, wbc);

- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
-}

/*
* For now just follow the DIO way to estimate the max credits
@@ -1547,6 +1570,7 @@ static int ext4_da_writepages(struct address_space *mapping,
int ret = 0;
unsigned range_cyclic;
long to_write;
+ pgoff_t index;

/*
 * Estimate the worst case needed credits to write out
@@ -1557,6 +1581,15 @@ static int ext4_da_writepages(struct address_space *mapping,
to_write = wbc->nr_to_write;
range_cyclic = wbc->range_cyclic;
wbc->range_cyclic = 1;
+ index = mapping->writeback_index;
+ if (!range_cyclic) {
+ /*
+ * We force cyclic writeout of pages. If the
+ * caller didn't request range_cyclic, set
+ * the writeback_index to what the caller requested.
+ */
+ mapping->writeback_index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ }

while (!ret && to_write) {
/* start a new transaction*/
@@ -1571,17 +1604,24 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
- to_write -= wbc->nr_to_write;

+ to_write -= wbc->nr_to_write;
ret = mpage_da_writepages(mapping, wbc,
ext4_da_get_block_write);
ext4_journal_stop(handle);
- to_write += wbc->nr_to_write;
+ if (wbc->nr_to_write) {
+ /* There is no more writeout needed */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
}

out_writepages:
wbc->nr_to_write = to_write;
wbc->range_cyclic = range_cyclic;
+ if (!range_cyclic)
+ mapping->writeback_index = index;
return ret;
}

@@ -1719,11 +1759,6 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
return 0;
}

-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
- return (!buffer_mapped(bh) || buffer_delay(bh));
-}
-
/*
* Note that we don't need to start a transaction unless we're journaling
* data because we should have holes filled from ext4_page_mkwrite(). If
diff --git a/fs/mpage.c b/fs/mpage.c
index cde7f11..c107728 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -849,13 +849,12 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
do {
if (cur_logical >= logical + blocks)
break;
-
if (buffer_delay(bh)) {
bh->b_blocknr = pblock;
clear_buffer_delay(bh);
- } else if (buffer_mapped(bh)) {
+ set_buffer_mapped(bh);
+ } else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
- }

cur_logical++;
pblock++;
@@ -930,10 +929,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
if (buffer_delay(lbh))
mpage_put_bnr_to_bhs(mpd, next, &new);

- /* go for the remaining blocks */
- next += new.b_size >> mpd->inode->i_blkbits;
- remain -= new.b_size;
- }
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
}

#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
@@ -1052,6 +1051,7 @@ static int __mpage_da_writepage(struct page *page,
head = page_buffers(page);
bh = head;
do {
+
BUG_ON(buffer_locked(bh));
if (buffer_dirty(bh))
mpage_add_bh_to_extent(mpd, logical, bh);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6ad..655b8bf 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -881,7 +881,12 @@ int write_cache_pages(struct address_space *mapping,
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
- end = -1;
+ /*
+ * write only till the specified range_end even in cyclic mode
+ */
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (!end)
+ end = -1;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
--
1.5.5.1.357.g1af8b.dirty


2008-05-30 17:51:09

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -v2] delalloc and journal locking order inversion fixes

On Fri, 2008-05-30 at 19:09 +0530, Aneesh Kumar K.V wrote:
> The below set of patches gets the delalloc work with journal locking
> order inversion patches.
>
> The series file look like
>
> + ext4-new-defm-options
> + ext4-call-blkdev_issue_flush-on-fsync.patch
> 0001-ext4-Use-page_mkwrite-vma_operations-to-get-mmap-wr.patch
> + ext4_ialloc-flexbg.patch
> 0002-ext4-Inverse-locking-order-of-page_lock-and-transac.patch
> 0003-vfs-Move-mark_inode_dirty-from-under-page-lock-in.patch
> 0004-ext4-Add-validation-to-jbd-lock-inversion-patch-and.patch
> + delalloc-vfs.patch
> + ext4-fix-fs-corruption-with-delalloc.patch
> + delalloc-ext4.patch
> + delalloc-ext4-release-page-when-write_begin-failed.patch
> + delalloc-ext4-preallocation-handling.patch
> ...
> ...
> .....
> + vfs-fiemap.patch
> + ext4-add-ext4_ext_walk_space.patch
> + ext4-fiemap.patch
> 0005-ext4-inverse-locking-ordering-of-page_lock-and-tra.patch
> 0006-ext4-Fix-delalloc-sync-hang-with-journal-lock-inver.patch
>
> I have pushed the lock inversion patches and related changes to the
> top of the patch queue expecting it can be pushed upstream before
> delalloc changes.
>
>
Thanks for fixing this up.

Reviewed-by: Mingming Cao <[email protected]>

Added the series to patch queue.

Mingming
> -aneesh
>
>


2008-06-01 21:10:10

by Mingming Cao

[permalink] [raw]
Subject: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

ext4: Need clear buffer_delay after page writeout for delayed allocation

From: Mingming Cao <[email protected]>

Need to clear buffer_delay in ext4_da_writepage() after the page has been written out

Signed-off-by: Mingming Cao <[email protected]>

---
fs/ext4/inode.c | 27 +++++++++++++++++----------
1 file changed, 17 insertions(+), 10 deletions(-)

Index: linux-2.6.26-rc4/fs/ext4/inode.c
===================================================================
--- linux-2.6.26-rc4.orig/fs/ext4/inode.c 2008-06-01 13:39:14.000000000 -0700
+++ linux-2.6.26-rc4/fs/ext4/inode.c 2008-06-01 13:40:00.000000000 -0700
@@ -1495,6 +1495,13 @@ static int ext4_bh_unmapped_or_delay(han
return (!buffer_mapped(bh) || buffer_delay(bh));
}

+static int ext4_bh_clear_delay(handle_t *handle, struct buffer_head *bh)
+{
+ clear_buffer_delay(bh);
+ return 1;
+}
+
+
/* FIXME!! only support data=writeback mode */
/*
* get called vi ext4_da_writepages after taking page lock
@@ -1508,10 +1515,16 @@ static int ext4_da_writepage(struct page
loff_t size;
unsigned long len;
handle_t *handle = NULL;
- struct buffer_head *page_bufs;
+ struct buffer_head *page_bufs = page_buffers(page);
struct inode *inode = page->mapping->host;

+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
handle = ext4_journal_current_handle();
+
if (!handle) {
/*
* This can happen when we aren't called via
@@ -1519,14 +1532,6 @@ static int ext4_da_writepage(struct page
* We cannot easily start a transaction here so we just skip
* writing the page in case we would have to do so.
*/
- size = i_size_read(inode);
-
- page_bufs = page_buffers(page);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
-
if (walk_page_buffers(NULL, page_bufs, 0,
len, NULL, ext4_bh_unmapped_or_delay)) {
/*
@@ -1544,7 +1549,9 @@ static int ext4_da_writepage(struct page
ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
else
ret = block_write_full_page(page, ext4_da_get_block_write, wbc);

2008-06-02 03:14:49

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Sun, Jun 01, 2008 at 02:10:02PM -0700, Mingming Cao wrote:
> ext4: Need clear buffer_delay after page writeout for delayed allocation
>
> From: Mingming Cao <[email protected]>
>
> Need clear buffer_delay in ext4_da_writepage() after page has been writeout
>
> Signed-off-by: Mingming Cao <[email protected]>
>
> ---

We do that in mpage_put_bnr_to_bhs.


-aneesh

2008-06-02 03:50:56

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Mon, 2008-06-02 at 08:44 +0530, Aneesh Kumar K.V wrote:
> On Sun, Jun 01, 2008 at 02:10:02PM -0700, Mingming Cao wrote:
> > ext4: Need clear buffer_delay after page writeout for delayed allocation
> >
> > From: Mingming Cao <[email protected]>
> >
> > Need clear buffer_delay in ext4_da_writepage() after page has been writeout
> >
> > Signed-off-by: Mingming Cao <[email protected]>
> >
> > ---
>
> We do that in mpage_put_bnr_to_bhs.
>
Normally the delayed buffer would be cleared in that case, but if allocation
failed in __mpage_da_writepages(), the buffer stays marked buffer_delay,
deferring block allocation to a later ext4_da_writepage(). This
patch handles clearing the bh delay bit in that case.

Mingming


2008-06-02 04:10:12

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Sun, Jun 01, 2008 at 08:50:32PM -0700, Mingming Cao wrote:
> On Mon, 2008-06-02 at 08:44 +0530, Aneesh Kumar K.V wrote:
> > On Sun, Jun 01, 2008 at 02:10:02PM -0700, Mingming Cao wrote:
> > > ext4: Need clear buffer_delay after page writeout for delayed allocation
> > >
> > > From: Mingming Cao <[email protected]>
> > >
> > > Need clear buffer_delay in ext4_da_writepage() after page has been writeout
> > >
> > > Signed-off-by: Mingming Cao <[email protected]>
> > >
> > > ---
> >
> > We do that in mpage_put_bnr_to_bhs.
> >
> Normally delayed buffer could be cleared in that case, but if allocation
> failed in __mapge_da_writepages(), it will keep buffer_delay marked and
> deferring to later ext4_da_writepage() to do block allocation. This
> patch handles clear bh delay bit in this case.
>

Why not do it in ext4_da_get_block_write then. The reason being
block_write_full_page can return an error even though we have some of
the blocks allocated. With 1K block size we could allocate 3 blocks and
fail for the last block in that case with the above patch we don't
clear the delay bit of the buffer head of all the blocks allocated.

-aneesh

2008-06-02 05:38:42

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Mon, 2008-06-02 at 09:39 +0530, Aneesh Kumar K.V wrote:
> On Sun, Jun 01, 2008 at 08:50:32PM -0700, Mingming Cao wrote:
> > On Mon, 2008-06-02 at 08:44 +0530, Aneesh Kumar K.V wrote:
> > > On Sun, Jun 01, 2008 at 02:10:02PM -0700, Mingming Cao wrote:
> > > > ext4: Need clear buffer_delay after page writeout for delayed allocation
> > > >
> > > > From: Mingming Cao <[email protected]>
> > > >
> > > > Need clear buffer_delay in ext4_da_writepage() after page has been writeout
> > > >
> > > > Signed-off-by: Mingming Cao <[email protected]>
> > > >
> > > > ---
> > >
> > > We do that in mpage_put_bnr_to_bhs.
> > >
> > Normally delayed buffer could be cleared in that case, but if allocation
> > failed in __mapge_da_writepages(), it will keep buffer_delay marked and
> > deferring to later ext4_da_writepage() to do block allocation. This
> > patch handles clear bh delay bit in this case.
> >
>
> Why not do it in ext4_da_get_block_write then.

The buffer head passed to ext4_da_get_block_write() calling from
mpage_da_map_blocks is a dummy one, to store the allocated extent, not
the bh that need map.

> The reason being
> block_write_full_page can return an error even though we have some of
> the blocks allocated. With 1K block size we could allocate 3 blocks and
> fail for the last block in that case with the above patch we don't
> clear the delay bit of the buffer head of all the blocks allocated.
>
It only clears the delayed bits if block_write_full_page() returns
successfully. But that reminds me to do some block reservation handling
for the error case.

> -aneesh
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html


2008-06-02 06:35:39

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Sun, Jun 01, 2008 at 10:38:35PM -0700, Mingming Cao wrote:
> On Mon, 2008-06-02 at 09:39 +0530, Aneesh Kumar K.V wrote:
> > On Sun, Jun 01, 2008 at 08:50:32PM -0700, Mingming Cao wrote:
> > > On Mon, 2008-06-02 at 08:44 +0530, Aneesh Kumar K.V wrote:
> > > > On Sun, Jun 01, 2008 at 02:10:02PM -0700, Mingming Cao wrote:
> > > > > ext4: Need clear buffer_delay after page writeout for delayed allocation
> > > > >
> > > > > From: Mingming Cao <[email protected]>
> > > > >
> > > > > Need clear buffer_delay in ext4_da_writepage() after page has been writeout
> > > > >
> > > > > Signed-off-by: Mingming Cao <[email protected]>
> > > > >
> > > > > ---
> > > >
> > > > We do that in mpage_put_bnr_to_bhs.
> > > >
> > > Normally delayed buffer could be cleared in that case, but if allocation
> > > failed in __mapge_da_writepages(), it will keep buffer_delay marked and
> > > deferring to later ext4_da_writepage() to do block allocation. This
> > > patch handles clear bh delay bit in this case.
> > >
> >
> > Why not do it in ext4_da_get_block_write then.
>
> The buffer head passed to ext4_da_get_block_write() calling from
> mpage_da_map_blocks is a dummy one, to store the allocated extent, not
> the bh that need map.


That is true when ext4_da_get_block_write is called via writepages. In
that case mpage_put_bnr_to_bhs clears the delay bit properly. How about
the changes below?


diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 26f3626..9d25255 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2873,6 +2873,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
__set_bit(BH_Mapped, &bh_result->b_state);
+ __clear_bit(BH_Delay, &bh_result->b_state);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2194aa7..2a2f832 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -917,6 +917,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
goto cleanup;

set_buffer_new(bh_result);
+ clear_buffer_delay(bh_result);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
diff --git a/fs/mpage.c b/fs/mpage.c
index c107728..0d05d7a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -897,6 +897,14 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
int err = 0, remain = lbh->b_size;
sector_t next = lbh->b_blocknr;
struct buffer_head new;
+ int delay_buf = 0;
+
+ /*
+ * We clear the delay bit in get_block so remember
+ * whether we are doing a delay allocation
+ */
+ if (buffer_delay(lbh))
+ delay_buf = 1;

/*
* We consider only non-mapped and non-allocated blocks
@@ -926,7 +934,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
* If blocks are delayed marked, we need to
* put actual blocknr and drop delayed bit
*/
- if (buffer_delay(lbh))
+ if (delay_buf)
mpage_put_bnr_to_bhs(mpd, next, &new);

/* go for the remaining blocks */


2008-06-02 07:04:14

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Mon, 2008-06-02 at 12:05 +0530, Aneesh Kumar K.V wrote:
> On Sun, Jun 01, 2008 at 10:38:35PM -0700, Mingming Cao wrote:
> > On Mon, 2008-06-02 at 09:39 +0530, Aneesh Kumar K.V wrote:
> > > On Sun, Jun 01, 2008 at 08:50:32PM -0700, Mingming Cao wrote:
> > > > On Mon, 2008-06-02 at 08:44 +0530, Aneesh Kumar K.V wrote:
> > > > > On Sun, Jun 01, 2008 at 02:10:02PM -0700, Mingming Cao wrote:
> > > > > > ext4: Need clear buffer_delay after page writeout for delayed allocation
> > > > > >
> > > > > > From: Mingming Cao <[email protected]>
> > > > > >
> > > > > > Need clear buffer_delay in ext4_da_writepage() after page has been writeout
> > > > > >
> > > > > > Signed-off-by: Mingming Cao <[email protected]>
> > > > > >
> > > > > > ---
> > > > >
> > > > > We do that in mpage_put_bnr_to_bhs.
> > > > >
> > > > Normally delayed buffer could be cleared in that case, but if allocation
> > > > failed in __mapge_da_writepages(), it will keep buffer_delay marked and
> > > > deferring to later ext4_da_writepage() to do block allocation. This
> > > > patch handles clear bh delay bit in this case.
> > > >
> > >
> > > Why not do it in ext4_da_get_block_write then.
> >
> > The buffer head passed to ext4_da_get_block_write() calling from
> > mpage_da_map_blocks is a dummy one, to store the allocated extent, not
> > the bh that need map.
>
>
> ie true when ext4_da_get_block_write is called via writepages. In
> that case mpage_put_bnr_to_bhs clears the delay bit properly. How about
> the changes below.
>
I see your patch below is trying to address how to detect and assign
blocks with your suggestion (i.e. clearing the delayed bit in get_block). But I
don't think it's needed.

In my last email I meant that the buffer head "new" in mpage_da_map_blocks() is a
dummy bh; the real buffer head lbh is not passed to get_block. We could
clear the delayed bit on successful return of get_block, and
mpage_put_bnr_to_bhs() ignores that dummy bh anyway. But that seems
twisted and unnecessary.

I still think clearing the bit in ext4_da_write_page() is the cleaner
way. The original patch clears the delayed bit in the success case.

For the error case I think we could handle properly by only clear the
delayed bit if buffer is mapped.

Mingming
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 26f3626..9d25255 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2873,6 +2873,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
> allocated = max_blocks;
> ext4_ext_show_leaf(inode, path);
> __set_bit(BH_Mapped, &bh_result->b_state);
> + __clear_bit(BH_Delay, &bh_result->b_state);
> bh_result->b_bdev = inode->i_sb->s_bdev;
> bh_result->b_blocknr = newblock;
> out2:
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 2194aa7..2a2f832 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -917,6 +917,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
> goto cleanup;
>
> set_buffer_new(bh_result);
> + clear_buffer_delay(bh_result);
> got_it:
> map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
> if (count > blocks_to_boundary)
> diff --git a/fs/mpage.c b/fs/mpage.c
> index c107728..0d05d7a 100644
> --- a/fs/mpage.c
> +++ b/fs/mpage.c
> @@ -897,6 +897,14 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
> int err = 0, remain = lbh->b_size;
> sector_t next = lbh->b_blocknr;
> struct buffer_head new;
> + int delay_buf = 0;
> +
> + /*
> + * We clear the delay bit in get_block so remember
> + * whether we are doing a delay allocation
> + */
> + if (buffer_delay(lbh))
> + delay_buf = 1;
>
> /*
> * We consider only non-mapped and non-allocated blocks
> @@ -926,7 +934,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
> * If blocks are delayed marked, we need to
> * put actual blocknr and drop delayed bit
> */
> - if (buffer_delay(lbh))
> + if (delay_buf)
> mpage_put_bnr_to_bhs(mpd, next, &new);
>
> /* go for the remaining blocks */
>


2008-06-02 08:05:49

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Mon, Jun 02, 2008 at 12:04:07AM -0700, Mingming Cao wrote:
> On Mon, 2008-06-02 at 12:05 +0530, Aneesh Kumar K.V wrote:
> > On Sun, Jun 01, 2008 at 10:38:35PM -0700, Mingming Cao wrote:
> > > On Mon, 2008-06-02 at 09:39 +0530, Aneesh Kumar K.V wrote:
> > > > On Sun, Jun 01, 2008 at 08:50:32PM -0700, Mingming Cao wrote:
> > > > > On Mon, 2008-06-02 at 08:44 +0530, Aneesh Kumar K.V wrote:
> > > > > > On Sun, Jun 01, 2008 at 02:10:02PM -0700, Mingming Cao wrote:
> > > > > > > ext4: Need clear buffer_delay after page writeout for delayed allocation
> > > > > > >
> > > > > > > From: Mingming Cao <[email protected]>
> > > > > > >
> > > > > > > Need clear buffer_delay in ext4_da_writepage() after page has been writeout
> > > > > > >
> > > > > > > Signed-off-by: Mingming Cao <[email protected]>
> > > > > > >
> > > > > > > ---
> > > > > >
> > > > > > We do that in mpage_put_bnr_to_bhs.
> > > > > >
> > > > > Normally delayed buffer could be cleared in that case, but if allocation
> > > > > failed in __mapge_da_writepages(), it will keep buffer_delay marked and
> > > > > deferring to later ext4_da_writepage() to do block allocation. This
> > > > > patch handles clear bh delay bit in this case.
> > > > >
> > > >
> > > > Why not do it in ext4_da_get_block_write then.
> > >
> > > The buffer head passed to ext4_da_get_block_write() calling from
> > > mpage_da_map_blocks is a dummy one, to store the allocated extent, not
> > > the bh that need map.
> >
> >
> > ie true when ext4_da_get_block_write is called via writepages. In
> > that case mpage_put_bnr_to_bhs clears the delay bit properly. How about
> > the changes below.
> >
> I see your patch below is trying to address how to detect and assign
> blocks with your suggestion(i.e clear delayed bit in get_block). But I
> don;t think it's needed.
>
> My last email I mean the buffer head new in mpage_da_map_blocks() is a
> dummy bh, the real buffer head lbh is not passed to get_block. We could
> clear the delayed bit on successful return of get_block,
> mpage_put_bnr_to_bhs() ignore that dummy bh anyway. But that seems
> twisted, unccessary.
>
> I still think clear the bit in the ext4_da_write_page() is more clean
> way. the original patch clears the delayed bit on success case.
>
> For the error case I think we could handle properly by only clear the
> delayed bit if buffer is mapped.


Buffer marked as delay is also mapped
fs/ext4/inode.c

1437 map_bh(bh_result, inode->i_sb, 0);
1438 set_buffer_new(bh_result);
1439 set_buffer_delay(bh_result);



>
> Mingming
> >
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index 26f3626..9d25255 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> > @@ -2873,6 +2873,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
> > allocated = max_blocks;
> > ext4_ext_show_leaf(inode, path);
> > __set_bit(BH_Mapped, &bh_result->b_state);
> > + __clear_bit(BH_Delay, &bh_result->b_state);
> > bh_result->b_bdev = inode->i_sb->s_bdev;
> > bh_result->b_blocknr = newblock;
> > out2:
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 2194aa7..2a2f832 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -917,6 +917,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
> > goto cleanup;
> >
> > set_buffer_new(bh_result);
> > + clear_buffer_delay(bh_result);
> > got_it:
> > map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
> > if (count > blocks_to_boundary)


I think we need only the above change, since we are using the buffer_head "new"
for block requests in writepages. The below change can be dropped. It
handles all the error cases.



> > diff --git a/fs/mpage.c b/fs/mpage.c
> > index c107728..0d05d7a 100644
> > --- a/fs/mpage.c
> > +++ b/fs/mpage.c
> > @@ -897,6 +897,14 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
> > int err = 0, remain = lbh->b_size;
> > sector_t next = lbh->b_blocknr;
> > struct buffer_head new;
> > + int delay_buf = 0;
> > +
> > + /*
> > + * We clear the delay bit in get_block so remember
> > + * whether we are doing a delay allocation
> > + */
> > + if (buffer_delay(lbh))
> > + delay_buf = 1;
> >
> > /*
> > * We consider only non-mapped and non-allocated blocks
> > @@ -926,7 +934,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
> > * If blocks are delayed marked, we need to
> > * put actual blocknr and drop delayed bit
> > */
> > - if (buffer_delay(lbh))
> > + if (delay_buf)
> > mpage_put_bnr_to_bhs(mpd, next, &new);
> >
> > /* go for the remaining blocks */
> >
>

-aneesh

2008-06-02 09:31:09

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage

Hi Aneesh,

Thanks for the patch but I though we decided to do this a bit differently -
see below.

On Fri 30-05-08 19:09:27, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 181 +++++++++++++++++++++++++++++++++++++++++++++++--------
> 1 files changed, 156 insertions(+), 25 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index a96c325..b122425 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1479,6 +1479,11 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> return 0;
> }
>
> +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> +{
> + return (!buffer_mapped(bh) || buffer_delay(bh));
> +}
> +
> /*
> * Note that we don't need to start a transaction unless we're journaling
> * data because we should have holes filled from ext4_page_mkwrite(). If
> @@ -1531,18 +1536,26 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> static int __ext4_ordered_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> - struct inode *inode = page->mapping->host;
> - struct buffer_head *page_bufs;
> + int ret = 0, err;
> + unsigned long len;
> handle_t *handle = NULL;
> - int ret = 0;
> - int err;
> + struct buffer_head *page_bufs;
> + struct inode *inode = page->mapping->host;
> + loff_t size = i_size_read(inode);
>
> - if (!page_has_buffers(page)) {
> - create_empty_buffers(page, inode->i_sb->s_blocksize,
> - (1 << BH_Dirty)|(1 << BH_Uptodate));
> - }
This is OK.

> page_bufs = page_buffers(page);
> - walk_page_buffers(handle, page_bufs, 0,
> + if (page->index == size >> PAGE_CACHE_SHIFT)
> + len = size & ~PAGE_CACHE_MASK;
> + else
> + len = PAGE_CACHE_SIZE;
> +
> + if (walk_page_buffers(NULL, page_bufs, 0,
> + len, NULL, ext4_bh_unmapped_or_delay)) {
> + printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> + __func__);
> + BUG();
> + }
But I'd move this check to ext4_ordered_writepage().

> + walk_page_buffers(NULL, page_bufs, 0,
> PAGE_CACHE_SIZE, NULL, bget_one);
>
> ret = block_write_full_page(page, ext4_get_block, wbc);
> @@ -1574,8 +1587,8 @@ static int __ext4_ordered_writepage(struct page *page,
> ret = err;
> }
> out_put:
> - walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
> - bput_one);
> + walk_page_buffers(handle, page_bufs, 0,
> + PAGE_CACHE_SIZE, NULL, bput_one);
> return ret;
> }
>
> @@ -1583,7 +1596,7 @@ static int ext4_ordered_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> J_ASSERT(PageLocked(page));
> -
> + BUG_ON(!page_has_buffers(page));
> /*
> * We give up here if we're reentered, because it might be for a
> * different filesystem.
> @@ -1599,18 +1612,34 @@ static int ext4_ordered_writepage(struct page *page,
> static int __ext4_writeback_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> + unsigned long len;
> + struct buffer_head *page_bufs;
> struct inode *inode = page->mapping->host;
> + loff_t size = i_size_read(inode);
> +
> + page_bufs = page_buffers(page);
> + if (page->index == size >> PAGE_CACHE_SHIFT)
> + len = size & ~PAGE_CACHE_MASK;
> + else
> + len = PAGE_CACHE_SIZE;
>
> + if (walk_page_buffers(NULL, page_bufs, 0,
> + len, NULL, ext4_bh_unmapped_or_delay)) {
> + printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> + __func__);
> + BUG();
> + }
And similarly move this check to ext4_writeback_writepage().

> if (test_opt(inode->i_sb, NOBH))
> return nobh_writepage(page, ext4_get_block, wbc);
> else
> return block_write_full_page(page, ext4_get_block, wbc);
> }
>
> -
> static int ext4_writeback_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> + BUG_ON(!page_has_buffers(page));
> +
> if (!ext4_journal_current_handle())
> return __ext4_writeback_writepage(page, wbc);
>
> @@ -1622,18 +1651,31 @@ static int ext4_writeback_writepage(struct page *page,
> static int __ext4_journalled_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> + int ret = 0, err;
> + unsigned long len;
> + handle_t *handle = NULL;
> struct address_space *mapping = page->mapping;
> struct inode *inode = mapping->host;
> struct buffer_head *page_bufs;
> - handle_t *handle = NULL;
> - int ret = 0;
> - int err;
> + loff_t size = i_size_read(inode);
> +
> + page_bufs = page_buffers(page);
> + if (page->index == size >> PAGE_CACHE_SHIFT)
> + len = size & ~PAGE_CACHE_MASK;
> + else
> + len = PAGE_CACHE_SIZE;
>
> + if (walk_page_buffers(NULL, page_bufs, 0,
> + len, NULL, ext4_bh_unmapped_or_delay)) {
> + printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> + __func__);
> + BUG();
> + }
> + /* FIXME!! do we need to call prepare_write for a mapped buffer */
This can go to ext4_journalled_writepage(). What is actually this FIXME
about? I'm not sure I get it...

> ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> if (ret != 0)
> goto out_unlock;
>
> - page_bufs = page_buffers(page);
> walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
> bget_one);
> /* As soon as we unlock the page, it can go away, but we have
> @@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
> static int ext4_journalled_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> + BUG_ON(!page_has_buffers(page));
> +
> if (ext4_journal_current_handle())
> goto no_write;
>
> - if (!page_has_buffers(page) || PageChecked(page)) {
> - /*
> - * It's mmapped pagecache. Add buffers and journal it. There
> - * doesn't seem much point in redirtying the page here.
> - */
> + if (PageChecked(page)) {
> + /* dirty pages in data=journal mode */
> ClearPageChecked(page);
> return __ext4_journalled_writepage(page, wbc);
> } else {
> @@ -3520,6 +3561,96 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> return err;
> }
>
> +static int __ext4_journalled_allocpage(struct page *page,
> + struct writeback_control *wbc)
> +{
> + int ret = 0, err;
> + handle_t *handle = NULL;
> + struct address_space *mapping = page->mapping;
> + struct inode *inode = mapping->host;
> + struct buffer_head *page_bufs;
> +
> + /* if alloc we are called after statring a journal */
> + handle = ext4_journal_current_handle();
> + BUG_ON(!handle);
> +
> + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> + if (ret != 0)
> + goto out_unlock;
> +
> + /* FIXME!! should we do a bget_one */
> + page_bufs = page_buffers(page);
> + ret = walk_page_buffers(handle, page_bufs, 0,
> + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
> +
> + err = walk_page_buffers(handle, page_bufs, 0,
> + PAGE_CACHE_SIZE, NULL, write_end_fn);
> + if (ret == 0)
> + ret = err;
> + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
> +
> +out_unlock:
> + unlock_page(page);
> + return ret;
> +}
> +
> +static int __ext4_ordered_allocpage(struct page *page,
> + struct writeback_control *wbc)
> +{
> + int ret = 0;
> + handle_t *handle = NULL;
> + struct buffer_head *page_bufs;
> + struct inode *inode = page->mapping->host;
> +
> + /* if alloc we are called after statring a journal */
> + handle = ext4_journal_current_handle();
> + BUG_ON(!handle);
> + if (!page_has_buffers(page)) {
> + create_empty_buffers(page, inode->i_sb->s_blocksize,
> + (1 << BH_Dirty)|(1 << BH_Uptodate));
> + }
> + page_bufs = page_buffers(page);
> + walk_page_buffers(handle, page_bufs, 0,
> + PAGE_CACHE_SIZE, NULL, bget_one);
> +
> + ret = block_write_full_page(page, ext4_get_block, wbc);
> +
> + /*
> + * The page can become unlocked at any point now, and
> + * truncate can then come in and change things. So we
> + * can't touch *page from now on. But *page_bufs is
> + * safe due to elevated refcount.
> + */
> +
> + /*
> + * And attach them to the current transaction. But only if
> + * block_write_full_page() succeeded. Otherwise they are unmapped,
> + * and generally junk.
> + */
> + if (ret == 0) {
> + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
> + NULL, jbd2_journal_dirty_data_fn);
> + }
> + walk_page_buffers(handle, page_bufs, 0,
> + PAGE_CACHE_SIZE, NULL, bput_one);
> + return ret;
> +}
> +
> +static int __ext4_writeback_allocpage(struct page *page,
> + struct writeback_control *wbc)
> +{
> + handle_t *handle = NULL;
> + struct inode *inode = page->mapping->host;
> + /* if alloc we are called after statring a journal */
> + handle = ext4_journal_current_handle();
> + BUG_ON(!handle);
> +
> + if (test_opt(inode->i_sb, NOBH))
> + return nobh_writepage(page, ext4_get_block, wbc);
> + else
> + return block_write_full_page(page, ext4_get_block, wbc);
> +}
> +
And then you don't need these __ext4_..._allocpage() calls because that
is what's left in __ext4_..._writepage(), isn't it? It seems also logically
more consistent - you do checks in ext4_..._writepage() and then you do the
real work in __ext4_..._writepage().
For data=journaled mode, it may be better to really have
ext4_journaled_allocpage() because we don't have to do nasty locking
tricks. But for writeback and ordered mode I really see no need for these
special functions. So at least for these two, I'd change the code as I
suggest.

> static int ext4_bh_prepare_fill(handle_t *handle, struct buffer_head *bh)
> {
> if (!buffer_mapped(bh)) {
> @@ -3596,11 +3727,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> wbc.range_start = page_offset(page);
> wbc.range_end = page_offset(page) + len;
> if (ext4_should_writeback_data(inode))
> - ret = __ext4_writeback_writepage(page, &wbc);
> + ret = __ext4_writeback_allocpage(page, &wbc);
> else if (ext4_should_order_data(inode))
> - ret = __ext4_ordered_writepage(page, &wbc);
> + ret = __ext4_ordered_allocpage(page, &wbc);
> else
> - ret = __ext4_journalled_writepage(page, &wbc);
> + ret = __ext4_journalled_allocpage(page, &wbc);
> /* Page got unlocked in writepage */
> err = ext4_journal_stop(handle);
> if (!ret)
> --
> 1.5.5.1.357.g1af8b.dirty

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-02 09:35:01

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

> @@ -1052,6 +1051,7 @@ static int __mpage_da_writepage(struct page *page,
> head = page_buffers(page);
> bh = head;
> do {
> +
I guess this line is a typo.

> BUG_ON(buffer_locked(bh));
> if (buffer_dirty(bh))
> mpage_add_bh_to_extent(mpd, logical, bh);
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 789b6ad..655b8bf 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -881,7 +881,12 @@ int write_cache_pages(struct address_space *mapping,
> pagevec_init(&pvec, 0);
> if (wbc->range_cyclic) {
> index = mapping->writeback_index; /* Start from prev offset */
> - end = -1;
> + /*
> + * write only till the specified range_end even in cyclic mode
> + */
> + end = wbc->range_end >> PAGE_CACHE_SHIFT;
> + if (!end)
> + end = -1;
> } else {
> index = wbc->range_start >> PAGE_CACHE_SHIFT;
> end = wbc->range_end >> PAGE_CACHE_SHIFT;
Are you sure you won't break other users of range_cyclic with this
change?

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-02 09:52:47

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage

Hi Jan,


On Mon, Jun 02, 2008 at 11:31:06AM +0200, Jan Kara wrote:
> Hi Aneesh,
>
> Thanks for the patch but I though we decided to do this a bit differently -
> see below.

Please feel free to merge the changes back into the lock inversion
patches. I sent them as separate patches to make them easier to review.

>
> On Fri 30-05-08 19:09:27, Aneesh Kumar K.V wrote:
> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > ---
> > fs/ext4/inode.c | 181 +++++++++++++++++++++++++++++++++++++++++++++++--------
> > 1 files changed, 156 insertions(+), 25 deletions(-)
> >
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index a96c325..b122425 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -1479,6 +1479,11 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> > return 0;
> > }
> >
> > +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> > +{
> > + return (!buffer_mapped(bh) || buffer_delay(bh));
> > +}
> > +
> > /*
> > * Note that we don't need to start a transaction unless we're journaling
> > * data because we should have holes filled from ext4_page_mkwrite(). If
> > @@ -1531,18 +1536,26 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> > static int __ext4_ordered_writepage(struct page *page,
> > struct writeback_control *wbc)
> > {
> > - struct inode *inode = page->mapping->host;
> > - struct buffer_head *page_bufs;
> > + int ret = 0, err;
> > + unsigned long len;
> > handle_t *handle = NULL;
> > - int ret = 0;
> > - int err;
> > + struct buffer_head *page_bufs;
> > + struct inode *inode = page->mapping->host;
> > + loff_t size = i_size_read(inode);
> >
> > - if (!page_has_buffers(page)) {
> > - create_empty_buffers(page, inode->i_sb->s_blocksize,
> > - (1 << BH_Dirty)|(1 << BH_Uptodate));
> > - }
> This is OK.
>
> > page_bufs = page_buffers(page);
> > - walk_page_buffers(handle, page_bufs, 0,
> > + if (page->index == size >> PAGE_CACHE_SHIFT)
> > + len = size & ~PAGE_CACHE_MASK;
> > + else
> > + len = PAGE_CACHE_SIZE;
> > +
> > + if (walk_page_buffers(NULL, page_bufs, 0,
> > + len, NULL, ext4_bh_unmapped_or_delay)) {
> > + printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> > + __func__);
> > + BUG();
> > + }
> But I'd move this check to ext4_ordered_writepage().
>

Please feel free to do so. The only reason for me to do it as a separate
function is to clarify that, with these changes, writepage doesn't do any
block allocation at all. And calling writepage via page_mkwrite goes
against that. So I renamed the function called from page_mkwrite to *_allocpage
and made sure writepage doesn't do any block allocation.



> > + walk_page_buffers(NULL, page_bufs, 0,
> > PAGE_CACHE_SIZE, NULL, bget_one);
> >

[.... snip ..... ]

> >
> > + if (walk_page_buffers(NULL, page_bufs, 0,
> > + len, NULL, ext4_bh_unmapped_or_delay)) {
> > + printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> > + __func__);
> > + BUG();
> > + }
> > + /* FIXME!! do we need to call prepare_write for a mapped buffer */
> This can go to ext4_journalled_writepage(). What is actually this FIXME
> about? I'm not sure I get it...
>


I was wondering whether we need to call prepare_write in writepage? We
are not allocating any new blocks in writepage with these changes.


> > ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> > if (ret != 0)
> > goto out_unlock;
> >
> > - page_bufs = page_buffers(page);
> > walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
> > bget_one);
> > /* As soon as we unlock the page, it can go away, but we have
> > @@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
> > static int ext4_journalled_writepage(struct page *page,
> > struct writeback_control *wbc)
> > {
> > + BUG_ON(!page_has_buffers(page));
> > +
> > if (ext4_journal_current_handle())
> > goto no_write;
> >
> > - if (!page_has_buffers(page) || PageChecked(page)) {
> > - /*
> > - * It's mmapped pagecache. Add buffers and journal it. There
> > - * doesn't seem much point in redirtying the page here.
> > - */
> > + if (PageChecked(page)) {
> > + /* dirty pages in data=journal mode */
> > ClearPageChecked(page);
> > return __ext4_journalled_writepage(page, wbc);
> > } else {
> > @@ -3520,6 +3561,96 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> > return err;
> > }
> >
> > +static int __ext4_journalled_allocpage(struct page *page,
> > + struct writeback_control *wbc)
> > +{
> > + int ret = 0, err;
> > + handle_t *handle = NULL;
> > + struct address_space *mapping = page->mapping;
> > + struct inode *inode = mapping->host;
> > + struct buffer_head *page_bufs;
> > +
> > + /* if alloc we are called after statring a journal */
> > + handle = ext4_journal_current_handle();
> > + BUG_ON(!handle);
> > +
> > + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> > + if (ret != 0)
> > + goto out_unlock;
> > +
> > + /* FIXME!! should we do a bget_one */
> > + page_bufs = page_buffers(page);
> > + ret = walk_page_buffers(handle, page_bufs, 0,
> > + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
> > +


I also have a FIXME here. I am not sure whether unlocking the page have
some effect. Can you verify this ?



> > + err = walk_page_buffers(handle, page_bufs, 0,
> > + PAGE_CACHE_SIZE, NULL, write_end_fn);
> > + if (ret == 0)
> > + ret = err;
> > + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
> > +
> > +out_unlock:
> > + unlock_page(page);
> > + return ret;
> > +}
> > +


-aneesh

2008-06-02 10:00:21

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

On Mon, Jun 02, 2008 at 11:35:00AM +0200, Jan Kara wrote:
> > @@ -1052,6 +1051,7 @@ static int __mpage_da_writepage(struct page *page,
> > head = page_buffers(page);
> > bh = head;
> > do {
> > +
> I guess this line is a typo.
>

Yes, mostly some debug lines I removed — but I missed the newline that was added.


> > BUG_ON(buffer_locked(bh));
> > if (buffer_dirty(bh))
> > mpage_add_bh_to_extent(mpd, logical, bh);
> > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > index 789b6ad..655b8bf 100644
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -881,7 +881,12 @@ int write_cache_pages(struct address_space *mapping,
> > pagevec_init(&pvec, 0);
> > if (wbc->range_cyclic) {
> > index = mapping->writeback_index; /* Start from prev offset */
> > - end = -1;
> > + /*
> > + * write only till the specified range_end even in cyclic mode
> > + */
> > + end = wbc->range_end >> PAGE_CACHE_SHIFT;
> > + if (!end)
> > + end = -1;
> > } else {
> > index = wbc->range_start >> PAGE_CACHE_SHIFT;
> > end = wbc->range_end >> PAGE_CACHE_SHIFT;
> Are you sure you won't break other users of range_cyclic with this
> change?
>

I haven't run any specific test to verify that. The concern was that if
we force cyclic mode for writeout in delalloc, we may be starting the
writeout from a different offset than specified and would be writing
more. So the change was to use the offset specified. A quick look at
the kernel suggested most callers had range_end as 0 with cyclic mode.
I haven't audited the full kernel. I will do that. Meanwhile, if you
think it is risky to make this change, I guess we should drop this
part. But I guess we can keep the below change

+ index = mapping->writeback_index;
+ if (!range_cyclic) {
+ /*
+ * We force cyclic write out of pages. If the
+ * caller didn't request for range_cyclic update
+ * set the writeback_index to what the caller requested.
+ */
+ mapping->writeback_index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ }


-aneesh

2008-06-02 10:28:01

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

On Mon 02-06-08 15:29:56, Aneesh Kumar K.V wrote:
> On Mon, Jun 02, 2008 at 11:35:00AM +0200, Jan Kara wrote:
> > > BUG_ON(buffer_locked(bh));
> > > if (buffer_dirty(bh))
> > > mpage_add_bh_to_extent(mpd, logical, bh);
> > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > > index 789b6ad..655b8bf 100644
> > > --- a/mm/page-writeback.c
> > > +++ b/mm/page-writeback.c
> > > @@ -881,7 +881,12 @@ int write_cache_pages(struct address_space *mapping,
> > > pagevec_init(&pvec, 0);
> > > if (wbc->range_cyclic) {
> > > index = mapping->writeback_index; /* Start from prev offset */
> > > - end = -1;
> > > + /*
> > > + * write only till the specified range_end even in cyclic mode
> > > + */
> > > + end = wbc->range_end >> PAGE_CACHE_SHIFT;
> > > + if (!end)
> > > + end = -1;
> > > } else {
> > > index = wbc->range_start >> PAGE_CACHE_SHIFT;
> > > end = wbc->range_end >> PAGE_CACHE_SHIFT;
> > Are you sure you won't break other users of range_cyclic with this
> > change?
> >
> I haven't run any specific test to verify that. The concern was that if
> we force cyclic mode for writeout in delalloc we may be starting the
> writeout from a different offset than specified and would be writing
> more. So the changes was to use the offset specified. A quick look at
> the kernel suggested most of them had range_end as 0 with cyclic_mode.
> I haven't audited the full kernel. I will do that. Meanwhile if you
> think it is risky to make this changes i guess we should drop this
> part. But i guess we can keep the below change
Hmm, I've just got an idea that it may be better to introduce a new flag
for wbc like range_cont and it would mean that we start scan at
writeback_index (we use range_start if writeback_index is not set) and
end with range_end. That way we don't have to be afraid of interference
with other range_cyclic users and in principle, range_cyclic is originally
meant for other uses...

> + index = mapping->writeback_index;
> + if (!range_cyclic) {
> + /*
> + * We force cyclic write out of pages. If the
> + * caller didn't request for range_cyclic update
> + * set the writeback_index to what the caller requested.
> + */
> + mapping->writeback_index = wbc->range_start >> PAGE_CACHE_SHIFT;
> + }

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-02 10:40:49

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH] ext4: Add validation to jbd lock inversion patch and split and writepage

On Mon 02-06-08 15:22:22, Aneesh Kumar K.V wrote:
> Hi Jan,
>
>
> On Mon, Jun 02, 2008 at 11:31:06AM +0200, Jan Kara wrote:
> > Hi Aneesh,
> >
> > Thanks for the patch but I though we decided to do this a bit differently -
> > see below.
>
> Please feel free to merge the changes back to the lock inversion
> patches. I sent it as a separate the patches to make it easier for review.
OK, thanks. I'll look into this.

> > >
> > > + if (walk_page_buffers(NULL, page_bufs, 0,
> > > + len, NULL, ext4_bh_unmapped_or_delay)) {
> > > + printk(KERN_CRIT "%s called with unmapped or delay buffer\n",
> > > + __func__);
> > > + BUG();
> > > + }
> > > + /* FIXME!! do we need to call prepare_write for a mapped buffer */
> > This can go to ext4_journalled_writepage(). What is actually this FIXME
> > about? I'm not sure I get it...
> >
> I was wondering whether we need to call prepare_write in writepage ?. We
> are not allocating any new blocks in writepage with these changes.
Ah, I see. I'm only not sure whether we can rely on all buffers in the
page being uptodate... Otherwise I think block_prepare_write() is not
needed.

> > > ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> > > if (ret != 0)
> > > goto out_unlock;
> > >
> > > - page_bufs = page_buffers(page);
> > > walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
> > > bget_one);
> > > /* As soon as we unlock the page, it can go away, but we have
> > > @@ -1671,14 +1713,13 @@ static int __ext4_journalled_writepage(struct page *page,
> > > static int ext4_journalled_writepage(struct page *page,
> > > struct writeback_control *wbc)
> > > {
> > > + BUG_ON(!page_has_buffers(page));
> > > +
> > > if (ext4_journal_current_handle())
> > > goto no_write;
> > >
> > > - if (!page_has_buffers(page) || PageChecked(page)) {
> > > - /*
> > > - * It's mmapped pagecache. Add buffers and journal it. There
> > > - * doesn't seem much point in redirtying the page here.
> > > - */
> > > + if (PageChecked(page)) {
> > > + /* dirty pages in data=journal mode */
> > > ClearPageChecked(page);
> > > return __ext4_journalled_writepage(page, wbc);
> > > } else {
> > > @@ -3520,6 +3561,96 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> > > return err;
> > > }
> > >
> > > +static int __ext4_journalled_allocpage(struct page *page,
> > > + struct writeback_control *wbc)
> > > +{
> > > + int ret = 0, err;
> > > + handle_t *handle = NULL;
> > > + struct address_space *mapping = page->mapping;
> > > + struct inode *inode = mapping->host;
> > > + struct buffer_head *page_bufs;
> > > +
> > > + /* if alloc we are called after statring a journal */
> > > + handle = ext4_journal_current_handle();
> > > + BUG_ON(!handle);
> > > +
> > > + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
> > > + if (ret != 0)
> > > + goto out_unlock;
> > > +
> > > + /* FIXME!! should we do a bget_one */
> > > + page_bufs = page_buffers(page);
> > > + ret = walk_page_buffers(handle, page_bufs, 0,
> > > + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
> > > +
> I also have a FIXME here. I am not sure whether unlocking the page have
> some effect. Can you verify this ?
Well, you unlock the page only after you're completely done with it as
far as I read the code. So that is correct. You only need to get references
to buffers when you need to access them after you unlock the page.

> > > + err = walk_page_buffers(handle, page_bufs, 0,
> > > + PAGE_CACHE_SIZE, NULL, write_end_fn);
> > > + if (ret == 0)
> > > + ret = err;
> > > + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
> > > +
> > > +out_unlock:
> > > + unlock_page(page);
> > > + return ret;
> > > +}
> > > +

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-03 04:43:24

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Mon, 2008-06-02 at 13:35 +0530, Aneesh Kumar K.V wrote:
> On Mon, Jun 02, 2008 at 12:04:07AM -0700, Mingming Cao wrote:
> > On Mon, 2008-06-02 at 12:05 +0530, Aneesh Kumar K.V wrote:
> > > On Sun, Jun 01, 2008 at 10:38:35PM -0700, Mingming Cao wrote:
> > > > On Mon, 2008-06-02 at 09:39 +0530, Aneesh Kumar K.V wrote:
> > > > > On Sun, Jun 01, 2008 at 08:50:32PM -0700, Mingming Cao wrote:
> > > > > > On Mon, 2008-06-02 at 08:44 +0530, Aneesh Kumar K.V wrote:
> > > > > > > On Sun, Jun 01, 2008 at 02:10:02PM -0700, Mingming Cao wrote:
> > > > > > > > ext4: Need clear buffer_delay after page writeout for delayed allocation
> > > > > > > >
> > > > > > > > From: Mingming Cao <[email protected]>
> > > > > > > >
> > > > > > > > Need clear buffer_delay in ext4_da_writepage() after page has been writeout
> > > > > > > >
> > > > > > > > Signed-off-by: Mingming Cao <[email protected]>
> > > > > > > >
> > > > > > > > ---
> > > > > > >
> > > > > > > We do that in mpage_put_bnr_to_bhs.
> > > > > > >
> > > > > > Normally delayed buffer could be cleared in that case, but if allocation
> > > > > > failed in __mapge_da_writepages(), it will keep buffer_delay marked and
> > > > > > deferring to later ext4_da_writepage() to do block allocation. This
> > > > > > patch handles clear bh delay bit in this case.
> > > > > >
> > > > >
> > > > > Why not do it in ext4_da_get_block_write then.
> > > >
> > > > The buffer head passed to ext4_da_get_block_write() calling from
> > > > mpage_da_map_blocks is a dummy one, to store the allocated extent, not
> > > > the bh that need map.
> > >
> > >
> > > ie true when ext4_da_get_block_write is called via writepages. In
> > > that case mpage_put_bnr_to_bhs clears the delay bit properly. How about
> > > the changes below.
> > >
> > I see your patch below is trying to address how to detect and assign
> > blocks with your suggestion(i.e clear delayed bit in get_block). But I
> > don;t think it's needed.
> >
> > My last email I mean the buffer head new in mpage_da_map_blocks() is a
> > dummy bh, the real buffer head lbh is not passed to get_block. We could
> > clear the delayed bit on successful return of get_block,
> > mpage_put_bnr_to_bhs() ignore that dummy bh anyway. But that seems
> > twisted, unccessary.
> >
> > I still think clear the bit in the ext4_da_write_page() is more clean
> > way. the original patch clears the delayed bit on success case.
> >
> > For the error case I think we could handle properly by only clear the
> > delayed bit if buffer is mapped.
>
>
> Buffer marked as delay is also mapped
> fs/ext4/inode.c
>
> 1437 map_bh(bh_result, inode->i_sb, 0);
> 1438 set_buffer_new(bh_result);
> 1439 set_buffer_delay(bh_result);
>
>
I found a better place to handle this: it makes sense to clear this bit in
block_write_full_page() after get_block() returns successfully. This
handles the partial error case smoothly. Updated patch below (to replace the
original patch)

ext4: Need clear buffer_delay in block_write_full_page() after allocation

From: Mingming Cao <[email protected]>

Normally a delayed buffer would be cleared in mpage_da_map_blocks(), after
blocks are successfully allocated. But if the allocation failed, the
buffer will stay marked buffer_delay, deferring the block allocation to a
later ext4_da_writepage() (via block_write_full_page()).
This patch handles clearing the bh delay bit in this case: clear buffer_delay
in block_write_full_page() after the block is allocated.

This patch also fixed a bug in block_write_full_page() error case, we need
to check the delayed flag before flush bh to disk when trying to recover from
error.

Signed-off-by: Mingming Cao <[email protected]>

---
fs/buffer.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

Index: linux-2.6.26-rc4/fs/buffer.c
===================================================================
--- linux-2.6.26-rc4.orig/fs/buffer.c 2008-06-02 21:34:30.000000000 -0700
+++ linux-2.6.26-rc4/fs/buffer.c 2008-06-02 21:35:17.000000000 -0700
@@ -1697,6 +1697,7 @@ static int __block_write_full_page(struc
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1775,7 +1776,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh)
+ && !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {



2008-06-03 10:07:58

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Need clear buffer_delay after page writeout for delayed allocation

On Mon, Jun 02, 2008 at 09:43:18PM -0700, Mingming Cao wrote:
> >
> > Buffer marked as delay is also mapped
> > fs/ext4/inode.c
> >
> > 1437 map_bh(bh_result, inode->i_sb, 0);
> > 1438 set_buffer_new(bh_result);
> > 1439 set_buffer_delay(bh_result);
> >
> >
> I find a better place to handle this, it make sense to clear this bit in
> block_write_full_page() after get_block() returns successfully. This
> handles partial error case smoothly. Updated patch below (to replace the
> original patch)
>
> ext4: Need clear buffer_delay in block_write_full_page() after allocation
>
> From: Mingming Cao <[email protected]>
>
> Normally delayed buffer could be cleared in mpage_da_map_blocks(), after
> blocks are successfully allocated. But if allocation failed, it will
> keep buffer_delay marked and deferring to later
> ext4_da_writepage()(via block_write_full_page()) to do block allocation.
> This patch handles clear bh delay bit in this case. Clear buffer_delay
> in block_write_full_page() after the block is allocated.
>
> This patch also fixed a bug in block_write_full_page() error case, we need
> to check the delayed flag before flush bh to disk when trying to recover from
> error.
>
> Signed-off-by: Mingming Cao <[email protected]>
>
> ---
> fs/buffer.c | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
> Index: linux-2.6.26-rc4/fs/buffer.c
> ===================================================================
> --- linux-2.6.26-rc4.orig/fs/buffer.c 2008-06-02 21:34:30.000000000 -0700
> +++ linux-2.6.26-rc4/fs/buffer.c 2008-06-02 21:35:17.000000000 -0700
> @@ -1697,6 +1697,7 @@ static int __block_write_full_page(struc
> err = get_block(inode, block, bh, 1);
> if (err)
> goto recover;
> + clear_buffer_delay(bh);

I still think we should clear the buffer_head delay bit in the get_block
rather than the callers. That way all the callers will be returned the
buffer_head with delay bit cleared. We do that for mapped and new bit
already.


> if (buffer_new(bh)) {
> /* blockdev mappings never come here */
> clear_buffer_new(bh);
> @@ -1775,7 +1776,8 @@ recover:
> bh = head;
> /* Recovery: lock and submit the mapped buffers */
> do {
> - if (buffer_mapped(bh) && buffer_dirty(bh)) {
> + if (buffer_mapped(bh) && buffer_dirty(bh)
> + && !buffer_delay(bh)) {
> lock_buffer(bh);
> mark_buffer_async_write(bh);
> } else {
>
>


This should be a separate bug fix patch. If we fail the allocation of a
block marked delay we should not write it.

-aneesh

2008-06-05 13:54:53

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

On Mon, Jun 02, 2008 at 12:27:59PM +0200, Jan Kara wrote:
> On Mon 02-06-08 15:29:56, Aneesh Kumar K.V wrote:
> > On Mon, Jun 02, 2008 at 11:35:00AM +0200, Jan Kara wrote:
> > > > BUG_ON(buffer_locked(bh));
> > > > if (buffer_dirty(bh))
> > > > mpage_add_bh_to_extent(mpd, logical, bh);
> > > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > > > index 789b6ad..655b8bf 100644
> > > > --- a/mm/page-writeback.c
> > > > +++ b/mm/page-writeback.c
> > > > @@ -881,7 +881,12 @@ int write_cache_pages(struct address_space *mapping,
> > > > pagevec_init(&pvec, 0);
> > > > if (wbc->range_cyclic) {
> > > > index = mapping->writeback_index; /* Start from prev offset */
> > > > - end = -1;
> > > > + /*
> > > > + * write only till the specified range_end even in cyclic mode
> > > > + */
> > > > + end = wbc->range_end >> PAGE_CACHE_SHIFT;
> > > > + if (!end)
> > > > + end = -1;
> > > > } else {
> > > > index = wbc->range_start >> PAGE_CACHE_SHIFT;
> > > > end = wbc->range_end >> PAGE_CACHE_SHIFT;
> > > Are you sure you won't break other users of range_cyclic with this
> > > change?
> > >
> > I haven't run any specific test to verify that. The concern was that if
> > we force cyclic mode for writeout in delalloc we may be starting the
> > writeout from a different offset than specified and would be writing
> > more. So the change was to use the offset specified. A quick look at
> > the kernel suggested most of them had range_end as 0 with cyclic_mode.
> > I haven't audited the full kernel. I will do that. Meanwhile if you
> > think it is risky to make this change I guess we should drop this
> > part. But I guess we can keep the below change
> Hmm, I've just got an idea that it may be better to introduce a new flag
> for wbc like range_cont and it would mean that we start scan at
> writeback_index (we use range_start if writeback_index is not set) and
> end with range_end. That way we don't have to be afraid of interference
> with other range_cyclic users and in principle, range_cyclic is originally
> meant for other uses...
>

something like below ?. With this ext4_da_writepages have

pgoff_t writeback_index = 0;
.....
if (!wbc->range_cyclic) {
/*
* If range_cyclic is not set force range_cont
* and save the old writeback_index
*/
wbc->range_cont = 1;
writeback_index = mapping->writeback_index;
mapping->writeback_index = 0;
}
...
mpage_da_writepages(..)
..
if (writeback_index)
mapping->writeback_index = writeback_index;
return ret;



mm: Add range_cont mode for writeback.

From: Aneesh Kumar K.V <[email protected]>

Filesystems like ext4 need to start a new transaction in
the writepages for block allocation. This happens with delayed
allocation and there is a limit to how many credits we can request
from the journal layer. So we call write_cache_pages multiple
times with wbc->nr_to_write set to the maximum possible value
limited by the max journal credits available.

Add a new mode to writeback that enables us to handle this
behaviour. If mapping->writeback_index is not set we use
wbc->range_start to find the start index and then at the end
of write_cache_pages we store the index in writeback_index. Next
call to write_cache_pages will start writeout from writeback_index.
Also we limit writing to the specified wbc->range_end.


Signed-off-by: Aneesh Kumar K.V <[email protected]>
---

include/linux/writeback.h | 1 +
mm/page-writeback.c | 10 +++++++++-
2 files changed, 10 insertions(+), 1 deletions(-)


diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f462439..0d8573e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ unsigned range_cont:1;
};

/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6ad..014a9f2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -882,6 +882,12 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ } else if (wbc->range_cont) {
+ if (!mapping->writeback_index)
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ else
+ index = mapping->writeback_index;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -954,7 +960,9 @@ int write_cache_pages(struct address_space *mapping,
index = 0;
goto retry;
}
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ if (wbc->range_cyclic ||
+ (range_whole && wbc->nr_to_write > 0) ||
+ wbc->range_cont)
mapping->writeback_index = index;
return ret;
}

2008-06-05 16:22:11

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

On Thu 05-06-08 19:24:13, Aneesh Kumar K.V wrote:
> > Hmm, I've just got an idea that it may be better to introduce a new flag
> > for wbc like range_cont and it would mean that we start scan at
> > writeback_index (we use range_start if writeback_index is not set) and
> > end with range_end. That way we don't have to be afraid of interference
> > with other range_cyclic users and in principle, range_cyclic is originally
> > meant for other uses...
> >
>
> something like below ?. With this ext4_da_writepages have
>
> pgoff_t writeback_index = 0;
> .....
> if (!wbc->range_cyclic) {
> /*
> * If range_cyclic is not set force range_cont
> * and save the old writeback_index
> */
> wbc->range_cont = 1;
> writeback_index = mapping->writeback_index;
> mapping->writeback_index = 0;
> }
> ...
> mpage_da_writepages(..)
> ..
> if (writeback_index)
> mapping->writeback_index = writeback_index;
> return ret;
>
>
>
> mm: Add range_cont mode for writeback.
>
> From: Aneesh Kumar K.V <[email protected]>
>
> Filesystems like ext4 needs to start a new transaction in
> the writepages for block allocation. This happens with delayed
> allocation and there is limit to how many credits we can request
> from the journal layer. So we call write_cache_pages multiple
> times with wbc->nr_to_write set to the maximum possible value
> limitted by the max journal credits available.
>
> Add a new mode to writeback that enables us to handle this
> behaviour. If mapping->writeback_index is not set we use
> wbc->range_start to find the start index and then at the end
> of write_cache_pages we store the index in writeback_index. Next
> call to write_cache_pages will start writeout from writeback_index.
> Also we limit writing to the specified wbc->range_end.
>
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
>
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 10 +++++++++-
> 2 files changed, 10 insertions(+), 1 deletions(-)
I like it. I'm only not sure whether there cannot be two users of
write_cache_pages() operating on the same mapping at the same time. Because
then they could alter writeback_index under each other and that would
probably result in unpleasant behavior. I think there can be two parallel
calls for example from sync_single_inode() and sync_page_range().
In that case we'd need something like writeback_index inside wbc (or
maybe just alter range_start automatically when range_cont is set?) so that
parallel callers do no influence each other.

Honza

> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index f462439..0d8573e 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -63,6 +63,7 @@ struct writeback_control {
> unsigned for_writepages:1; /* This is a writepages() call */
> unsigned range_cyclic:1; /* range_start is cyclic */
> unsigned more_io:1; /* more io to be dispatched */
> + unsigned range_cont:1;
> };
>
> /*
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 789b6ad..014a9f2 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -882,6 +882,12 @@ int write_cache_pages(struct address_space *mapping,
> if (wbc->range_cyclic) {
> index = mapping->writeback_index; /* Start from prev offset */
> end = -1;
> + } else if (wbc->range_cont) {
> + if (!mapping->writeback_index)
> + index = wbc->range_start >> PAGE_CACHE_SHIFT;
> + else
> + index = mapping->writeback_index;
> + end = wbc->range_end >> PAGE_CACHE_SHIFT;
> } else {
> index = wbc->range_start >> PAGE_CACHE_SHIFT;
> end = wbc->range_end >> PAGE_CACHE_SHIFT;
> @@ -954,7 +960,9 @@ int write_cache_pages(struct address_space *mapping,
> index = 0;
> goto retry;
> }
> - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> + if (wbc->range_cyclic ||
> + (range_whole && wbc->nr_to_write > 0) ||
> + wbc->range_cont)
> mapping->writeback_index = index;
> return ret;
> }
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-05 19:19:19

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

On Thu, Jun 05, 2008 at 06:22:09PM +0200, Jan Kara wrote:
> I like it. I'm only not sure whether there cannot be two users of
> write_cache_pages() operating on the same mapping at the same time. Because
> then they could alter writeback_index under each other and that would
> probably result in unpleasant behavior. I think there can be two parallel
> calls for example from sync_single_inode() and sync_page_range().
> In that case we'd need something like writeback_index inside wbc (or
> maybe just alter range_start automatically when range_cont is set?) so that
> parallel callers do no influence each other.
>

commit e56edfdeea0d336e496962782f08e1224a101cf2
Author: Aneesh Kumar K.V <[email protected]>
Date: Fri Jun 6 00:47:35 2008 +0530

mm: Add range_cont mode for writeback.

Filesystems like ext4 need to start a new transaction in
the writepages for block allocation. This happens with delayed
allocation and there is a limit to how many credits we can request
from the journal layer. So we call write_cache_pages multiple
times with wbc->nr_to_write set to the maximum possible value
limited by the max journal credits available.

Add a new mode to writeback that enables us to handle this
behaviour. If mapping->writeback_index is not set we use
wbc->range_start to find the start index and then at the end
of write_cache_pages we store the index in writeback_index. Next
call to write_cache_pages will start writeout from writeback_index.
Also we limit writing to the specified wbc->range_end.


Signed-off-by: Aneesh Kumar K.V <[email protected]>

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f462439..0d8573e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ unsigned range_cont:1;
};

/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6ad..182233b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -882,6 +882,9 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ } else if (wbc->range_cont) {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -956,6 +959,9 @@ int write_cache_pages(struct address_space *mapping,
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+
+ if (wbc->range_cont)
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);

2008-06-11 12:41:58

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

On Fri 06-06-08 00:49:09, Aneesh Kumar K.V wrote:
> On Thu, Jun 05, 2008 at 06:22:09PM +0200, Jan Kara wrote:
> > I like it. I'm only not sure whether there cannot be two users of
> > write_cache_pages() operating on the same mapping at the same time. Because
> > then they could alter writeback_index under each other and that would
> > probably result in unpleasant behavior. I think there can be two parallel
> > calls for example from sync_single_inode() and sync_page_range().
> > In that case we'd need something like writeback_index inside wbc (or
> > maybe just alter range_start automatically when range_cont is set?) so that
> > parallel callers do no influence each other.
> >
>
> commit e56edfdeea0d336e496962782f08e1224a101cf2
> Author: Aneesh Kumar K.V <[email protected]>
> Date: Fri Jun 6 00:47:35 2008 +0530
>
> mm: Add range_cont mode for writeback.
>
> Filesystems like ext4 needs to start a new transaction in
> the writepages for block allocation. This happens with delayed
> allocation and there is limit to how many credits we can request
> from the journal layer. So we call write_cache_pages multiple
> times with wbc->nr_to_write set to the maximum possible value
> limitted by the max journal credits available.
>
> Add a new mode to writeback that enables us to handle this
> behaviour. If mapping->writeback_index is not set we use
> wbc->range_start to find the start index and then at the end
> of write_cache_pages we store the index in writeback_index. Next
> call to write_cache_pages will start writeout from writeback_index.
> Also we limit writing to the specified wbc->range_end.
I think this changelog is out of date...

> Signed-off-by: Aneesh Kumar K.V <[email protected]>
>
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index f462439..0d8573e 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -63,6 +63,7 @@ struct writeback_control {
> unsigned for_writepages:1; /* This is a writepages() call */
> unsigned range_cyclic:1; /* range_start is cyclic */
> unsigned more_io:1; /* more io to be dispatched */
> + unsigned range_cont:1;
> };
>
> /*
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 789b6ad..182233b 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -882,6 +882,9 @@ int write_cache_pages(struct address_space *mapping,
> if (wbc->range_cyclic) {
> index = mapping->writeback_index; /* Start from prev offset */
> end = -1;
> + } else if (wbc->range_cont) {
> + index = wbc->range_start >> PAGE_CACHE_SHIFT;
> + end = wbc->range_end >> PAGE_CACHE_SHIFT;
Hmm, why isn't this in the next else?

> } else {
> index = wbc->range_start >> PAGE_CACHE_SHIFT;
> end = wbc->range_end >> PAGE_CACHE_SHIFT;
> @@ -956,6 +959,9 @@ int write_cache_pages(struct address_space *mapping,
> }
> if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> mapping->writeback_index = index;
> +
> + if (wbc->range_cont)
> + wbc->range_start = index << PAGE_CACHE_SHIFT;
> return ret;
> }
> EXPORT_SYMBOL(write_cache_pages);

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-11 13:57:00

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

On Wed, Jun 11, 2008 at 02:41:57PM +0200, Jan Kara wrote:
> On Fri 06-06-08 00:49:09, Aneesh Kumar K.V wrote:
> > On Thu, Jun 05, 2008 at 06:22:09PM +0200, Jan Kara wrote:
> > > I like it. I'm only not sure whether there cannot be two users of
> > > write_cache_pages() operating on the same mapping at the same time. Because
> > > then they could alter writeback_index under each other and that would
> > > probably result in unpleasant behavior. I think there can be two parallel
> > > calls for example from sync_single_inode() and sync_page_range().
> > > In that case we'd need something like writeback_index inside wbc (or
> > > maybe just alter range_start automatically when range_cont is set?) so that
> > > parallel callers do no influence each other.
> > >
> >
> > commit e56edfdeea0d336e496962782f08e1224a101cf2
> > Author: Aneesh Kumar K.V <[email protected]>
> > Date: Fri Jun 6 00:47:35 2008 +0530
> >
> > mm: Add range_cont mode for writeback.
> >
> > Filesystems like ext4 needs to start a new transaction in
> > the writepages for block allocation. This happens with delayed
> > allocation and there is limit to how many credits we can request
> > from the journal layer. So we call write_cache_pages multiple
> > times with wbc->nr_to_write set to the maximum possible value
> > limitted by the max journal credits available.
> >
> > Add a new mode to writeback that enables us to handle this
> > behaviour. If mapping->writeback_index is not set we use
> > wbc->range_start to find the start index and then at the end
> > of write_cache_pages we store the index in writeback_index. Next
> > call to write_cache_pages will start writeout from writeback_index.
> > Also we limit writing to the specified wbc->range_end.
> I think this changelog is out of date...

The patch in the patchqueue have an updated changelog.


>
> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> >
> > diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> > index f462439..0d8573e 100644
> > --- a/include/linux/writeback.h
> > +++ b/include/linux/writeback.h
> > @@ -63,6 +63,7 @@ struct writeback_control {
> > unsigned for_writepages:1; /* This is a writepages() call */
> > unsigned range_cyclic:1; /* range_start is cyclic */
> > unsigned more_io:1; /* more io to be dispatched */
> > + unsigned range_cont:1;
> > };
> >
> > /*
> > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > index 789b6ad..182233b 100644
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -882,6 +882,9 @@ int write_cache_pages(struct address_space *mapping,
> > if (wbc->range_cyclic) {
> > index = mapping->writeback_index; /* Start from prev offset */
> > end = -1;
> > + } else if (wbc->range_cont) {
> > + index = wbc->range_start >> PAGE_CACHE_SHIFT;
> > + end = wbc->range_end >> PAGE_CACHE_SHIFT;
> Hmm, why isn't this in the next else?

The patch in the patchqueue have

+ } else if (wbc->range_cont) {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ /*
+ * we want to set the writeback_index when congested
+ * and we are requesting for nonblocking mode,
+ * because we won't force the range_cont mode then
+ */
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;


I was not clear about setting scanned = 1. Now that I read it again, I
guess it makes sense to set scanned = 1. We don't need to start the
writeout from index=0 when range_cont is set.


>
> > } else {
> > index = wbc->range_start >> PAGE_CACHE_SHIFT;
> > end = wbc->range_end >> PAGE_CACHE_SHIFT;
> > @@ -956,6 +959,9 @@ int write_cache_pages(struct address_space *mapping,
> > }
> > if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> > mapping->writeback_index = index;
> > +
> > + if (wbc->range_cont)
> > + wbc->range_start = index << PAGE_CACHE_SHIFT;
> > return ret;
> > }
> > EXPORT_SYMBOL(write_cache_pages);
>
> Honza

Attaching the updated patch.

Mingming,

Can you update the patchqueu with the below attached patch ?

-aneesh

mm: Add range_cont mode for writeback.

From: Aneesh Kumar K.V <[email protected]>

Filesystems like ext4 need to start a new transaction in
the writepages for block allocation. This happens with delayed
allocation and there is a limit to how many credits we can request
from the journal layer. So we call write_cache_pages multiple
times with wbc->nr_to_write set to the maximum possible value
limited by the max journal credits available.

Add a new mode to writeback that enables us to handle this
behaviour. In the new mode we update wbc->range_start
to point to the new offset to be written. The next call to
write_cache_pages will start writeout from the specified
range_start offset. In the new mode we also limit writing
to the specified wbc->range_end.


Signed-off-by: Aneesh Kumar K.V <[email protected]>
---

include/linux/writeback.h | 1 +
mm/page-writeback.c | 3 +++
2 files changed, 4 insertions(+), 0 deletions(-)


diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f462439..0d8573e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ unsigned range_cont:1;
};

/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6ad..ded57d5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -956,6 +956,9 @@ int write_cache_pages(struct address_space *mapping,
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+
+ if (wbc->range_cont)
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);

2008-06-11 17:48:16

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

> Attaching the updated patch.
>
> Mingming,
>
> Can you update the patchqueu with the below attached patch ?
>
> -aneesh
>
> mm: Add range_cont mode for writeback.
>
> From: Aneesh Kumar K.V <[email protected]>
>
> Filesystems like ext4 needs to start a new transaction in
> the writepages for block allocation. This happens with delayed
> allocation and there is limit to how many credits we can request
> from the journal layer. So we call write_cache_pages multiple
> times with wbc->nr_to_write set to the maximum possible value
> limitted by the max journal credits available.
>
> Add a new mode to writeback that enables us to handle this
> behaviour. In the new mode we update the wbc->range_start
> to point to the new offset to be written. Next call to
> call to write_cache_pages will start writeout from specified
> range_start offset. In the new mode we also limit writing
> to the specified wbc->range_end.
>
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
Acked-by: Jan Kara <[email protected]>

Honza
> ---
>
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 3 +++
> 2 files changed, 4 insertions(+), 0 deletions(-)
>
>
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index f462439..0d8573e 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -63,6 +63,7 @@ struct writeback_control {
> unsigned for_writepages:1; /* This is a writepages() call */
> unsigned range_cyclic:1; /* range_start is cyclic */
> unsigned more_io:1; /* more io to be dispatched */
> + unsigned range_cont:1;
> };
>
> /*
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 789b6ad..ded57d5 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -956,6 +956,9 @@ int write_cache_pages(struct address_space *mapping,
> }
> if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> mapping->writeback_index = index;
> +
> + if (wbc->range_cont)
> + wbc->range_start = index << PAGE_CACHE_SHIFT;
> return ret;
> }
> EXPORT_SYMBOL(write_cache_pages);
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-12 23:10:49

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

On Wed, 2008-06-11 at 19:26 +0530, Aneesh Kumar K.V wrote:

> Attaching the updated patch.
>
> Mingming,
>
> Can you update the patchqueu with the below attached patch ?
>

Sure, just did.

Mingming
> -aneesh
>
> mm: Add range_cont mode for writeback.
>
> From: Aneesh Kumar K.V <[email protected]>
>
> Filesystems like ext4 needs to start a new transaction in
> the writepages for block allocation. This happens with delayed
> allocation and there is limit to how many credits we can request
> from the journal layer. So we call write_cache_pages multiple
> times with wbc->nr_to_write set to the maximum possible value
> limitted by the max journal credits available.
>
> Add a new mode to writeback that enables us to handle this
> behaviour. In the new mode we update the wbc->range_start
> to point to the new offset to be written. Next call to
> call to write_cache_pages will start writeout from specified
> range_start offset. In the new mode we also limit writing
> to the specified wbc->range_end.
>
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
>
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 3 +++
> 2 files changed, 4 insertions(+), 0 deletions(-)
>
>
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index f462439..0d8573e 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -63,6 +63,7 @@ struct writeback_control {
> unsigned for_writepages:1; /* This is a writepages() call */
> unsigned range_cyclic:1; /* range_start is cyclic */
> unsigned more_io:1; /* more io to be dispatched */
> + unsigned range_cont:1;
> };
>
> /*
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 789b6ad..ded57d5 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -956,6 +956,9 @@ int write_cache_pages(struct address_space *mapping,
> }
> if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> mapping->writeback_index = index;
> +
> + if (wbc->range_cont)
> + wbc->range_start = index << PAGE_CACHE_SHIFT;
> return ret;
> }
> EXPORT_SYMBOL(write_cache_pages);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html