2008-06-06 18:25:02

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Patches for the patchqueue

I addressed most of the comments from the last review. The updated patches are
sent as a follow-up to this mail. Also the patches and the series file
which indicates their respective ordering in the patchqueue can be found at

http://www.radian.org/~kvaneesh/ext4/jun-6-2008/

-aneesh




2008-06-06 18:25:07

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: cleanup blockallocator

Move the code for block allocation to a single function and add helpers
for the allocation of data and meta data blocks

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 74 ++++++++++++++++++++--------------------------------
fs/ext4/ext4.h | 2 +-
fs/ext4/mballoc.c | 2 +-
3 files changed, 31 insertions(+), 47 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b961ad1..10c2d49 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1645,7 +1645,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}

/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_orlov_new_blocks() -- core block(s) allocation function
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
@@ -1658,7 +1658,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* any specific goal block.
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_orlov_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1928,55 +1928,17 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
return 0;
}

-ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
-{
- struct ext4_allocation_request ar;
- ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
- }
+#define EXT4_META_BLOCK 0x1

- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = 1;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- return ret;
-}
-ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
- struct ext4_allocation_request ar;
- ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
- }
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
- return ret;
-}
-
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, ext4_fsblk_t goal,
- unsigned long *count, int *errp)
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;

if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ return ext4_orlov_new_blocks(handle, inode, goal, count, errp);
}

memset(&ar, 0, sizeof(ar));
@@ -1990,7 +1952,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
ar.goal = goal;
ar.len = *count;
ar.logical = iblock;
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
ar.flags = EXT4_MB_HINT_DATA;
else
/* disable in-core preallocation for non-regular files */
@@ -2001,6 +1963,28 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
}


+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return do_blk_alloc(handle, inode, 0, goal,
+ &count, errp, EXT4_META_BLOCK);
+}
+
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+}
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
+
/**
* ext4_count_free_blocks() -- count filesystem free blocks
* @sb: superblock
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b3e62b7..e70ab6e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -977,7 +977,7 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, ext4_fsblk_t goal,
unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_orlov_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 21a9e04..0011374 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4035,7 +4035,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);

if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+ block = ext4_orlov_new_blocks(handle, ar->inode, ar->goal,
&(ar->len), errp);
return block;
}
--
1.5.5.1.357.g1af8b.dirty


2008-06-06 18:25:21

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext3: Use page_mkwrite vma_operations to get mmap write notification.

We would like to get notified when we are doing a write to an mmap'ed
section. The changes are needed to handle ENOSPC when writing to an
mmap'ed section of files with holes.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext3/file.c | 19 +++++++++++-
fs/ext3/inode.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++
include/linux/ext3_fs.h | 1 +
3 files changed, 95 insertions(+), 1 deletions(-)

diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913..09e22e4 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -106,6 +106,23 @@ ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
return ret;
}

+static struct vm_operations_struct ext3_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext3_page_mkwrite,
+};
+
+static int ext3_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext3_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -116,7 +133,7 @@ ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext3_file_mmap,
.open = generic_file_open,
.release = ext3_release_file,
.fsync = ext3_sync_file,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 6ae4ecf..c8261f0 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3295,3 +3295,79 @@ int ext3_change_inode_journal_flag(struct inode *inode, int val)

return err;
}
+
+static int ext3_bh_prepare_fill(handle_t *handle, struct buffer_head *bh)
+{
+ if (!buffer_mapped(bh)) {
+ /*
+ * Mark buffer as dirty so that
+ * block_write_full_page() writes it
+ */
+ set_buffer_dirty(bh);
+ }
+ return 0;
+}
+
+static int ext3_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext3_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ loff_t size;
+ unsigned long len;
+ int ret = -EINVAL;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = 1 };
+
+ /*
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+ * get i_mutex because we are already holding mmap_sem.
+ */
+ down_read(&inode->i_alloc_sem);
+ size = i_size_read(inode);
+ if (page->mapping != mapping || size <= page_offset(page)
+ || !PageUptodate(page)) {
+ /* page got truncated from under us? */
+ goto out_unlock;
+ }
+ ret = 0;
+ if (PageMappedToDisk(page))
+ goto out_unlock;
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext3_bh_unmapped))
+ goto out_unlock;
+ /*
+ * Now mark all the buffer head dirty so
+ * that writepage can write it
+ */
+ walk_page_buffers(NULL, page_buffers(page), 0, len,
+ NULL, ext3_bh_prepare_fill);
+ }
+ /*
+ * OK, we need to fill the hole... Lock the page and do writepage.
+ * We can't do write_begin and write_end here because we don't
+ * have inode_mutex and that allow parallel write_begin, write_end call.
+ * (lock_page prevent this from happening on the same page though)
+ */
+ lock_page(page);
+ wbc.range_start = page_offset(page);
+ wbc.range_end = page_offset(page) + len;
+ ret = mapping->a_ops->writepage(page, &wbc);
+ /* writepage unlocks the page */
+out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+}
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 36c5403..715c35e 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -836,6 +836,7 @@ extern void ext3_truncate (struct inode *);
extern void ext3_set_inode_flags(struct inode *);
extern void ext3_get_inode_flags(struct ext3_inode_info *);
extern void ext3_set_aops(struct inode *inode);
+extern int ext3_page_mkwrite(struct vm_area_struct *vma, struct page *page);

/* ioctl.c */
extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
--
1.5.5.1.357.g1af8b.dirty


2008-06-06 18:25:14

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext2: Use page_mkwrite vma_operations to get mmap write notification.

We would like to get notified when we are doing a write to an mmap'ed
section. The changes are needed to handle ENOSPC when writing to an
mmap'ed section of files with holes.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext2/ext2.h | 1 +
fs/ext2/file.c | 21 ++++++++++++++++++++-
fs/ext2/inode.c | 5 +++++
3 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da..cc2e106 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -136,6 +136,7 @@ extern void ext2_get_inode_flags(struct ext2_inode_info *);
int __ext2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
+extern int ext2_page_mkwrite(struct vm_area_struct *vma, struct page *page);

/* ioctl.c */
extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c..d539dcf 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -18,6 +18,7 @@
* ([email protected])
*/

+#include <linux/mm.h>
#include <linux/time.h>
#include "ext2.h"
#include "xattr.h"
@@ -38,6 +39,24 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
return 0;
}

+static struct vm_operations_struct ext2_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext2_page_mkwrite,
+};
+
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext2_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
+
/*
* We have mostly NULL's here: the current defaults are ok for
* the ext2 filesystem.
@@ -52,7 +71,7 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext2_file_mmap,
.open = generic_file_open,
.release = ext2_release_file,
.fsync = ext2_sync_file,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 384fc0d..d4c5c23 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1443,3 +1443,8 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
error = ext2_acl_chmod(inode);
return error;
}
+
+int ext2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ return block_page_mkwrite(vma, page, ext2_get_block);
+}
--
1.5.5.1.357.g1af8b.dirty


2008-06-06 18:25:28

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] vfs: Don't flush delay buffer to disk

From: Mingming Cao <[email protected]>

In the block_write_full_page() error case, we need to check the
delayed flag before flushing the bh to disk when trying to recover
from the error.

Signed-off-by: Mingming Cao <[email protected]>
Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/buffer.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 2f86ca5..06b887d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1775,7 +1775,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
--
1.5.5.1.357.g1af8b.dirty


2008-06-06 18:25:36

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] mm: Add range_cont mode for writeback.

Filesystems like ext4 need to start a new transaction in
writepages for block allocation. This happens with delayed
allocation, and there is a limit to how many credits we can request
from the journal layer. So we call write_cache_pages multiple
times with wbc->nr_to_write set to the maximum possible value
limited by the max journal credits available.

Add a new mode to writeback that enables us to handle this
behaviour. In the new mode we update wbc->range_start
to point to the new offset to be written. The next call to
write_cache_pages will start writeout from the specified
range_start offset. In the new mode we also limit writing
to the specified wbc->range_end.


Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
include/linux/writeback.h | 1 +
mm/page-writeback.c | 13 +++++++++++++
2 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f462439..0d8573e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ unsigned range_cont:1;
};

/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6ad..7306902 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -882,6 +882,16 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ } else if (wbc->range_cont) {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ /*
+ * we want to set the writeback_index when congested
+ * and we are requesting for nonblocking mode,
+ * because we won't force the range_cont mode then
+ */
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -956,6 +966,9 @@ int write_cache_pages(struct address_space *mapping,
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+
+ if (wbc->range_cont)
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);
--
1.5.5.1.357.g1af8b.dirty


2008-06-06 18:25:44

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/ext4/inode.c | 107 ++++++++++++++++++++++++++++++++++++------------------
fs/mpage.c | 12 +++----
2 files changed, 76 insertions(+), 43 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0f8d071..b5bc627 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1480,50 +1480,74 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
up_write(&EXT4_I(inode)->i_data_sem);

if (EXT4_I(inode)->i_disksize == disksize) {
- if (handle == NULL)
- handle = ext4_journal_start(inode, 1);
- if (!IS_ERR(handle))
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
}
}
-
ret = 0;
}
-
return ret;
}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh) || buffer_delay(bh);
+}
+
/* FIXME!! only support data=writeback mode */
-static int __ext4_da_writepage(struct page *page,
+/*
+ * get called vi ext4_da_writepages after taking page lock
+ * We may end up doing block allocation here in case
+ * mpage_da_map_blocks failed to allocate blocks.
+ */
+static int ext4_da_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
int ret = 0;
+ loff_t size;
+ unsigned long len;
+ handle_t *handle = NULL;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;

handle = ext4_journal_current_handle();
+ if (!handle) {
+ /*
+ * This can happen when we aren't called via
+ * ext4_da_writepages() but directly (shrink_page_list).
+ * We cannot easily start a transaction here so we just skip
+ * writing the page in case we would have to do so.
+ */
+ size = i_size_read(inode);

- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ page_bufs = page_buffers(page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;

- if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
- EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_mark_inode_dirty(handle, inode);
+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ /*
+ * We can't do block allocation under
+ * page lock without a handle . So redirty
+ * the page and return
+ */
+ BUG_ON(wbc->sync_mode != WB_SYNC_NONE);
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
}

+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
+
return ret;
}
-static int ext4_da_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- if (!ext4_journal_current_handle())
- return __ext4_da_writepage(page, wbc);

- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
-}

/*
* For now just follow the DIO way to estimate the max credits
@@ -1545,8 +1569,8 @@ static int ext4_da_writepages(struct address_space *mapping,
handle_t *handle = NULL;
int needed_blocks;
int ret = 0;
- unsigned range_cyclic;
long to_write;
+ loff_t range_start = 0;

/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -1563,8 +1587,14 @@ static int ext4_da_writepages(struct address_space *mapping,
needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;

to_write = wbc->nr_to_write;
- range_cyclic = wbc->range_cyclic;
- wbc->range_cyclic = 1;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the old writeback_index
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }

while (!ret && to_write) {
/* start a new transaction*/
@@ -1579,17 +1609,27 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
- to_write -= wbc->nr_to_write;

+ to_write -= wbc->nr_to_write;
ret = mpage_da_writepages(mapping, wbc,
ext4_da_get_block_write);
ext4_journal_stop(handle);
- to_write += wbc->nr_to_write;
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
}

out_writepages:
wbc->nr_to_write = to_write;
- wbc->range_cyclic = range_cyclic;
+ if (range_start)
+ wbc->range_start = range_start;
return ret;
}

@@ -1720,11 +1760,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}

-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
- return !buffer_mapped(bh) || buffer_delay(bh);
-}
-
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
diff --git a/fs/mpage.c b/fs/mpage.c
index cde7f11..c4376ec 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -849,13 +849,11 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
do {
if (cur_logical >= logical + blocks)
break;
-
if (buffer_delay(bh)) {
bh->b_blocknr = pblock;
clear_buffer_delay(bh);
- } else if (buffer_mapped(bh)) {
+ } else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
- }

cur_logical++;
pblock++;
@@ -930,10 +928,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
if (buffer_delay(lbh))
mpage_put_bnr_to_bhs(mpd, next, &new);

- /* go for the remaining blocks */
- next += new.b_size >> mpd->inode->i_blkbits;
- remain -= new.b_size;
- }
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
}

#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
--
1.5.5.1.357.g1af8b.dirty


2008-06-06 18:25:54

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: delalloc block reservation fix

a) We need to decrement the meta data blocks that got allocated
from the percpu s_freeblocks_counter

b) We need to protect the reservation block counter so that
reserve and release space don't race with each other.

c) Don't check for free space in ext4_mb_new_blocks with delalloc.
We already reserved the space.

d) Don't release space for block allocation from fallocate space.
We don't reserve space for them.

e) Clear the delay bit in ext4_da_get_block_write instead of __block_write_full_page
so that we clear the delay bit for every successful block allocation. We may fail
while marking the inode dirty in ext4_da_get_block_write after allocating a block. So
it is better to clear the delay bit in ext4_da_get_block_write rather than in
__block_write_full_page

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 8 ++++++++
fs/ext4/ext4_i.h | 2 ++
fs/ext4/inode.c | 46 ++++++++++++++++++++++++++++++++--------------
fs/ext4/mballoc.c | 7 ++++++-
fs/ext4/super.c | 2 ++
fs/mpage.c | 36 ++++++++++++++++++++++++++----------
6 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 428e55f..9ccec61 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1973,6 +1973,14 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
ar.flags = 0;
ret = ext4_mb_new_blocks(handle, &ar, errp);
*count = ar.len;
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp) && (flags & EXT4_META_BLOCK)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ }
return ret;
}

diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index fea6a5d..ef7409f 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -167,7 +167,9 @@ struct ext4_inode_info {
/* allocation reservation info for delalloc */
unsigned long i_reserved_data_blocks;
unsigned long i_reserved_meta_blocks;
+ unsigned long i_allocated_meta_blocks;
unsigned short i_delalloc_reserved_flag;
+ spinlock_t i_block_reservation_lock;
};

#endif /* _EXT4_I */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c9cb360..5d1c830 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1426,11 +1426,12 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned long md_needed, mdblocks, total = 0;

- /*
- * calculate the amount of metadata blocks to reserve
- * in order to allocate nrblocks
- * worse case is one extent per block
- */
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
mdblocks = ext4_ext_calc_metadata_amount(inode, total);
BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
@@ -1438,42 +1439,51 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
total = md_needed + nrblocks;

- if (ext4_has_free_blocks(sbi, total) < total)
+ if (ext4_has_free_blocks(sbi, total) < total) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return -ENOSPC;
+ }

/* reduce fs free blocks counter */
percpu_counter_sub(&sbi->s_freeblocks_counter, total);

EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
- EXT4_I(inode)->i_reserved_meta_blocks += md_needed;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;

+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return 0; /* success */
}

void ext4_da_release_space(struct inode *inode, int used, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, release;
+ int total, mdb, mdb_free, release;

- /* calculate the number of metablocks still need to be reserved */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ /* recalculate the number of metablocks still need to be reserved */
total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
mdb = ext4_ext_calc_metadata_amount(inode, total);

/* figure out how many metablocks to release */
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

- release = to_free + mdb;
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+ release = to_free + mdb_free;

/* update fs free blocks counter for truncate case */
percpu_counter_add(&sbi->s_freeblocks_counter, release);

/* update per-inode reservations */
BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= used + to_free;
+ EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);

BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- EXT4_I(inode)->i_reserved_meta_blocks -= mdb;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

static void ext4_da_page_release_reservation(struct page *page,
@@ -1555,7 +1565,15 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
bh_result->b_size = (ret << inode->i_blkbits);

/* release reserved-but-unused meta blocks */
- ext4_da_release_space(inode, ret, 0);
+ if (buffer_delay(bh_result)) {
+ ext4_da_release_space(inode, ret, 0);
+ /*
+ * clear the delay bit now that we allocated
+ * blocks. If it is not a single block request
+ * we clear the delay bit in mpage_put_bnr_to_bhs
+ */
+ clear_buffer_delay(bh_result);
+ }

/*
* Update on-disk size along with block allocation
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 334e585..ec44d52 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4049,7 +4049,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
&(ar->len), errp);
return block;
}
- ar->len = ext4_has_free_blocks(sbi, ar->len);
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ ar->len = ext4_has_free_blocks(sbi, ar->len);
+ }

if (ar->len == 0) {
*errp = -ENOSPC;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a990475..a33a0cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -575,7 +575,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_prealloc_lock);
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
return &ei->vfs_inode;
}

diff --git a/fs/mpage.c b/fs/mpage.c
index c4376ec..b0db6bf 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -908,25 +908,41 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
new.b_blocknr = 0;
new.b_size = remain;
err = mpd->get_block(mpd->inode, next, &new, 1);
- if (err) {
+ /*
+ * we may have successfully allocated block. But
+ * failed to mark inode dirty. If we have allocated
+ * blocks update the buffer_head mappings
+ */
+ if (buffer_new(&new)) {
/*
- * Rather than implement own error handling
- * here, we just leave remaining blocks
- * unallocated and try again with ->writepage()
+ * buffer_head is only makred new if we have
+ * a successfull block allocation
*/
- break;
- }
- BUG_ON(new.b_size == 0);
-
- if (buffer_new(&new))
__unmap_underlying_blocks(mpd->inode, &new);
+ }

/*
* If blocks are delayed marked, we need to
* put actual blocknr and drop delayed bit
*/
- if (buffer_delay(lbh))
+ if (buffer_delay(lbh) && !buffer_delay(&new)) {
+ /*
+ * get_block if successfully allocated
+ * block will clear the delay bit of
+ * new buffer_head
+ */
mpage_put_bnr_to_bhs(mpd, next, &new);
+ } else if (err) {
+ /*
+ * Rather than implement own error handling
+ * here, we just leave remaining blocks
+ * unallocated and try again with ->writepage()
+ * we do this only if actually failed to allocate
+ * blocks.
+ */
+ break;
+ }
+ BUG_ON(new.b_size == 0);

/* go for the remaining blocks */
next += new.b_size >> mpd->inode->i_blkbits;
--
1.5.5.1.357.g1af8b.dirty


2008-06-07 01:12:57

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH] ext4: cleanup blockallocator

On Fri, 2008-06-06 at 23:54 +0530, Aneesh Kumar K.V wrote:
> Move the code for block allocation to a single function and add helpers
> for the allocation of data and meta data blocks
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 74 ++++++++++++++++++++--------------------------------
> fs/ext4/ext4.h | 2 +-
> fs/ext4/mballoc.c | 2 +-
> 3 files changed, 31 insertions(+), 47 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index b961ad1..10c2d49 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1645,7 +1645,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
> }
>
> /**
> - * ext4_new_blocks_old() -- core block(s) allocation function
> + * ext4_orlov_new_blocks() -- core block(s) allocation function

How about calling it ext4_old_new_blocks, and expanding the note to
"core block bitmap based block allocation function"?

I added a few comments to each block allocator, and cleared up a few
other points of confusion about allocation — how about this?

---

fs/ext4/balloc.c | 74 +++++++++++++++++++++--------------------------------
fs/ext4/ext4.h | 2 +
fs/ext4/mballoc.c | 2 +
3 files changed, 31 insertions(+), 47 deletions(-)


Index: linux-2.6.26-rc5/fs/ext4/balloc.c
===================================================================
--- linux-2.6.26-rc5.orig/fs/ext4/balloc.c 2008-06-06 17:06:03.000000000 -0700
+++ linux-2.6.26-rc5/fs/ext4/balloc.c 2008-06-06 17:59:05.000000000 -0700
@@ -1645,20 +1645,24 @@ int ext4_should_retry_alloc(struct super
}

/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group contains the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1928,78 +1932,95 @@ out:
return 0;
}

-ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
-{
- struct ext4_allocation_request ar;
- ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
- }
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = 1;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- return ret;
-}
-ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
- struct ext4_allocation_request ar;
- ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
- }
+#define EXT4_META_BLOCK 0x1

- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
- return ret;
-}
-
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, ext4_fsblk_t goal,
- unsigned long *count, int *errp)
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;

if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
}

memset(&ar, 0, sizeof(ar));
/* Fill with neighbour allocated blocks */
- ar.lleft = 0;
- ar.pleft = 0;
- ar.lright = 0;
- ar.pright = 0;

ar.inode = inode;
ar.goal = goal;
ar.len = *count;
ar.logical = iblock;
- if (S_ISREG(inode->i_mode))
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
ar.flags = EXT4_MB_HINT_DATA;
else
/* disable in-core preallocation for non-regular files */
ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
*count = ar.len;
return ret;
}

+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) block
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return do_blk_alloc(handle, inode, 0, goal,
+ &count, errp, EXT4_META_BLOCK);
+}
+
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block number on success; *count stores the total
+ * number of blocks allocated; errors are stored in the errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block number on success; *count stores the total
+ * number of blocks allocated; errors are stored in the errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}

/**
* ext4_count_free_blocks() -- count filesystem free blocks
Index: linux-2.6.26-rc5/fs/ext4/ext4.h
===================================================================
--- linux-2.6.26-rc5.orig/fs/ext4/ext4.h 2008-06-06 17:06:03.000000000 -0700
+++ linux-2.6.26-rc5/fs/ext4/ext4.h 2008-06-06 17:49:58.000000000 -0700
@@ -977,7 +977,7 @@ extern ext4_fsblk_t ext4_new_meta_blocks
extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, ext4_fsblk_t goal,
unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
Index: linux-2.6.26-rc5/fs/ext4/extents.c
===================================================================
--- linux-2.6.26-rc5.orig/fs/ext4/extents.c 2008-06-06 17:39:45.000000000 -0700
+++ linux-2.6.26-rc5/fs/ext4/extents.c 2008-06-06 17:54:03.000000000 -0700
@@ -180,8 +180,11 @@ static ext4_fsblk_t ext4_ext_find_goal(s
return bg_start + colour + block;
}

+/*
+ * Allocation for a meta data block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err)
{
@@ -688,7 +691,8 @@ static int ext4_ext_split(handle_t *hand
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -884,7 +888,7 @@ static int ext4_ext_grow_indepth(handle_
ext4_fsblk_t newblock;
int err = 0;

- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
if (newblock == 0)
return err;

Index: linux-2.6.26-rc5/fs/ext4/inode.c
===================================================================
--- linux-2.6.26-rc5.orig/fs/ext4/inode.c 2008-06-06 17:56:09.000000000 -0700
+++ linux-2.6.26-rc5/fs/ext4/inode.c 2008-06-06 17:56:41.000000000 -0700
@@ -561,7 +561,7 @@ static int ext4_alloc_blocks(handle_t *h
goto allocated;
/* Now allocate data blocks */
count = target;
- /* allocating blocks for indirect blocks and direct blocks */
+ /* allocating blocks for data blocks */
current_block = ext4_new_blocks(handle, inode, iblock,
goal, &count, err);
if (*err && (target == blks)) {