2008-06-12 15:25:31

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH] ext4: Add ordered mode support for delalloc

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
fs/jbd2/commit.c | 41 ++++++++++++--
2 files changed, 198 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 63355ab..7d87641 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
return !buffer_mapped(bh) || buffer_delay(bh);
}

-/* FIXME!! only support data=writeback mode */
/*
* get called vi ext4_da_writepages after taking page lock
* We may end up doing block allocation here in case
* mpage_da_map_blocks failed to allocate blocks.
*/
-static int ext4_da_writepage(struct page *page,
+static int ext4_da_writeback_writepage(struct page *page,
struct writeback_control *wbc)
{
int ret = 0;
@@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
return ret;
}

+/*
+ * get called vi ext4_da_writepages after taking page lock
+ * We may end up doing block allocation here in case
+ * mpage_da_map_blocks failed to allocate blocks.
+ *
+ * We also get called via journal_submit_inode_data_buffers
+ */
+static int ext4_da_ordered_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ handle_t *handle = NULL;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ handle = ext4_journal_current_handle();
+ if (!handle) {
+ /*
+ * This can happen when we aren't called via
+ * ext4_da_writepages() but directly (shrink_page_list).
+ * We cannot easily start a transaction here so we just skip
+ * writing the page in case we would have to do so.
+ */
+ size = i_size_read(inode);
+
+ page_bufs = page_buffers(page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ /*
+ * We can't do block allocation under
+ * page lock without a handle . So redirty
+ * the page and return.
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have mapping block we just ignore
+ * them
+ *
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
+
+ return ret;
+}

/*
* For now just follow the DIO way to estimate the max credits
@@ -1745,19 +1799,99 @@ static int ext4_da_writepages(struct address_space *mapping,
return ret;
}

+static int ext4_da_ordered_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ long to_write;
+ loff_t range_start = 0;
+
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages)
+ return 0;
+
+ /*
+ * Estimate the worse case needed credits to write out
+ * EXT4_MAX_BUF_BLOCKS pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the old writeback_index
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out_writepages;
+ }
+ /*
+ * set the max dirty pages could be write at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+ to_write -= wbc->nr_to_write;
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ if (range_start)
+ wbc->range_start = range_start;
+ return ret;
+}
+
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- int ret;
+ int ret, retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
+ struct inode *inode = mapping->host;

index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;

+retry:
page = __grab_cache_page(mapping, index);
if (!page)
return -ENOMEM;
@@ -1770,6 +1904,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
page_cache_release(page);
}

+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
return ret;
}

@@ -2224,10 +2361,10 @@ static int ext4_journalled_set_page_dirty(struct page *page)
.releasepage = ext4_releasepage,
};

-static const struct address_space_operations ext4_da_aops = {
+static const struct address_space_operations ext4_da_writeback_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_da_writepage,
+ .writepage = ext4_da_writeback_writepage,
.writepages = ext4_da_writepages,
.sync_page = block_sync_page,
.write_begin = ext4_da_write_begin,
@@ -2239,13 +2376,31 @@ static int ext4_journalled_set_page_dirty(struct page *page)
.migratepage = buffer_migrate_page,
};

+static const struct address_space_operations ext4_da_ordered_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_da_ordered_writepage,
+ .writepages = ext4_da_ordered_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_da_write_begin,
+ .write_end = generic_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_ordered_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
else if (ext4_should_writeback_data(inode) &&
test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
+ inode->i_mapping->a_ops = &ext4_da_writeback_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 483183d..32ca3c3 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>

/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}

/*
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
+ */
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
+{
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ if (!mapping_cap_writeback_dirty(mapping))
+ return 0;
+
+ ret = generic_writepages(mapping, &wbc);
+ return ret;
+}
+
+/*
* Submit all the data buffers of inode associated with the transaction to
* disk.
*
@@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
* our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
* operate on from being released while we write out pages.
*/
-static int journal_submit_inode_data_buffers(journal_t *journal,
+static int journal_submit_data_buffers(journal_t *journal,
transaction_t *commit_transaction)
{
struct jbd2_inode *jinode;
@@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal,
mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- err = filemap_fdatawrite_range(mapping, 0,
- i_size_read(jinode->i_vfs_inode));
+ /*
+ * submit the inode data buffers. We use writepage
+ * instead of writepages. Because writepages can do
+ * block allocation with delalloc. We need to write
+ * only allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
if (!ret)
ret = err;
spin_lock(&journal->j_list_lock);
@@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
struct jbd2_inode *jinode, *next_i;
int err, ret = 0;

- /* For locking, see the comment in journal_submit_inode_data_buffers() */
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
jinode->i_flags |= JI_COMMIT_RUNNING;
@@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = journal_submit_inode_data_buffers(journal, commit_transaction);
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);

--
1.5.6.rc2.15.g457bb.dirty



2008-06-13 20:53:52

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [RFC PATCH] ext4: Add ordered mode support for delalloc

On Thu, Jun 12, 2008 at 08:55:16PM +0530, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> fs/jbd2/commit.c | 41 ++++++++++++--
> 2 files changed, 198 insertions(+), 12 deletions(-)
>

I tested this with fsstress with fallocate, fsx-linux, fs_inode, ffsb,
and generic cp test of / to the new filesystem.

I will now do a crash test and will verify whether ordered behavior
is working correctly.


-aneesh


2008-06-13 23:01:08

by Mingming Cao

[permalink] [raw]
Subject: Re: [RFC PATCH] ext4: Add ordered mode support for delalloc

Thanks, some comments below...
On Thu, 2008-06-12 at 20:55 +0530, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> fs/jbd2/commit.c | 41 ++++++++++++--
> 2 files changed, 198 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 63355ab..7d87641 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> return !buffer_mapped(bh) || buffer_delay(bh);
> }
>
> -/* FIXME!! only support data=writeback mode */
> /*
> * get called vi ext4_da_writepages after taking page lock
> * We may end up doing block allocation here in case
> * mpage_da_map_blocks failed to allocate blocks.
> */
> -static int ext4_da_writepage(struct page *page,
> +static int ext4_da_writeback_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> int ret = 0;
> @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
> return ret;
> }
>
> +/*
> + * get called vi ext4_da_writepages after taking page lock
> + * We may end up doing block allocation here in case
> + * mpage_da_map_blocks failed to allocate blocks.
> + *
> + * We also get called via journal_submit_inode_data_buffers
> + */
> +static int ext4_da_ordered_writepage(struct page *page,
> + struct writeback_control *wbc)
> +{
> + int ret = 0;
> + loff_t size;
> + unsigned long len;
> + handle_t *handle = NULL;
> + struct buffer_head *page_bufs;
> + struct inode *inode = page->mapping->host;
> +
> + handle = ext4_journal_current_handle();
> + if (!handle) {
> + /*
> + * This can happen when we aren't called via
> + * ext4_da_writepages() but directly (shrink_page_list).
> + * We cannot easily start a transaction here so we just skip
> + * writing the page in case we would have to do so.
> + */
> + size = i_size_read(inode);
> +
> + page_bufs = page_buffers(page);
> + if (page->index == size >> PAGE_CACHE_SHIFT)
> + len = size & ~PAGE_CACHE_MASK;
> + else
> + len = PAGE_CACHE_SIZE;
> +
> + if (walk_page_buffers(NULL, page_bufs, 0,
> + len, NULL, ext4_bh_unmapped_or_delay)) {
> + /*
> + * We can't do block allocation under
> + * page lock without a handle . So redirty
> + * the page and return.
> + * We may reach here when we do a journal commit
> + * via journal_submit_inode_data_buffers.
> + * If we don't have mapping block we just ignore
> + * them
> + *
> + */
> + redirty_page_for_writepage(wbc, page);
> + unlock_page(page);
> + return 0;
> + }
> + }
> +
It seems we missed to file the inode to the journal list before calling
block_write_full_page(), since it's possible block_write_full_page()
could do block allocation.

something like this?

+ if (ext4_should_order_data(inode))
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ return ret;
+ }

> + ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
> +


> + return ret;
> +}
>

It seems this code is duplicated from
ext4_da_writeback_writepage()(except for the file inode to keep the
ordering), is there any reason not to making it one function for both
ordered mode and writeback mode?
> /*
> * For now just follow the DIO way to estimate the max credits
> @@ -1745,19 +1799,99 @@ static int ext4_da_writepages(struct address_space *mapping,
> return ret;
> }
>
> +static int ext4_da_ordered_writepages(struct address_space *mapping,
> + struct writeback_control *wbc)
> +{
> + struct inode *inode = mapping->host;
> + handle_t *handle = NULL;
> + int needed_blocks;
> + int ret = 0;
> + long to_write;
> + loff_t range_start = 0;
> +
> +
> + /*
> + * No pages to write? This is mainly a kludge to avoid starting
> + * a transaction for special inodes like journal inode on last iput()
> + * because that could violate lock ordering on umount
> + */
> + if (!mapping->nrpages)
> + return 0;
> +
> + /*
> + * Estimate the worse case needed credits to write out
> + * EXT4_MAX_BUF_BLOCKS pages
> + */
> + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
> +
> + to_write = wbc->nr_to_write;
> + if (!wbc->range_cyclic) {
> + /*
> + * If range_cyclic is not set force range_cont
> + * and save the old writeback_index
> + */
> + wbc->range_cont = 1;
> + range_start = wbc->range_start;
> + }
> +
> + while (!ret && to_write) {
> + /* start a new transaction*/
> + handle = ext4_journal_start(inode, needed_blocks);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out_writepages;
> + }
> +
> + ret = ext4_jbd2_file_inode(handle, inode);
> + if (ret) {
> + ext4_journal_stop(handle);
> + goto out_writepages;
> + }
> + /*
> + * set the max dirty pages could be write at a time
> + * to fit into the reserved transaction credits
> + */
> + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
> + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> +
> + to_write -= wbc->nr_to_write;
> + ret = mpage_da_writepages(mapping, wbc,
> + ext4_da_get_block_write);
> + ext4_journal_stop(handle);
> + if (wbc->nr_to_write) {
> + /*
> + * There is no more writeout needed
> + * or we requested for a noblocking writeout
> + * and we found the device congested
> + */
> + to_write += wbc->nr_to_write;
> + break;
> + }
> + wbc->nr_to_write = to_write;
> + }
> +
> +out_writepages:
> + wbc->nr_to_write = to_write;
> + if (range_start)
> + wbc->range_start = range_start;
> + return ret;
> +}
> +

It seems this code is duplicated from
ext4_da_writeback_writepages()also. The only part different is in
ordered mode, we need to file the inode to the journal list to keep the
ordering. I think we could use existing da_writepages() function for
both ordered mode and writeback mode as well.

> static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> loff_t pos, unsigned len, unsigned flags,
> struct page **pagep, void **fsdata)
> {
> - int ret;
> + int ret, retries = 0;
> struct page *page;
> pgoff_t index;
> unsigned from, to;
> + struct inode *inode = mapping->host;
>
> index = pos >> PAGE_CACHE_SHIFT;
> from = pos & (PAGE_CACHE_SIZE - 1);
> to = from + len;
>
> +retry:
> page = __grab_cache_page(mapping, index);
> if (!page)
> return -ENOMEM;
> @@ -1770,6 +1904,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> page_cache_release(page);
> }
>
> + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
> + goto retry;
> +
> return ret;
> }
>

In case of ENOSPC, instead of go back and try to do reservation (which
may overestimate the total number of metablocks to reserve) again, I
think we should not doing delayed allocation, instead call the real
get_block() function to try the real block allocation.

Just to clarify, this is not part of the ordered mode support, I think
we should make a separate patch for this kind of improvement.

> @@ -2224,10 +2361,10 @@ static int ext4_journalled_set_page_dirty(struct page *page)
> .releasepage = ext4_releasepage,
> };
>
> -static const struct address_space_operations ext4_da_aops = {
> +static const struct address_space_operations ext4_da_writeback_aops = {
> .readpage = ext4_readpage,
> .readpages = ext4_readpages,
> - .writepage = ext4_da_writepage,
> + .writepage = ext4_da_writeback_writepage,
> .writepages = ext4_da_writepages,
> .sync_page = block_sync_page,
> .write_begin = ext4_da_write_begin,
> @@ -2239,13 +2376,31 @@ static int ext4_journalled_set_page_dirty(struct page *page)
> .migratepage = buffer_migrate_page,
> };
>
> +static const struct address_space_operations ext4_da_ordered_aops = {
> + .readpage = ext4_readpage,
> + .readpages = ext4_readpages,
> + .writepage = ext4_da_ordered_writepage,
> + .writepages = ext4_da_ordered_writepages,
> + .sync_page = block_sync_page,
> + .write_begin = ext4_da_write_begin,
> + .write_end = generic_write_end,
> + .bmap = ext4_bmap,
> + .invalidatepage = ext4_da_invalidatepage,
> + .releasepage = ext4_releasepage,
> + .direct_IO = ext4_direct_IO,
> + .migratepage = buffer_migrate_page,
> +};
> +

With the new ordered mode, we could share the same address space
operations for delayed allocation over writeback and ordered mode.


> void ext4_set_aops(struct inode *inode)
> {
> - if (ext4_should_order_data(inode))
> + if (ext4_should_order_data(inode) &&
> + test_opt(inode->i_sb, DELALLOC))
> + inode->i_mapping->a_ops = &ext4_da_ordered_aops;
> + else if (ext4_should_order_data(inode))
> inode->i_mapping->a_ops = &ext4_ordered_aops;
> else if (ext4_should_writeback_data(inode) &&
> test_opt(inode->i_sb, DELALLOC))
> - inode->i_mapping->a_ops = &ext4_da_aops;
> + inode->i_mapping->a_ops = &ext4_da_writeback_aops;
> else if (ext4_should_writeback_data(inode))
> inode->i_mapping->a_ops = &ext4_writeback_aops;
> else
> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> index 483183d..32ca3c3 100644
> --- a/fs/jbd2/commit.c
> +++ b/fs/jbd2/commit.c
> @@ -22,6 +22,8 @@
> #include <linux/pagemap.h>
> #include <linux/jiffies.h>
> #include <linux/crc32.h>
> +#include <linux/writeback.h>
> +#include <linux/backing-dev.h>
>
> /*
> * Default IO end handler for temporary BJ_IO buffer_heads.
> @@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
> }
>
> /*
> + * write the filemap data using writepage() address_space_operations.
> + * We don't do block allocation here even for delalloc. We don't
> + * use writepages() because with dealyed allocation we may be doing
> + * block allocation in writepages().
> + */
> +static int journal_submit_inode_data_buffers(struct address_space *mapping)
> +{
> + int ret;
> + struct writeback_control wbc = {
> + .sync_mode = WB_SYNC_ALL,
> + .nr_to_write = mapping->nrpages * 2,
> + .range_start = 0,
> + .range_end = i_size_read(mapping->host),
> + .for_writepages = 1,
> + };
> +
> + if (!mapping_cap_writeback_dirty(mapping))
> + return 0;
> +
> + ret = generic_writepages(mapping, &wbc);
> + return ret;
> +}
> +
> +/*
> * Submit all the data buffers of inode associated with the transaction to
> * disk.
> *
> @@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
> * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
> * operate on from being released while we write out pages.
> */
> -static int journal_submit_inode_data_buffers(journal_t *journal,
> +static int journal_submit_data_buffers(journal_t *journal,
> transaction_t *commit_transaction)
> {
> struct jbd2_inode *jinode;
> @@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal,
> mapping = jinode->i_vfs_inode->i_mapping;
> jinode->i_flags |= JI_COMMIT_RUNNING;
> spin_unlock(&journal->j_list_lock);
> - err = filemap_fdatawrite_range(mapping, 0,
> - i_size_read(jinode->i_vfs_inode));
> + /*
> + * submit the inode data buffers. We use writepage
> + * instead of writepages. Because writepages can do
> + * block allocation with delalloc. We need to write
> + * only allocated blocks here.
> + */

Hmm, when writepage()->ext4_da_orderd_writepage() is called from here,
the handle is expecting to be NULL? Otherwise block_write_full_page()
could do block allocation, that's against the locking ordering...:(

> + err = journal_submit_inode_data_buffers(mapping);
> if (!ret)
> ret = err;
> spin_lock(&journal->j_list_lock);
> @@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
> struct jbd2_inode *jinode, *next_i;
> int err, ret = 0;
>
> - /* For locking, see the comment in journal_submit_inode_data_buffers() */
> + /* For locking, see the comment in journal_submit_data_buffers() */
> spin_lock(&journal->j_list_lock);
> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> jinode->i_flags |= JI_COMMIT_RUNNING;
> @@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
> * Now start flushing things to disk, in the order they appear
> * on the transaction lists. Data blocks go first.
> */
> - err = journal_submit_inode_data_buffers(journal, commit_transaction);
> + err = journal_submit_data_buffers(journal, commit_transaction);
> if (err)
> jbd2_journal_abort(journal, err);
>

Regards,
Mingming


2008-06-14 06:34:54

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [RFC PATCH] ext4: Add ordered mode support for delalloc

On Fri, Jun 13, 2008 at 04:01:22PM -0700, Mingming wrote:
> Thanks, some comments below...
> On Thu, 2008-06-12 at 20:55 +0530, Aneesh Kumar K.V wrote:
> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > ---
> > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> > fs/jbd2/commit.c | 41 ++++++++++++--
> > 2 files changed, 198 insertions(+), 12 deletions(-)
> >
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 63355ab..7d87641 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> > return !buffer_mapped(bh) || buffer_delay(bh);
> > }
> >
> > -/* FIXME!! only support data=writeback mode */
> > /*
> > * get called vi ext4_da_writepages after taking page lock
> > * We may end up doing block allocation here in case
> > * mpage_da_map_blocks failed to allocate blocks.
> > */
> > -static int ext4_da_writepage(struct page *page,
> > +static int ext4_da_writeback_writepage(struct page *page,
> > struct writeback_control *wbc)
> > {
> > int ret = 0;
> > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
> > return ret;
> > }
> >
> > +/*
> > + * get called vi ext4_da_writepages after taking page lock
> > + * We may end up doing block allocation here in case
> > + * mpage_da_map_blocks failed to allocate blocks.
> > + *
> > + * We also get called via journal_submit_inode_data_buffers
> > + */
> > +static int ext4_da_ordered_writepage(struct page *page,
> > + struct writeback_control *wbc)
> > +{
> > + int ret = 0;
> > + loff_t size;
> > + unsigned long len;
> > + handle_t *handle = NULL;
> > + struct buffer_head *page_bufs;
> > + struct inode *inode = page->mapping->host;
> > +
> > + handle = ext4_journal_current_handle();
> > + if (!handle) {
> > + /*
> > + * This can happen when we aren't called via
> > + * ext4_da_writepages() but directly (shrink_page_list).
> > + * We cannot easily start a transaction here so we just skip
> > + * writing the page in case we would have to do so.
> > + */
> > + size = i_size_read(inode);
> > +
> > + page_bufs = page_buffers(page);
> > + if (page->index == size >> PAGE_CACHE_SHIFT)
> > + len = size & ~PAGE_CACHE_MASK;
> > + else
> > + len = PAGE_CACHE_SIZE;
> > +
> > + if (walk_page_buffers(NULL, page_bufs, 0,
> > + len, NULL, ext4_bh_unmapped_or_delay)) {
> > + /*
> > + * We can't do block allocation under
> > + * page lock without a handle . So redirty
> > + * the page and return.
> > + * We may reach here when we do a journal commit
> > + * via journal_submit_inode_data_buffers.
> > + * If we don't have mapping block we just ignore
> > + * them
> > + *
> > + */
> > + redirty_page_for_writepage(wbc, page);
> > + unlock_page(page);
> > + return 0;
> > + }
> > + }
> > +
> It seems we missed to file the inode to the journal list before calling
> block_write_full_page(), since it's possible block_write_full_page()
> could do block allocation.
>
> something like this?
>
> + if (ext4_should_order_data(inode))
> + ret = ext4_jbd2_file_inode(handle, inode);
> + if (ret) {
> + ext4_journal_stop(handle);
> + return ret;
> + }
>


No we can't do that. The writepage get called back in the following
case.

a) via background_writeout from pdflush
b) via do_sync via sync call
c) via shrink_page_list when under memory pressure from kswapd.


In both (a) and (b) we get called via writepages. That means in
both (a) and (b) we can do block allocation. In case of (c) we get
called directly and we can't do block allocation because we are already
called with page_lock and we can't start a journal transaction.

That means we do block allocation only via writepages. So in
ext4_da_ordered_writepages() we do

1844 ret = ext4_jbd2_file_inode(handle, inode);

for each journal start.

Now if you see I have also added journal_submit_inode_data_buffers
that will be called during journal commit. The function use writepage
instead of writepages. The idea here is to ensure that we submit only
the pages that have all the buffer_head mapped. Because we don't want to
do block allocation during a journal commit. It also have the advantage
that we don't wait for writing out all the buffer_head during a sync
from user space. Because not all buffer_head would be mapped at that
time. This should actually improve the case "sync take lot of time with
ordered mode on ext4"


To list out:

a) We do block allocation only when called via ext4_da_*_writepages. This
is true even for writeback mode.

c) If we call writepage and if we find any buffer_head in the page
unmapped or delayed we don't write the page. Instead we redirty the page
and return.

d) journal commit is updated to use writepage instead of writepages so
that we don't do block allocation during commit. This will result in
writeout of only those pages that have fully mapped buffer_heads.


e) Since we do block allocation only via ext4_da_*_writepages, we need
to add the inode to journal list only during ext4_da_*_writepages.
We do that for each journal_start in the loop.


> > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
> > +
>
>
> > + return ret;
> > +}
> >
>
> It seems this code is duplicated from
> ext4_da_writeback_writepage()(except for the file inode to keep the
> ordering), is there any reason not to making it one function for both
> ordered mode and writeback mode?


Nothing particular I will merge both to one function in the next post.


> > /*
> > * For now just follow the DIO way to estimate the max credits
> > @@ -1745,19 +1799,99 @@ static int ext4_da_writepages(struct address_space *mapping,
> > return ret;
> > }
> >
> > +static int ext4_da_ordered_writepages(struct address_space *mapping,
> > + struct writeback_control *wbc)
> > +{
> > + struct inode *inode = mapping->host;
> > + handle_t *handle = NULL;
> > + int needed_blocks;
> > + int ret = 0;
> > + long to_write;
> > + loff_t range_start = 0;
> > +
> > +
> > + /*
> > + * No pages to write? This is mainly a kludge to avoid starting
> > + * a transaction for special inodes like journal inode on last iput()
> > + * because that could violate lock ordering on umount
> > + */
> > + if (!mapping->nrpages)
> > + return 0;
> > +
> > + /*
> > + * Estimate the worse case needed credits to write out
> > + * EXT4_MAX_BUF_BLOCKS pages
> > + */
> > + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
> > +
> > + to_write = wbc->nr_to_write;
> > + if (!wbc->range_cyclic) {
> > + /*
> > + * If range_cyclic is not set force range_cont
> > + * and save the old writeback_index
> > + */
> > + wbc->range_cont = 1;
> > + range_start = wbc->range_start;
> > + }
> > +
> > + while (!ret && to_write) {
> > + /* start a new transaction*/
> > + handle = ext4_journal_start(inode, needed_blocks);
> > + if (IS_ERR(handle)) {
> > + ret = PTR_ERR(handle);
> > + goto out_writepages;
> > + }
> > +
> > + ret = ext4_jbd2_file_inode(handle, inode);
> > + if (ret) {
> > + ext4_journal_stop(handle);
> > + goto out_writepages;
> > + }
> > + /*
> > + * set the max dirty pages could be write at a time
> > + * to fit into the reserved transaction credits
> > + */
> > + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
> > + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> > +
> > + to_write -= wbc->nr_to_write;
> > + ret = mpage_da_writepages(mapping, wbc,
> > + ext4_da_get_block_write);
> > + ext4_journal_stop(handle);
> > + if (wbc->nr_to_write) {
> > + /*
> > + * There is no more writeout needed
> > + * or we requested for a noblocking writeout
> > + * and we found the device congested
> > + */
> > + to_write += wbc->nr_to_write;
> > + break;
> > + }
> > + wbc->nr_to_write = to_write;
> > + }
> > +
> > +out_writepages:
> > + wbc->nr_to_write = to_write;
> > + if (range_start)
> > + wbc->range_start = range_start;
> > + return ret;
> > +}
> > +
>
> It seems this code is duplicated from
> ext4_da_writeback_writepages()also. The only part different is in
> ordered mode, we need to file the inode to the journal list to keep the
> ordering. I think we could use existing da_writepages() function for
> both ordered mode and writeback mode as well.


Will do that in the next post.

>
> > static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> > loff_t pos, unsigned len, unsigned flags,
> > struct page **pagep, void **fsdata)
> > {
> > - int ret;
> > + int ret, retries = 0;
> > struct page *page;
> > pgoff_t index;
> > unsigned from, to;
> > + struct inode *inode = mapping->host;
> >
> > index = pos >> PAGE_CACHE_SHIFT;
> > from = pos & (PAGE_CACHE_SIZE - 1);
> > to = from + len;
> >
> > +retry:
> > page = __grab_cache_page(mapping, index);
> > if (!page)
> > return -ENOMEM;
> > @@ -1770,6 +1904,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> > page_cache_release(page);
> > }
> >
> > + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
> > + goto retry;
> > +
> > return ret;
> > }
> >
>
> In case of ENOSPC, instead of go back and try to do reservation (which
> may overestimate the total number of metablocks to reserve) again, I
> think we should not doing delayed allocation, instead call the real
> get_block() function to try the real block allocation.
>
> Just to clarify, this is not part of the ordered mode support, I think
> we should make a separate patch for this kind of improvement.


I already have a patch queued for the same. I have 16 small patches with
various cleanups sitting in my local repo. I should be sending out the
same in the next update.


>
> > @@ -2224,10 +2361,10 @@ static int ext4_journalled_set_page_dirty(struct page *page)
> > .releasepage = ext4_releasepage,
> > };
> >
> > -static const struct address_space_operations ext4_da_aops = {
> > +static const struct address_space_operations ext4_da_writeback_aops = {
> > .readpage = ext4_readpage,
> > .readpages = ext4_readpages,
> > - .writepage = ext4_da_writepage,
> > + .writepage = ext4_da_writeback_writepage,
> > .writepages = ext4_da_writepages,
> > .sync_page = block_sync_page,
> > .write_begin = ext4_da_write_begin,
> > @@ -2239,13 +2376,31 @@ static int ext4_journalled_set_page_dirty(struct page *page)
> > .migratepage = buffer_migrate_page,
> > };
> >
> > +static const struct address_space_operations ext4_da_ordered_aops = {
> > + .readpage = ext4_readpage,
> > + .readpages = ext4_readpages,
> > + .writepage = ext4_da_ordered_writepage,
> > + .writepages = ext4_da_ordered_writepages,
> > + .sync_page = block_sync_page,
> > + .write_begin = ext4_da_write_begin,
> > + .write_end = generic_write_end,
> > + .bmap = ext4_bmap,
> > + .invalidatepage = ext4_da_invalidatepage,
> > + .releasepage = ext4_releasepage,
> > + .direct_IO = ext4_direct_IO,
> > + .migratepage = buffer_migrate_page,
> > +};
> > +
>
> With the new ordered mode, we could share the same address space
> operations for delayed allocation over writeback and ordered mode.
>
>
> > void ext4_set_aops(struct inode *inode)
> > {
> > - if (ext4_should_order_data(inode))
> > + if (ext4_should_order_data(inode) &&
> > + test_opt(inode->i_sb, DELALLOC))
> > + inode->i_mapping->a_ops = &ext4_da_ordered_aops;
> > + else if (ext4_should_order_data(inode))
> > inode->i_mapping->a_ops = &ext4_ordered_aops;
> > else if (ext4_should_writeback_data(inode) &&
> > test_opt(inode->i_sb, DELALLOC))
> > - inode->i_mapping->a_ops = &ext4_da_aops;
> > + inode->i_mapping->a_ops = &ext4_da_writeback_aops;
> > else if (ext4_should_writeback_data(inode))
> > inode->i_mapping->a_ops = &ext4_writeback_aops;
> > else
> > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> > index 483183d..32ca3c3 100644
> > --- a/fs/jbd2/commit.c
> > +++ b/fs/jbd2/commit.c
> > @@ -22,6 +22,8 @@
> > #include <linux/pagemap.h>
> > #include <linux/jiffies.h>
> > #include <linux/crc32.h>
> > +#include <linux/writeback.h>
> > +#include <linux/backing-dev.h>
> >
> > /*
> > * Default IO end handler for temporary BJ_IO buffer_heads.
> > @@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
> > }
> >
> > /*
> > + * write the filemap data using writepage() address_space_operations.
> > + * We don't do block allocation here even for delalloc. We don't
> > + * use writepages() because with dealyed allocation we may be doing
> > + * block allocation in writepages().
> > + */
> > +static int journal_submit_inode_data_buffers(struct address_space *mapping)
> > +{
> > + int ret;
> > + struct writeback_control wbc = {
> > + .sync_mode = WB_SYNC_ALL,
> > + .nr_to_write = mapping->nrpages * 2,
> > + .range_start = 0,
> > + .range_end = i_size_read(mapping->host),
> > + .for_writepages = 1,
> > + };
> > +
> > + if (!mapping_cap_writeback_dirty(mapping))
> > + return 0;
> > +
> > + ret = generic_writepages(mapping, &wbc);
> > + return ret;
> > +}
> > +
> > +/*
> > * Submit all the data buffers of inode associated with the transaction to
> > * disk.
> > *
> > @@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
> > * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
> > * operate on from being released while we write out pages.
> > */
> > -static int journal_submit_inode_data_buffers(journal_t *journal,
> > +static int journal_submit_data_buffers(journal_t *journal,
> > transaction_t *commit_transaction)
> > {
> > struct jbd2_inode *jinode;
> > @@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal,
> > mapping = jinode->i_vfs_inode->i_mapping;
> > jinode->i_flags |= JI_COMMIT_RUNNING;
> > spin_unlock(&journal->j_list_lock);
> > - err = filemap_fdatawrite_range(mapping, 0,
> > - i_size_read(jinode->i_vfs_inode));
> > + /*
> > + * submit the inode data buffers. We use writepage
> > + * instead of writepages. Because writepages can do
> > + * block allocation with delalloc. We need to write
> > + * only allocated blocks here.
> > + */
>
> Hmm, when writepage()->ext4_da_orderd_writepage() is called from here,
> the handle is expecting to be NULL? Otherwise block_write_full_page()
> could do block allocation, that's against the locking ordering...:(


I didn't quiet get that. We don't want to do block allocation when
called via journal_submit_data_buffers. In writepage if handle is null
and if we have some buffer_head as delayed or not mapped we don't write
the page.




>
> > + err = journal_submit_inode_data_buffers(mapping);
> > if (!ret)
> > ret = err;
> > spin_lock(&journal->j_list_lock);
> > @@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
> > struct jbd2_inode *jinode, *next_i;
> > int err, ret = 0;
> >
> > - /* For locking, see the comment in journal_submit_inode_data_buffers() */
> > + /* For locking, see the comment in journal_submit_data_buffers() */
> > spin_lock(&journal->j_list_lock);
> > list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> > jinode->i_flags |= JI_COMMIT_RUNNING;
> > @@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
> > * Now start flushing things to disk, in the order they appear
> > * on the transaction lists. Data blocks go first.
> > */
> > - err = journal_submit_inode_data_buffers(journal, commit_transaction);
> > + err = journal_submit_data_buffers(journal, commit_transaction);
> > if (err)
> > jbd2_journal_abort(journal, err);
> >


-aneesh

2008-06-16 15:05:34

by Jan Kara

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

Hi Aneesh,

First, I'd like to see some short comment on what semantics
delalloc,data=ordered is going to have. At least I can imagine at least
two sensible approaches:
1) All we guarantee is that user is not going to see uninitialized data.
We send writes to disk (and allocate blocks) whenever it fits our needs
(usually when pdflush finds them).
2) We guarantee that when transaction commits, your data is on disk -
i.e., we allocate actual blocks on transaction commit.

Both these possibilities have their pros and cons. Most importantly,
1) gives better disk layout while 2) gives higher consistency
guarantees. Note that with 1), it can under some circumstances happen,
that after a crash you see block 1 and 3 of your 3-block-write on disk,
while block 2 is still a hole. 1) is easy to implement (you mostly did
it below), 2) is harder. I think there should be broader consensus on
what the semantics should be (changed subject to catch more attention
;).

A few comments to your patch are also below.

Honza

> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> fs/jbd2/commit.c | 41 ++++++++++++--
> 2 files changed, 198 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 63355ab..7d87641 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> return !buffer_mapped(bh) || buffer_delay(bh);
> }
>
> -/* FIXME!! only support data=writeback mode */
> /*
> * get called vi ext4_da_writepages after taking page lock
> * We may end up doing block allocation here in case
> * mpage_da_map_blocks failed to allocate blocks.
> */
> -static int ext4_da_writepage(struct page *page,
> +static int ext4_da_writeback_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> int ret = 0;
> @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
> return ret;
> }
>
> +/*
> + * get called vi ext4_da_writepages after taking page lock
> + * We may end up doing block allocation here in case
> + * mpage_da_map_blocks failed to allocate blocks.
> + *
> + * We also get called via journal_submit_inode_data_buffers
> + */
> +static int ext4_da_ordered_writepage(struct page *page,
> + struct writeback_control *wbc)
> +{
> + int ret = 0;
> + loff_t size;
> + unsigned long len;
> + handle_t *handle = NULL;
> + struct buffer_head *page_bufs;
> + struct inode *inode = page->mapping->host;
> +
> + handle = ext4_journal_current_handle();
> + if (!handle) {
> + /*
> + * This can happen when we aren't called via
> + * ext4_da_writepages() but directly (shrink_page_list).
> + * We cannot easily start a transaction here so we just skip
> + * writing the page in case we would have to do so.
> + */
> + size = i_size_read(inode);
> +
> + page_bufs = page_buffers(page);
> + if (page->index == size >> PAGE_CACHE_SHIFT)
> + len = size & ~PAGE_CACHE_MASK;
> + else
> + len = PAGE_CACHE_SIZE;
> +
> + if (walk_page_buffers(NULL, page_bufs, 0,
> + len, NULL, ext4_bh_unmapped_or_delay)) {
> + /*
> + * We can't do block allocation under
> + * page lock without a handle . So redirty
> + * the page and return.
> + * We may reach here when we do a journal commit
> + * via journal_submit_inode_data_buffers.
> + * If we don't have mapping block we just ignore
> + * them
> + *
> + */
> + redirty_page_for_writepage(wbc, page);
> + unlock_page(page);
> + return 0;
> + }
> + }
> +
> + ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
> +
> + return ret;
> +}
If you're going to use this writepage() implementation from commit
code, you cannot simply do redirty_page_for_writepage() and bail out
when there's an unmapped buffer. You must write out at least mapped
buffers to satisfy ordering guarantees (think of filesystems with
blocksize < page size).

<snip>

Please separate changes to JBD2 into a separate patch - they should be
probably merged into ordered-mode rewrite patch later on...

> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> index 483183d..32ca3c3 100644
> --- a/fs/jbd2/commit.c
> +++ b/fs/jbd2/commit.c
> @@ -22,6 +22,8 @@
> #include <linux/pagemap.h>
> #include <linux/jiffies.h>
> #include <linux/crc32.h>
> +#include <linux/writeback.h>
> +#include <linux/backing-dev.h>
>
> /*
> * Default IO end handler for temporary BJ_IO buffer_heads.
> @@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
> }
>
> /*
> + * write the filemap data using writepage() address_space_operations.
> + * We don't do block allocation here even for delalloc. We don't
> + * use writepages() because with dealyed allocation we may be doing
> + * block allocation in writepages().
> + */
> +static int journal_submit_inode_data_buffers(struct address_space *mapping)
> +{
> + int ret;
> + struct writeback_control wbc = {
> + .sync_mode = WB_SYNC_ALL,
> + .nr_to_write = mapping->nrpages * 2,
> + .range_start = 0,
> + .range_end = i_size_read(mapping->host),
> + .for_writepages = 1,
> + };
> +
> + if (!mapping_cap_writeback_dirty(mapping))
> + return 0;
> +
> + ret = generic_writepages(mapping, &wbc);
> + return ret;
> +}
> +
> +/*
> * Submit all the data buffers of inode associated with the transaction to
> * disk.
> *
> @@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
> * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
> * operate on from being released while we write out pages.
> */
> -static int journal_submit_inode_data_buffers(journal_t *journal,
> +static int journal_submit_data_buffers(journal_t *journal,
> transaction_t *commit_transaction)
journal_submit_data_buffers() isn't a lucky name. This is how the
old function was called and so it would clash with it in the patch
series... I'd be for keeping this name and call the above
journal_submit_mapping_buffers() or just fold the above function into
journal_submit_inode_data_buffers().

> {
> struct jbd2_inode *jinode;
> @@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal,
> mapping = jinode->i_vfs_inode->i_mapping;
> jinode->i_flags |= JI_COMMIT_RUNNING;
> spin_unlock(&journal->j_list_lock);
> - err = filemap_fdatawrite_range(mapping, 0,
> - i_size_read(jinode->i_vfs_inode));
> + /*
> + * submit the inode data buffers. We use writepage
> + * instead of writepages. Because writepages can do
> + * block allocation with delalloc. We need to write
> + * only allocated blocks here.
> + */
> + err = journal_submit_inode_data_buffers(mapping);
> if (!ret)
> ret = err;
> spin_lock(&journal->j_list_lock);
> @@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
> struct jbd2_inode *jinode, *next_i;
> int err, ret = 0;
>
> - /* For locking, see the comment in journal_submit_inode_data_buffers() */
> + /* For locking, see the comment in journal_submit_data_buffers() */
> spin_lock(&journal->j_list_lock);
> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> jinode->i_flags |= JI_COMMIT_RUNNING;
> @@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
> * Now start flushing things to disk, in the order they appear
> * on the transaction lists. Data blocks go first.
> */
> - err = journal_submit_inode_data_buffers(journal, commit_transaction);
> + err = journal_submit_data_buffers(journal, commit_transaction);
> if (err)
> jbd2_journal_abort(journal, err);
--
Jan Kara <[email protected]>
SuSE CR Labs

2008-06-16 16:03:27

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

On Mon, Jun 16, 2008 at 05:05:33PM +0200, Jan Kara wrote:
> Hi Aneesh,
>
> First, I'd like to see some short comment on what semantics
> delalloc,data=ordered is going to have. At least I can imagine at least
> two sensible approaches:
> 1) All we guarantee is that user is not going to see uninitialized data.
> We send writes to disk (and allocate blocks) whenever it fits our needs
> (usually when pdflush finds them).
> 2) We guarantee that when transaction commits, your data is on disk -
> i.e., we allocate actual blocks on transaction commit.
>
> Both these possibilities have their pros and cons. Most importantly,
> 1) gives better disk layout while 2) gives higher consistency
> guarantees. Note that with 1), it can under some circumstances happen,
> that after a crash you see block 1 and 3 of your 3-block-write on disk,
> while block 2 is still a hole. 1) is easy to implement (you mostly did
> it below), 2) is harder. I think there should be broader consensus on
> what the semantics should be (changed subject to catch more attention
> ;).
>
> A few comments to your patch are also below.
>
> Honza

The way I was looking at ordered mode was, we only guarantee that the
meta-data blocks corresponding to the data block allocated get committed
only after the data-blocks are written to the disk. As long as we don't
allocate blocks corresponding to a page we don't write the page to
disk. This should also speed up the "sync slowness" that lot of people
are reporting with ordered mode. Can you explain
"
1), it can under some circumstances happen, that after a crash you see
block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole.
"


>
> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > ---
> > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> > fs/jbd2/commit.c | 41 ++++++++++++--
> > 2 files changed, 198 insertions(+), 12 deletions(-)
> >
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 63355ab..7d87641 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> > return !buffer_mapped(bh) || buffer_delay(bh);
> > }
> >
> > -/* FIXME!! only support data=writeback mode */
> > /*
> > * get called vi ext4_da_writepages after taking page lock
> > * We may end up doing block allocation here in case
> > * mpage_da_map_blocks failed to allocate blocks.
> > */
> > -static int ext4_da_writepage(struct page *page,
> > +static int ext4_da_writeback_writepage(struct page *page,
> > struct writeback_control *wbc)
> > {
> > int ret = 0;
> > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
> > return ret;
> > }
> >
> > +/*
> > + * get called vi ext4_da_writepages after taking page lock
> > + * We may end up doing block allocation here in case
> > + * mpage_da_map_blocks failed to allocate blocks.
> > + *
> > + * We also get called via journal_submit_inode_data_buffers
> > + */
> > +static int ext4_da_ordered_writepage(struct page *page,
> > + struct writeback_control *wbc)
> > +{
> > + int ret = 0;
> > + loff_t size;
> > + unsigned long len;
> > + handle_t *handle = NULL;
> > + struct buffer_head *page_bufs;
> > + struct inode *inode = page->mapping->host;
> > +
> > + handle = ext4_journal_current_handle();
> > + if (!handle) {
> > + /*
> > + * This can happen when we aren't called via
> > + * ext4_da_writepages() but directly (shrink_page_list).
> > + * We cannot easily start a transaction here so we just skip
> > + * writing the page in case we would have to do so.
> > + */
> > + size = i_size_read(inode);
> > +
> > + page_bufs = page_buffers(page);
> > + if (page->index == size >> PAGE_CACHE_SHIFT)
> > + len = size & ~PAGE_CACHE_MASK;
> > + else
> > + len = PAGE_CACHE_SIZE;
> > +
> > + if (walk_page_buffers(NULL, page_bufs, 0,
> > + len, NULL, ext4_bh_unmapped_or_delay)) {
> > + /*
> > + * We can't do block allocation under
> > + * page lock without a handle . So redirty
> > + * the page and return.
> > + * We may reach here when we do a journal commit
> > + * via journal_submit_inode_data_buffers.
> > + * If we don't have mapping block we just ignore
> > + * them
> > + *
> > + */
> > + redirty_page_for_writepage(wbc, page);
> > + unlock_page(page);
> > + return 0;
> > + }
> > + }
> > +
> > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
> > +
> > + return ret;
> > +}
> If you're going to use this writepage() implementation from commit
> code, you cannot simply do redirty_page_for_writepage() and bail out
> when there's an unmapped buffer. You must write out at least mapped
> buffers to satisfy ordering guarantees (think of filesystems with
> blocksize < page size).

With delalloc is it possible to have a page that have some buffer_heads
marked delay ?



-aneesh

2008-06-16 17:20:47

by Jan Kara

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

> On Mon, Jun 16, 2008 at 05:05:33PM +0200, Jan Kara wrote:
> > Hi Aneesh,
> >
> > First, I'd like to see some short comment on what semantics
> > delalloc,data=ordered is going to have. At least I can imagine at least
> > two sensible approaches:
> > 1) All we guarantee is that user is not going to see uninitialized data.
> > We send writes to disk (and allocate blocks) whenever it fits our needs
> > (usually when pdflush finds them).
> > 2) We guarantee that when transaction commits, your data is on disk -
> > i.e., we allocate actual blocks on transaction commit.
> >
> > Both these possibilities have their pros and cons. Most importantly,
> > 1) gives better disk layout while 2) gives higher consistency
> > guarantees. Note that with 1), it can under some circumstances happen,
> > that after a crash you see block 1 and 3 of your 3-block-write on disk,
> > while block 2 is still a hole. 1) is easy to implement (you mostly did
> > it below), 2) is harder. I think there should be broader consensus on
> > what the semantics should be (changed subject to catch more attention
> > ;).
> >
> > A few comments to your patch are also below.
> >
> > Honza
>
> The way I was looking at ordered mode was, we only guarantee that the
> meta-data blocks corresponding to the data block allocated get committed
> only after the data-blocks are written to the disk. As long as we don't
> allocate blocks corresponding to a page we don't write the page to
> disk. This should also speed up the "sync slowness" that lot of people
> are reporting with ordered mode.
I'm not sure if it helps - tons of dirty data have to get to
transaction at some time even with delayed alloc and at that moment any
"interactive application" is going to be starved.

> Can you explain
> "
> 1), it can under some circumstances happen, that after a crash you see
> block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole.
> "
Imagine you have a file with blocks 1 and 3 allocated and block 2 is a
hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc.
Now if inode is already in the current transaction's list, during commit
writes to blocks 1 and 3 will land on disk but write to block 2 will
happen only after pdflush finds it.

> > > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > > ---
> > > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> > > fs/jbd2/commit.c | 41 ++++++++++++--
> > > 2 files changed, 198 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index 63355ab..7d87641 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> > > return !buffer_mapped(bh) || buffer_delay(bh);
> > > }
> > >
> > > -/* FIXME!! only support data=writeback mode */
> > > /*
> > > * get called vi ext4_da_writepages after taking page lock
> > > * We may end up doing block allocation here in case
> > > * mpage_da_map_blocks failed to allocate blocks.
> > > */
> > > -static int ext4_da_writepage(struct page *page,
> > > +static int ext4_da_writeback_writepage(struct page *page,
> > > struct writeback_control *wbc)
> > > {
> > > int ret = 0;
> > > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
> > > return ret;
> > > }
> > >
> > > +/*
> > > + * get called vi ext4_da_writepages after taking page lock
> > > + * We may end up doing block allocation here in case
> > > + * mpage_da_map_blocks failed to allocate blocks.
> > > + *
> > > + * We also get called via journal_submit_inode_data_buffers
> > > + */
> > > +static int ext4_da_ordered_writepage(struct page *page,
> > > + struct writeback_control *wbc)
> > > +{
> > > + int ret = 0;
> > > + loff_t size;
> > > + unsigned long len;
> > > + handle_t *handle = NULL;
> > > + struct buffer_head *page_bufs;
> > > + struct inode *inode = page->mapping->host;
> > > +
> > > + handle = ext4_journal_current_handle();
> > > + if (!handle) {
> > > + /*
> > > + * This can happen when we aren't called via
> > > + * ext4_da_writepages() but directly (shrink_page_list).
> > > + * We cannot easily start a transaction here so we just skip
> > > + * writing the page in case we would have to do so.
> > > + */
> > > + size = i_size_read(inode);
> > > +
> > > + page_bufs = page_buffers(page);
> > > + if (page->index == size >> PAGE_CACHE_SHIFT)
> > > + len = size & ~PAGE_CACHE_MASK;
> > > + else
> > > + len = PAGE_CACHE_SIZE;
> > > +
> > > + if (walk_page_buffers(NULL, page_bufs, 0,
> > > + len, NULL, ext4_bh_unmapped_or_delay)) {
> > > + /*
> > > + * We can't do block allocation under
> > > + * page lock without a handle . So redirty
> > > + * the page and return.
> > > + * We may reach here when we do a journal commit
> > > + * via journal_submit_inode_data_buffers.
> > > + * If we don't have mapping block we just ignore
> > > + * them
> > > + *
> > > + */
> > > + redirty_page_for_writepage(wbc, page);
> > > + unlock_page(page);
> > > + return 0;
> > > + }
> > > + }
> > > +
> > > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
> > > +
> > > + return ret;
> > > +}
> > If you're going to use this writepage() implementation from commit
> > code, you cannot simply do redirty_page_for_writepage() and bail out
> > when there's an unmapped buffer. You must write out at least mapped
> > buffers to satisfy ordering guarantees (think of filesystems with
> > blocksize < page size).
>
> With delalloc is it possible to have a page that have some buffer_heads
> marked delay ?
I thought more about the case where some buffers are mapped and some
aren't (because there's a hole in the middle of the page)...

Honza
--
Jan Kara <[email protected]>
SuSE CR Labs

2008-06-16 18:12:38

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

On Mon, Jun 16, 2008 at 07:20:46PM +0200, Jan Kara wrote:
> > On Mon, Jun 16, 2008 at 05:05:33PM +0200, Jan Kara wrote:
> > > Hi Aneesh,
> > >
> > > First, I'd like to see some short comment on what semantics
> > > delalloc,data=ordered is going to have. At least I can imagine at least
> > > two sensible approaches:
> > > 1) All we guarantee is that user is not going to see uninitialized data.
> > > We send writes to disk (and allocate blocks) whenever it fits our needs
> > > (usually when pdflush finds them).
> > > 2) We guarantee that when transaction commits, your data is on disk -
> > > i.e., we allocate actual blocks on transaction commit.
> > >
> > > Both these possibilities have their pros and cons. Most importantly,
> > > 1) gives better disk layout while 2) gives higher consistency
> > > guarantees. Note that with 1), it can under some circumstances happen,
> > > that after a crash you see block 1 and 3 of your 3-block-write on disk,
> > > while block 2 is still a hole. 1) is easy to implement (you mostly did
> > > it below), 2) is harder. I think there should be broader consensus on
> > > what the semantics should be (changed subject to catch more attention
> > > ;).
> > >
> > > A few comments to your patch are also below.
> > >
> > > Honza
> >
> > The way I was looking at ordered mode was, we only guarantee that the
> > meta-data blocks corresponding to the data block allocated get committed
> > only after the data-blocks are written to the disk. As long as we don't
> > allocate blocks corresponding to a page we don't write the page to
> > disk. This should also speed up the "sync slowness" that lot of people
> > are reporting with ordered mode.
> I'm not sure if it helps - tons of dirty data have to get to
> transaction at some time even with delayed alloc and at that moment any
> "interactive application" is going to be starved.
>
> > Can you explain
> > "
> > 1), it can under some circumstances happen, that after a crash you see
> > block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole.
> > "
> Imagine you have a file with blocks 1 and 3 allocated and block 2 is a
> hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc.
> Now if inode is already in the current transaction's list, during commit
> writes to blocks 1 and 3 will land on disk but write to block 2 will
> happen only after pdflush finds it.

And that should be fine with data=ordered mode right ?. Because since
block 2 is not yet allocated we don't have associated meta-data. So
even if we crash we have meta-data pointing to 1 and 3 and not 2. The
problem is only when we write the meta-data pointing to block 2 and not
block 2 itself ?.


>
> > > > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > > > ---
> > > > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > fs/jbd2/commit.c | 41 ++++++++++++--
> > > > 2 files changed, 198 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > > index 63355ab..7d87641 100644
> > > > --- a/fs/ext4/inode.c
> > > > +++ b/fs/ext4/inode.c
> > > > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> > > > return !buffer_mapped(bh) || buffer_delay(bh);
> > > > }
> > > >
> > > > -/* FIXME!! only support data=writeback mode */
> > > > /*
> > > > * get called vi ext4_da_writepages after taking page lock
> > > > * We may end up doing block allocation here in case
> > > > * mpage_da_map_blocks failed to allocate blocks.
> > > > */
> > > > -static int ext4_da_writepage(struct page *page,
> > > > +static int ext4_da_writeback_writepage(struct page *page,
> > > > struct writeback_control *wbc)
> > > > {
> > > > int ret = 0;
> > > > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
> > > > return ret;
> > > > }
> > > >
> > > > +/*
> > > > + * get called vi ext4_da_writepages after taking page lock
> > > > + * We may end up doing block allocation here in case
> > > > + * mpage_da_map_blocks failed to allocate blocks.
> > > > + *
> > > > + * We also get called via journal_submit_inode_data_buffers
> > > > + */
> > > > +static int ext4_da_ordered_writepage(struct page *page,
> > > > + struct writeback_control *wbc)
> > > > +{
> > > > + int ret = 0;
> > > > + loff_t size;
> > > > + unsigned long len;
> > > > + handle_t *handle = NULL;
> > > > + struct buffer_head *page_bufs;
> > > > + struct inode *inode = page->mapping->host;
> > > > +
> > > > + handle = ext4_journal_current_handle();
> > > > + if (!handle) {
> > > > + /*
> > > > + * This can happen when we aren't called via
> > > > + * ext4_da_writepages() but directly (shrink_page_list).
> > > > + * We cannot easily start a transaction here so we just skip
> > > > + * writing the page in case we would have to do so.
> > > > + */
> > > > + size = i_size_read(inode);
> > > > +
> > > > + page_bufs = page_buffers(page);
> > > > + if (page->index == size >> PAGE_CACHE_SHIFT)
> > > > + len = size & ~PAGE_CACHE_MASK;
> > > > + else
> > > > + len = PAGE_CACHE_SIZE;
> > > > +
> > > > + if (walk_page_buffers(NULL, page_bufs, 0,
> > > > + len, NULL, ext4_bh_unmapped_or_delay)) {
> > > > + /*
> > > > + * We can't do block allocation under
> > > > + * page lock without a handle . So redirty
> > > > + * the page and return.
> > > > + * We may reach here when we do a journal commit
> > > > + * via journal_submit_inode_data_buffers.
> > > > + * If we don't have mapping block we just ignore
> > > > + * them
> > > > + *
> > > > + */
> > > > + redirty_page_for_writepage(wbc, page);
> > > > + unlock_page(page);
> > > > + return 0;
> > > > + }
> > > > + }
> > > > +
> > > > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
> > > > +
> > > > + return ret;
> > > > +}
> > > If you're going to use this writepage() implementation from commit
> > > code, you cannot simply do redirty_page_for_writepage() and bail out
> > > when there's an unmapped buffer. You must write out at least mapped
> > > buffers to satisfy ordering guarantees (think of filesystems with
> > > blocksize < page size).
> >
> > With delalloc is it possible to have a page that have some buffer_heads
> > marked delay ?
> I thought more about the case where some buffers are mapped and some
> aren't (because there's a hole in the middle of the page)...

With delalloc it won't be a problem. This is what i understood.

We allocate blocks only during writepages. That means we allocate blocks
and write then via block_write_full_page at the same time. Also we add
meta-data to the journal list. We also add inode to the journal list.
We also mark page_writeback on the page. Now when the journal_commit
happens we again walk through the inode and try to write the page. The
page may already have finished writeback by then or it may be in
writeback. In both the case we won't actually be sending any data block to disk
on journal commit. If is it in writeback we would have page_writeback
set and journal commit would wait via filemap_fdatawait.


-aneesh

2008-06-16 18:55:30

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

On Jun 16, 2008 17:05 +0200, Jan Kara wrote:
> First, I'd like to see some short comment on what semantics
> delalloc,data=ordered is going to have. At least I can imagine at least
> two sensible approaches:
> 1) All we guarantee is that user is not going to see uninitialized data.
> We send writes to disk (and allocate blocks) whenever it fits our needs
> (usually when pdflush finds them).
> 2) We guarantee that when transaction commits, your data is on disk -
> i.e., we allocate actual blocks on transaction commit.
>
> Both these possibilities have their pros and cons. Most importantly,
> 1) gives better disk layout while 2) gives higher consistency
> guarantees. Note that with 1), it can under some circumstances happen,
> that after a crash you see block 1 and 3 of your 3-block-write on disk,
> while block 2 is still a hole. 1) is easy to implement (you mostly did
> it below), 2) is harder. I think there should be broader consensus on
> what the semantics should be (changed subject to catch more attention ;).

IMHO, the semantic should be (1) and not (2). Applications don't understand
"when the transaction commits" so it doesn't provide any useful guarantee
to userspace, and if they actually need the data on disk (e.g. MTA) then
they need to call fsync to ensure this.

While I agree it is theoretically possible to have the "hole in data
where there shouldn't be one" scenario, in real life these blocks would be
allocated together by delalloc+mballoc and this situation should not happen.

As for "sync with heavy IO causing slowness" problem of Firefox, I think
that delalloc will help this noticably, but I agree we can still get into
cases where a lot of dirty data was just allocated and now needs to be
flushed to disk to commit the transaction.

In the short term I don't think this can be completely fixed, but in the
long term I think it can be fixed by having mballoc do "reservations" of
space on disk, in which the dirty pages can be written. Only after the
data is on disk does the "reservation" turn into an "allocation" in the
journal (i.e. filesystem buffers added to transaction and modified).
At that point a sync operation only has to write out the journal blocks,
because all of the data is on disk already.

I don't think it is a huge difference from what we have today, but I
also don't think it should be in the first implementation. We would
need to split up handling of the in-memory block bitmaps so that only
the in-memory ones are updated first, then the on-disk bitmaps are
later marked in use in a transaction after the data blocks are on disk.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2008-06-16 19:00:12

by Eric Sandeen

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

Andreas Dilger wrote:
> On Jun 16, 2008 17:05 +0200, Jan Kara wrote:
>> First, I'd like to see some short comment on what semantics
>> delalloc,data=ordered is going to have. At least I can imagine at least
>> two sensible approaches:
>> 1) All we guarantee is that user is not going to see uninitialized data.
>> We send writes to disk (and allocate blocks) whenever it fits our needs
>> (usually when pdflush finds them).
>> 2) We guarantee that when transaction commits, your data is on disk -
>> i.e., we allocate actual blocks on transaction commit.
>>
>> Both these possibilities have their pros and cons. Most importantly,
>> 1) gives better disk layout while 2) gives higher consistency
>> guarantees. Note that with 1), it can under some circumstances happen,
>> that after a crash you see block 1 and 3 of your 3-block-write on disk,
>> while block 2 is still a hole. 1) is easy to implement (you mostly did
>> it below), 2) is harder. I think there should be broader consensus on
>> what the semantics should be (changed subject to catch more attention ;).
>
> IMHO, the semantic should be (1) and not (2). Applications don't understand
> "when the transaction commits" so it doesn't provide any useful guarantee
> to userspace, and if they actually need the data on disk (e.g. MTA) then
> they need to call fsync to ensure this.
>
> While I agree it is theoretically possible to have the "hole in data
> where there shouldn't be one" scenario, in real life these blocks would be
> allocated together by delalloc+mballoc and this situation should not happen.

I'm not sure that's true; filling in holes is not that uncommon.

But, I'm not sure that it actually leads to a problem, as the metadata
gets "created" for the hole-fill-in only when the block actually gets
allocated right?

-Eric


2008-06-16 19:17:29

by Jan Kara

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

On Mon 16-06-08 23:41:52, Aneesh Kumar K.V wrote:
> On Mon, Jun 16, 2008 at 07:20:46PM +0200, Jan Kara wrote:
> > > On Mon, Jun 16, 2008 at 05:05:33PM +0200, Jan Kara wrote:
> > > > Hi Aneesh,
> > > >
> > > > First, I'd like to see some short comment on what semantics
> > > > delalloc,data=ordered is going to have. At least I can imagine at least
> > > > two sensible approaches:
> > > > 1) All we guarantee is that user is not going to see uninitialized data.
> > > > We send writes to disk (and allocate blocks) whenever it fits our needs
> > > > (usually when pdflush finds them).
> > > > 2) We guarantee that when transaction commits, your data is on disk -
> > > > i.e., we allocate actual blocks on transaction commit.
> > > >
> > > > Both these possibilities have their pros and cons. Most importantly,
> > > > 1) gives better disk layout while 2) gives higher consistency
> > > > guarantees. Note that with 1), it can under some circumstances happen,
> > > > that after a crash you see block 1 and 3 of your 3-block-write on disk,
> > > > while block 2 is still a hole. 1) is easy to implement (you mostly did
> > > > it below), 2) is harder. I think there should be broader consensus on
> > > > what the semantics should be (changed subject to catch more attention
> > > > ;).
> > > >
> > > > A few comments to your patch are also below.
> > > >
> > > > Honza
> > >
> > > The way I was looking at ordered mode was, we only guarantee that the
> > > meta-data blocks corresponding to the data block allocated get committed
> > > only after the data-blocks are written to the disk. As long as we don't
> > > allocate blocks corresponding to a page we don't write the page to
> > > disk. This should also speed up the "sync slowness" that lot of people
> > > are reporting with ordered mode.
> > I'm not sure if it helps - tons of dirty data have to get to
> > transaction at some time even with delayed alloc and at that moment any
> > "interactive application" is going to be starved.
> >
> > > Can you explain
> > > "
> > > 1), it can under some circumstances happen, that after a crash you see
> > > block 1 and 3 of your 3-block-write on disk, while block 2 is still a hole.
> > > "
> > Imagine you have a file with blocks 1 and 3 allocated and block 2 is a
> > hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc.
> > Now if inode is already in the current transaction's list, during commit
> > writes to blocks 1 and 3 will land on disk but write to block 2 will
> > happen only after pdflush finds it.
>
> And that should be fine with data=ordered mode right ?. Because since
> block 2 is not yet allocated we don't have associated meta-data. So
> even if we crash we have meta-data pointing to 1 and 3 and not 2. The
> problem is only when we write the meta-data pointing to block 2 and not
> block 2 itself ?.
Yes, it is correct. I may be just surprising (we didn't do things like
this in data=ordered mode before).

> > > > > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > > > > ---
> > > > > fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > > fs/jbd2/commit.c | 41 ++++++++++++--
> > > > > 2 files changed, 198 insertions(+), 12 deletions(-)
> > > > >
> > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > > > index 63355ab..7d87641 100644
> > > > > --- a/fs/ext4/inode.c
> > > > > +++ b/fs/ext4/inode.c
> > > > > @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> > > > > return !buffer_mapped(bh) || buffer_delay(bh);
> > > > > }
> > > > >
> > > > > -/* FIXME!! only support data=writeback mode */
> > > > > /*
> > > > > * get called vi ext4_da_writepages after taking page lock
> > > > > * We may end up doing block allocation here in case
> > > > > * mpage_da_map_blocks failed to allocate blocks.
> > > > > */
> > > > > -static int ext4_da_writepage(struct page *page,
> > > > > +static int ext4_da_writeback_writepage(struct page *page,
> > > > > struct writeback_control *wbc)
> > > > > {
> > > > > int ret = 0;
> > > > > @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
> > > > > return ret;
> > > > > }
> > > > >
> > > > > +/*
> > > > > + * get called vi ext4_da_writepages after taking page lock
> > > > > + * We may end up doing block allocation here in case
> > > > > + * mpage_da_map_blocks failed to allocate blocks.
> > > > > + *
> > > > > + * We also get called via journal_submit_inode_data_buffers
> > > > > + */
> > > > > +static int ext4_da_ordered_writepage(struct page *page,
> > > > > + struct writeback_control *wbc)
> > > > > +{
> > > > > + int ret = 0;
> > > > > + loff_t size;
> > > > > + unsigned long len;
> > > > > + handle_t *handle = NULL;
> > > > > + struct buffer_head *page_bufs;
> > > > > + struct inode *inode = page->mapping->host;
> > > > > +
> > > > > + handle = ext4_journal_current_handle();
> > > > > + if (!handle) {
> > > > > + /*
> > > > > + * This can happen when we aren't called via
> > > > > + * ext4_da_writepages() but directly (shrink_page_list).
> > > > > + * We cannot easily start a transaction here so we just skip
> > > > > + * writing the page in case we would have to do so.
> > > > > + */
> > > > > + size = i_size_read(inode);
> > > > > +
> > > > > + page_bufs = page_buffers(page);
> > > > > + if (page->index == size >> PAGE_CACHE_SHIFT)
> > > > > + len = size & ~PAGE_CACHE_MASK;
> > > > > + else
> > > > > + len = PAGE_CACHE_SIZE;
> > > > > +
> > > > > + if (walk_page_buffers(NULL, page_bufs, 0,
> > > > > + len, NULL, ext4_bh_unmapped_or_delay)) {
> > > > > + /*
> > > > > + * We can't do block allocation under
> > > > > + * page lock without a handle . So redirty
> > > > > + * the page and return.
> > > > > + * We may reach here when we do a journal commit
> > > > > + * via journal_submit_inode_data_buffers.
> > > > > + * If we don't have mapping block we just ignore
> > > > > + * them
> > > > > + *
> > > > > + */
> > > > > + redirty_page_for_writepage(wbc, page);
> > > > > + unlock_page(page);
> > > > > + return 0;
> > > > > + }
> > > > > + }
> > > > > +
> > > > > + ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
> > > > > +
> > > > > + return ret;
> > > > > +}
> > > > If you're going to use this writepage() implementation from commit
> > > > code, you cannot simply do redirty_page_for_writepage() and bail out
> > > > when there's an unmapped buffer. You must write out at least mapped
> > > > buffers to satisfy ordering guarantees (think of filesystems with
> > > > blocksize < page size).
> > >
> > > With delalloc is it possible to have a page that have some buffer_heads
> > > marked delay ?
> > I thought more about the case where some buffers are mapped and some
> > aren't (because there's a hole in the middle of the page)...
>
> With delalloc it won't be a problem. This is what i understood.
>
> We allocate blocks only during writepages. That means we allocate blocks
> and write then via block_write_full_page at the same time. Also we add
> meta-data to the journal list. We also add inode to the journal list.
> We also mark page_writeback on the page. Now when the journal_commit
> happens we again walk through the inode and try to write the page. The
> page may already have finished writeback by then or it may be in
> writeback. In both the case we won't actually be sending any data block to disk
> on journal commit. If is it in writeback we would have page_writeback
> set and journal commit would wait via filemap_fdatawait.
Ah, you are right. We need to ensure ordering only for new block
allocations and for them, everything is fine. Sorry for confusion.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-16 19:22:12

by Jan Kara

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

On Mon 16-06-08 13:58:57, Eric Sandeen wrote:
> Andreas Dilger wrote:
> > On Jun 16, 2008 17:05 +0200, Jan Kara wrote:
> >> First, I'd like to see some short comment on what semantics
> >> delalloc,data=ordered is going to have. At least I can imagine at least
> >> two sensible approaches:
> >> 1) All we guarantee is that user is not going to see uninitialized data.
> >> We send writes to disk (and allocate blocks) whenever it fits our needs
> >> (usually when pdflush finds them).
> >> 2) We guarantee that when transaction commits, your data is on disk -
> >> i.e., we allocate actual blocks on transaction commit.
> >>
> >> Both these possibilities have their pros and cons. Most importantly,
> >> 1) gives better disk layout while 2) gives higher consistency
> >> guarantees. Note that with 1), it can under some circumstances happen,
> >> that after a crash you see block 1 and 3 of your 3-block-write on disk,
> >> while block 2 is still a hole. 1) is easy to implement (you mostly did
> >> it below), 2) is harder. I think there should be broader consensus on
> >> what the semantics should be (changed subject to catch more attention ;).
> >
> > IMHO, the semantic should be (1) and not (2). Applications don't understand
> > "when the transaction commits" so it doesn't provide any useful guarantee
> > to userspace, and if they actually need the data on disk (e.g. MTA) then
> > they need to call fsync to ensure this.
> >
> > While I agree it is theoretically possible to have the "hole in data
> > where there shouldn't be one" scenario, in real life these blocks would be
> > allocated together by delalloc+mballoc and this situation should not happen.
>
> I'm not sure that's true; filling in holes is not that uncommon.
>
> But, I'm not sure that it actually leads to a problem, as the metadata
> gets "created" for the hole-fill-in only when the block actually gets
> allocated right?
From filesystem point of view everything is correct. Just application may
get confused that only a (non-continuous) subset of it's write made it to
disk. This didn't use to happen before in data=ordered mode.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-16 19:23:39

by Eric Sandeen

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

Jan Kara wrote:
>>> Imagine you have a file with blocks 1 and 3 allocated and block 2 is a
>>> hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc.
>>> Now if inode is already in the current transaction's list, during commit
>>> writes to blocks 1 and 3 will land on disk but write to block 2 will
>>> happen only after pdflush finds it.
>> And that should be fine with data=ordered mode right ?. Because since
>> block 2 is not yet allocated we don't have associated meta-data. So
>> even if we crash we have meta-data pointing to 1 and 3 and not 2. The
>> problem is only when we write the meta-data pointing to block 2 and not
>> block 2 itself ?.
> Yes, it is correct. I may be just surprising (we didn't do things like
> this in data=ordered mode before).

Will it even be surprising? "fill-in-hole; crash;" today may give you
the same thing, right? It's just that with delalloc it might be a
bigger window in time for this to happen?

-Eric

2008-06-16 19:41:00

by Jan Kara

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

On Mon 16-06-08 12:55:24, Andreas Dilger wrote:
> On Jun 16, 2008 17:05 +0200, Jan Kara wrote:
> > First, I'd like to see some short comment on what semantics
> > delalloc,data=ordered is going to have. At least I can imagine at least
> > two sensible approaches:
> > 1) All we guarantee is that user is not going to see uninitialized data.
> > We send writes to disk (and allocate blocks) whenever it fits our needs
> > (usually when pdflush finds them).
> > 2) We guarantee that when transaction commits, your data is on disk -
> > i.e., we allocate actual blocks on transaction commit.
> >
> > Both these possibilities have their pros and cons. Most importantly,
> > 1) gives better disk layout while 2) gives higher consistency
> > guarantees. Note that with 1), it can under some circumstances happen,
> > that after a crash you see block 1 and 3 of your 3-block-write on disk,
> > while block 2 is still a hole. 1) is easy to implement (you mostly did
> > it below), 2) is harder. I think there should be broader consensus on
> > what the semantics should be (changed subject to catch more attention ;).
>
> IMHO, the semantic should be (1) and not (2). Applications don't understand
> "when the transaction commits" so it doesn't provide any useful guarantee
> to userspace, and if they actually need the data on disk (e.g. MTA) then
> they need to call fsync to ensure this.
Well, in principle I agree with you. But let's take example of firefox /
konqueror. They call fsync() on each modification to their "browsing
history" file. And when I asked one Konqueror developer why do they do such
a stupid thing, he told me that XFS used to zero-out portions of the file
on crash (because of delalloc) and this broke the browser in funny ways.
Now the point I wanted to make is that if the filesystem behaves
"reasonably" apps often do not need to call fsync (Konquerror is fine with
how current ext3 in data=ordered mode behaves without using fsync, I'm not
sure if it would be fine with delalloc,data=ordered of ext4).
So I'm not really decided whether the additional performance is worth the
decreased data consistency guarantees...

> While I agree it is theoretically possible to have the "hole in data
> where there shouldn't be one" scenario, in real life these blocks would be
> allocated together by delalloc+mballoc and this situation should not happen.
>
> As for "sync with heavy IO causing slowness" problem of Firefox, I think
> that delalloc will help this noticably, but I agree we can still get into
> cases where a lot of dirty data was just allocated and now needs to be
> flushed to disk to commit the transaction.
>
> In the short term I don't think this can be completely fixed, but in the
> long term I think it can be fixed by having mballoc do "reservations" of
> space on disk, in which the dirty pages can be written. Only after the
> data is on disk does the "reservation" turn into an "allocation" in the
> journal (i.e. filesystem buffers added to transaction and modified).
> At that point a sync operation only has to write out the journal blocks,
> because all of the data is on disk already.
Interesting idea, this seems like a viable way to fix the problem.

> I don't think it is a huge difference from what we have today, but I
> also don't think it should be in the first implementation. We would
> need to split up handling of the in-memory block bitmaps so that only
> the in-memory ones are updated first, then the on-disk bitmaps are
> later marked in use in a transaction after the data blocks are on disk.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-06-16 19:45:50

by Jan Kara

[permalink] [raw]
Subject: Re: [RFC] ext4: Semantics of delalloc,data=ordered

On Mon 16-06-08 14:22:24, Eric Sandeen wrote:
> Jan Kara wrote:
> >>> Imagine you have a file with blocks 1 and 3 allocated and block 2 is a
> >>> hole. You write blocks 1-3. Block 2 isn't allocated because of delalloc.
> >>> Now if inode is already in the current transaction's list, during commit
> >>> writes to blocks 1 and 3 will land on disk but write to block 2 will
> >>> happen only after pdflush finds it.
> >> And that should be fine with data=ordered mode right ?. Because since
> >> block 2 is not yet allocated we don't have associated meta-data. So
> >> even if we crash we have meta-data pointing to 1 and 3 and not 2. The
> >> problem is only when we write the meta-data pointing to block 2 and not
> >> block 2 itself ?.
> > Yes, it is correct. I may be just surprising (we didn't do things like
> > this in data=ordered mode before).
>
> Will it even be surprising? "fill-in-hole; crash;" today may give you
> the same thing, right? It's just that with delalloc it might be a
> bigger window in time for this to happen?
Thinking more about it, yes, it could happen even with current ordered
mode. If the crash happens between writeback of data buffers and commit of
transaction, block in the middle would remain to be a hole.
OK, so we just make the window larger. You are starting to convince me
that 1) is really the better choice ;).

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR