Hi,
two patches below fix warnings and possible BUGs in ext3/ext4 when running
in data=journal mode (see https://bugzilla.kernel.org/show_bug.cgi?id=14156).
Admittedly, the workaround I've chosen is kind of ugly but reimplementing
block_prepare_write() just for the data=journal mode seems even uglier to me.
If anybody has better solution, I'd gladly hear it.
Honza
block_prepare_write() can dirty freshly created buffer. This is a problem
for data=journal mode because data buffers shouldn't be dirty unless they
are undergoing checkpoint. So we have to tweak get_block function for
data=journal mode to catch the case when block_prepare_write would dirty
the buffer, do the work instead of block_prepare_write, and properly handle
dirty buffer as data=journal mode requires it.
It might be cleaner to avoid using block_prepare_write() for data=journal
mode writes but that would require us to duplicate most of the function
which isn't nice either...
Signed-off-by: Jan Kara <[email protected]>
---
fs/ext3/inode.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++-------
1 files changed, 48 insertions(+), 8 deletions(-)
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ea33bdf..2b61cc4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -993,6 +993,43 @@ out:
return ret;
}
+static int ext3_journalled_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ handle_t *handle = ext3_journal_current_handle();
+ int ret;
+
+ /* This function should ever be used only for real buffers */
+ BUG_ON(!bh->b_page);
+
+ ret = ext3_get_blocks_handle(handle, inode, iblock, 1, bh, create);
+ if (ret > 0) {
+ if (buffer_new(bh)) {
+ struct page *page = bh->b_page;
+
+ /*
+ * This is a terrible hack to avoid block_prepare_write
+ * marking our buffer as dirty
+ */
+ if (PageUptodate(page)) {
+ ret = ext3_journal_get_write_access(handle, bh);
+ if (ret < 0)
+ goto out;
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ clear_buffer_new(bh);
+ set_buffer_uptodate(bh);
+ ret = ext3_journal_dirty_metadata(handle, bh);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+ }
+out:
+ return ret;
+}
+
int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -1196,15 +1233,18 @@ retry:
ret = PTR_ERR(handle);
goto out;
}
- ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext3_get_block);
- if (ret)
- goto write_begin_failed;
-
if (ext3_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+ fsdata, ext3_journalled_get_block);
+ if (ret)
+ goto write_begin_failed;
+ ret = walk_page_buffers(handle, page_buffers(page), from, to,
+ NULL, do_journal_get_write_access);
+ } else {
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+ fsdata, ext3_get_block);
}
+
write_begin_failed:
if (ret) {
/*
@@ -1668,7 +1708,7 @@ static int ext3_journalled_writepage(struct page *page,
*/
ClearPageChecked(page);
ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext3_get_block);
+ ext3_journalled_get_block);
if (ret != 0) {
ext3_journal_stop(handle);
goto out_unlock;
--
1.6.4.2
block_prepare_write() can dirty freshly created buffer. This is a problem
for data=journal mode because data buffers shouldn't be dirty unless they
are undergoing checkpoint. So we have to tweak get_block function for
data=journal mode to catch the case when block_prepare_write would dirty
the buffer, do the work instead of block_prepare_write, and properly handle
dirty buffer as data=journal mode requires it.
It might be cleaner to avoid using block_prepare_write() for data=journal
mode writes but that would require us to duplicate most of the function
which isn't nice either...
CC: [email protected]
Signed-off-by: Jan Kara <[email protected]>
---
fs/ext4/inode.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 43 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 81d6054..526404b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1388,6 +1388,43 @@ out:
return ret;
}
+static int ext4_journalled_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ handle_t *handle = ext4_journal_current_handle();
+ int ret;
+
+ /* This function should ever be used only for real buffers */
+ BUG_ON(!bh->b_page);
+
+ ret = ext4_get_blocks(handle, inode, iblock, 1, bh,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+ if (ret > 0) {
+ if (buffer_new(bh)) {
+ struct page *page = bh->b_page;
+
+ /*
+ * This is a terrible hack to avoid block_prepare_write
+ * marking our buffer as dirty
+ */
+ if (PageUptodate(page)) {
+ ret = ext4_journal_get_write_access(handle, bh);
+ if (ret < 0)
+ return ret;
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ clear_buffer_new(bh);
+ set_buffer_uptodate(bh);
+ ret = jbd2_journal_dirty_metadata(handle, bh);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
/*
* `handle' can be NULL if create is zero
*/
@@ -1599,12 +1636,14 @@ retry:
if (ext4_should_dioread_nolock(inode))
ret = block_write_begin(file, mapping, pos, len, flags, pagep,
fsdata, ext4_get_block_write);
- else
+ else if (!ext4_should_journal_data(inode))
ret = block_write_begin(file, mapping, pos, len, flags, pagep,
fsdata, ext4_get_block);
Hi,
> two patches below fix warnings and possible BUGs in ext3/ext4 when running
> in data=journal mode (see https://bugzilla.kernel.org/show_bug.cgi?id=14156).
> Admittedly, the workaround I've chosen is kind of ugly but reimplementing
> block_prepare_write() just for the data=journal mode seems even uglier to me.
> If anybody has better solution, I'd gladly hear it.
Ted, could you have a look at the ext4 patch and merge it if you're fine
with it? Thanks.
Honza
--
Jan Kara <[email protected]>
SuSE CR Labs
On Mon, Jun 21, 2010 at 12:42:52PM +0200, Jan Kara wrote:
> block_prepare_write() can dirty freshly created buffer. This is a problem
> for data=journal mode because data buffers shouldn't be dirty unless they
> are undergoing checkpoint. So we have to tweak get_block function for
> data=journal mode to catch the case when block_prepare_write would dirty
> the buffer, do the work instead of block_prepare_write, and properly handle
> dirty buffer as data=journal mode requires it.
>
> It might be cleaner to avoid using block_prepare_write() for data=journal
> mode writes but that would require us to duplicate most of the function
> which isn't nice either...
>
> Signed-off-by: Jan Kara <[email protected]>
> ---
> fs/ext3/inode.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++-------
> 1 files changed, 48 insertions(+), 8 deletions(-)
>
> diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
> index ea33bdf..2b61cc4 100644
> --- a/fs/ext3/inode.c
> +++ b/fs/ext3/inode.c
> @@ -993,6 +993,43 @@ out:
> return ret;
> }
>
> +static int ext3_journalled_get_block(struct inode *inode, sector_t iblock,
> + struct buffer_head *bh, int create)
> +{
> + handle_t *handle = ext3_journal_current_handle();
> + int ret;
> +
> + /* This function should ever be used only for real buffers */
> + BUG_ON(!bh->b_page);
> +
> + ret = ext3_get_blocks_handle(handle, inode, iblock, 1, bh, create);
> + if (ret > 0) {
> + if (buffer_new(bh)) {
> + struct page *page = bh->b_page;
> +
> + /*
> + * This is a terrible hack to avoid block_prepare_write
> + * marking our buffer as dirty
> + */
> + if (PageUptodate(page)) {
> + ret = ext3_journal_get_write_access(handle, bh);
> + if (ret < 0)
> + goto out;
> + unmap_underlying_metadata(bh->b_bdev,
> + bh->b_blocknr);
> + clear_buffer_new(bh);
> + set_buffer_uptodate(bh);
> + ret = ext3_journal_dirty_metadata(handle, bh);
> + if (ret < 0)
> + goto out;
> + }
> + }
Hey Jan,
It looks like in __block_prepare_write we zero out the end of the page if we're
not writing to the entire block, but you short-circuit this behavior with this
get_block. So it's possible that if we only write to half of the block, the
last half is going to have whatever stale data was in there before, right?
Thanks,
Josef
On Mon, Jul 19, 2010 at 02:02:22PM -0400, Josef Bacik wrote:
> On Mon, Jun 21, 2010 at 12:42:52PM +0200, Jan Kara wrote:
> > block_prepare_write() can dirty freshly created buffer. This is a problem
> > for data=journal mode because data buffers shouldn't be dirty unless they
> > are undergoing checkpoint. So we have to tweak get_block function for
> > data=journal mode to catch the case when block_prepare_write would dirty
> > the buffer, do the work instead of block_prepare_write, and properly handle
> > dirty buffer as data=journal mode requires it.
> >
> > It might be cleaner to avoid using block_prepare_write() for data=journal
> > mode writes but that would require us to duplicate most of the function
> > which isn't nice either...
> >
> > Signed-off-by: Jan Kara <[email protected]>
> > ---
> > fs/ext3/inode.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++-------
> > 1 files changed, 48 insertions(+), 8 deletions(-)
> >
> > diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
> > index ea33bdf..2b61cc4 100644
> > --- a/fs/ext3/inode.c
> > +++ b/fs/ext3/inode.c
> > @@ -993,6 +993,43 @@ out:
> > return ret;
> > }
> >
> > +static int ext3_journalled_get_block(struct inode *inode, sector_t iblock,
> > + struct buffer_head *bh, int create)
> > +{
> > + handle_t *handle = ext3_journal_current_handle();
> > + int ret;
> > +
> > + /* This function should ever be used only for real buffers */
> > + BUG_ON(!bh->b_page);
> > +
> > + ret = ext3_get_blocks_handle(handle, inode, iblock, 1, bh, create);
> > + if (ret > 0) {
> > + if (buffer_new(bh)) {
> > + struct page *page = bh->b_page;
> > +
> > + /*
> > + * This is a terrible hack to avoid block_prepare_write
> > + * marking our buffer as dirty
> > + */
> > + if (PageUptodate(page)) {
> > + ret = ext3_journal_get_write_access(handle, bh);
> > + if (ret < 0)
> > + goto out;
> > + unmap_underlying_metadata(bh->b_bdev,
> > + bh->b_blocknr);
> > + clear_buffer_new(bh);
> > + set_buffer_uptodate(bh);
> > + ret = ext3_journal_dirty_metadata(handle, bh);
> > + if (ret < 0)
> > + goto out;
> > + }
> > + }
>
> Hey Jan,
>
> It looks like in __block_prepare_write we zero out the end of the page if we're
> not writing to the entire block, but you short-circuit this behavior with this
> get_block. So it's possible that if we only write to half of the block, the
> last half is going to have whatever stale data was in there before, right?
> Thanks,
>
Oops, ignore me, nothing changes for the !PageUptodate() case, which is where
the page zero'ing part happens. Carry on :),
Josef
> block_prepare_write() can dirty freshly created buffer. This is a problem
> for data=journal mode because data buffers shouldn't be dirty unless they
> are undergoing checkpoint. So we have to tweak get_block function for
> data=journal mode to catch the case when block_prepare_write would dirty
> the buffer, do the work instead of block_prepare_write, and properly handle
> dirty buffer as data=journal mode requires it.
>
> It might be cleaner to avoid using block_prepare_write() for data=journal
> mode writes but that would require us to duplicate most of the function
> which isn't nice either...
Ted, how about this patch? Do you agree with it? Thanks.
Honza
> CC: [email protected]
> Signed-off-by: Jan Kara <[email protected]>
> ---
> fs/ext4/inode.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
> 1 files changed, 43 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 81d6054..526404b 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1388,6 +1388,43 @@ out:
> return ret;
> }
>
> +static int ext4_journalled_get_block(struct inode *inode, sector_t iblock,
> + struct buffer_head *bh, int create)
> +{
> + handle_t *handle = ext4_journal_current_handle();
> + int ret;
> +
> + /* This function should ever be used only for real buffers */
> + BUG_ON(!bh->b_page);
> +
> + ret = ext4_get_blocks(handle, inode, iblock, 1, bh,
> + create ? EXT4_GET_BLOCKS_CREATE : 0);
> + if (ret > 0) {
> + if (buffer_new(bh)) {
> + struct page *page = bh->b_page;
> +
> + /*
> + * This is a terrible hack to avoid block_prepare_write
> + * marking our buffer as dirty
> + */
> + if (PageUptodate(page)) {
> + ret = ext4_journal_get_write_access(handle, bh);
> + if (ret < 0)
> + return ret;
> + unmap_underlying_metadata(bh->b_bdev,
> + bh->b_blocknr);
> + clear_buffer_new(bh);
> + set_buffer_uptodate(bh);
> + ret = jbd2_journal_dirty_metadata(handle, bh);
> + if (ret < 0)
> + return ret;
> + }
> + }
> + ret = 0;
> + }
> + return ret;
> +}
> +
> /*
> * `handle' can be NULL if create is zero
> */
> @@ -1599,12 +1636,14 @@ retry:
> if (ext4_should_dioread_nolock(inode))
> ret = block_write_begin(file, mapping, pos, len, flags, pagep,
> fsdata, ext4_get_block_write);
> - else
> + else if (!ext4_should_journal_data(inode))
> ret = block_write_begin(file, mapping, pos, len, flags, pagep,
> fsdata, ext4_get_block);
> -
> - if (!ret && ext4_should_journal_data(inode)) {
> - ret = walk_page_buffers(handle, page_buffers(page),
> + else {
> + ret = block_write_begin(file, mapping, pos, len, flags, pagep,
> + fsdata, ext4_journalled_get_block);
> + if (!ret)
> + ret = walk_page_buffers(handle, page_buffers(page),
> from, to, NULL, do_journal_get_write_access);
> }
>
> --
> 1.6.4.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Jan Kara <[email protected]>
SuSE CR Labs