LinuxLists.cc - [PATCH 1/3] ext3: Fix data / filesystem corruption when write fails to copy data

2009-12-02 19:17:05

Subject: [PATCH 1/3] ext3: Fix data / filesystem corruption when write fails to copy data

When ext3_write_begin fails after allocating some blocks or
generic_perform_write fails to copy data to write, we truncate blocks already
instantiated beyond i_size. Although these blocks were never inside i_size, we
have to truncate pagecache of these blocks so that corresponding buffers get
unmapped. Otherwise subsequent __block_prepare_write (called because we are
retrying the write) will find the buffers mapped, not call ->get_block, and
thus the page will be backed by already freed blocks leading to filesystem and
data corruption.

CC: [email protected]
Reported-by: James Y Knight <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/ext3/inode.c | 18 ++++++++++++++----
1 files changed, 14 insertions(+), 4 deletions(-)

I will take care of merging this patch. I'm just sending it for completeness...

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b..f9d6937 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle,
return ext3_journal_get_write_access(handle, bh);
}

+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext3_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext3_truncate(inode);
+}
+
static int ext3_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1209,7 +1219,7 @@ write_begin_failed:
unlock_page(page);
page_cache_release(page);
if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
}
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file,
page_cache_release(page);

if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}

@@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file,
page_cache_release(page);

if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}

@@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file,
page_cache_release(page);

if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}

--
1.6.4.2

2009-12-02 19:17:21

by Jan Kara

[permalink] [raw]

Subject: [PATCH 2/3] ext4: Fix data / filesystem corruption when write fails to copy data

When ext4_write_begin fails after allocating some blocks or
generic_perform_write fails to copy data to write, we truncate blocks already
instantiated beyond i_size. Although these blocks were never inside i_size, we
have to truncate pagecache of these blocks so that corresponding buffers get
unmapped. Otherwise subsequent __block_prepare_write (called because we are
retrying the write) will find the buffers mapped, not call ->get_block, and
thus the page will be backed by already freed blocks leading to filesystem and
data corruption.

CC: [email protected]
CC: [email protected]
Signed-off-by: Jan Kara <[email protected]>
---
fs/ext4/inode.c | 20 +++++++++++++++-----
1 files changed, 15 insertions(+), 5 deletions(-)

Ted, will you please merge this patch? Thanks.

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa5..18b9416 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1534,6 +1534,16 @@ static int do_journal_get_write_access(handle_t *handle,
return ext4_journal_get_write_access(handle, bh);
}

+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1599,7 +1609,7 @@ retry:

ext4_journal_stop(handle);
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
* still be on the orphan list; we need to
@@ -1709,7 +1719,7 @@ static int ext4_ordered_write_end(struct file *file,
ret = ret2;

if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1751,7 +1761,7 @@ static int ext4_writeback_write_end(struct file *file,
ret = ret2;

if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1814,7 +1824,7 @@ static int ext4_journalled_write_end(struct file *file,
if (!ret)
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -3091,7 +3101,7 @@ retry:
* i_size_read because we hold i_mutex.
*/
if (pos + len > inode->i_size)
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
}

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
--
1.6.4.2

2009-12-02 19:17:26

by Jan Kara

[permalink] [raw]

Subject: [PATCH 3/3] reiserfs: Truncate blocks not used by write (v3)

It can happen that write does not use all the blocks allocated in
write_begin either because of some filesystem error (like ENOSPC) or
because page with data to write has been removed from memory. We truncate
these blocks so that we don't have dangling blocks beyond i_size.

CC: Jeff Mahoney <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/reiserfs/inode.c | 17 ++++++++++++++---
1 files changed, 14 insertions(+), 3 deletions(-)

Andrew, this replaces the patch with the same name in your tree.

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd..d240c15 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2531,6 +2531,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
return reiserfs_write_full_page(page, wbc);
}

+static void reiserfs_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ reiserfs_truncate_file(inode, 0);
+}
+
static int reiserfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -2597,6 +2603,8 @@ static int reiserfs_write_begin(struct file *file,
if (ret) {
unlock_page(page);
page_cache_release(page);
+ /* Truncate allocated blocks */
+ reiserfs_truncate_failed_write(inode);
}
return ret;
}
@@ -2689,8 +2697,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
** transaction tracking stuff when the size changes. So, we have
** to do the i_size updates here.
*/
- pos += copied;
- if (pos > inode->i_size) {
+ if (pos + copied > inode->i_size) {
struct reiserfs_transaction_handle myth;
reiserfs_write_lock(inode->i_sb);
/* If the file have grown beyond the border where it
@@ -2708,7 +2715,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
goto journal_error;
}
reiserfs_update_inode_transaction(inode);
- inode->i_size = pos;
+ inode->i_size = pos + copied;
/*
* this will just nest into our transaction. It's important
* to use mark_inode_dirty so the inode gets pushed around on the
@@ -2735,6 +2742,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
out:
unlock_page(page);
page_cache_release(page);
+
+ if (pos + len > inode->i_size)
+ reiserfs_truncate_failed_write(inode);
+
return ret == 0 ? copied : ret;

journal_error:
--
1.6.4.2

2009-12-09 02:26:24

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 2/3] ext4: Fix data / filesystem corruption when write fails to copy data

On Wed, Dec 02, 2009 at 08:16:48PM +0100, Jan Kara wrote:
> When ext4_write_begin fails after allocating some blocks or
> generic_perform_write fails to copy data to write, we truncate blocks already
> instantiated beyond i_size. Although these blocks were never inside i_size, we
> have to truncate pagecache of these blocks so that corresponding buffers get
> unmapped. Otherwise subsequent __block_prepare_write (called because we are
> retrying the write) will find the buffers mapped, not call ->get_block, and
> thus the page will be backed by already freed blocks leading to filesystem and
> data corruption.

Added to the ext4 patch queue.

- Ted

2009-12-09 15:42:09

by saeed bishara

[permalink] [raw]

Subject: Re: [PATCH 1/3] ext3: Fix data / filesystem corruption when write fails to copy data

Hi,
I came a cross data corruption bug when using ext3, this patch fixed
it. the bug exists in 2.6.31 and 32.
saeed

On Wed, Dec 2, 2009 at 9:16 PM, Jan Kara <[email protected]> wrote:
> When ext3_write_begin fails after allocating some blocks or
> generic_perform_write fails to copy data to write, we truncate blocks already
> instantiated beyond i_size. Although these blocks were never inside i_size, we
> have to truncate pagecache of these blocks so that corresponding buffers get
> unmapped. Otherwise subsequent __block_prepare_write (called because we are
> retrying the write) will find the buffers mapped, not call ->get_block, and
> thus the page will be backed by already freed blocks leading to filesystem and
> data corruption.
>
> CC: [email protected]
> Reported-by: James Y Knight <[email protected]>
> Signed-off-by: Jan Kara <[email protected]>
> ---
> fs/ext3/inode.c | 18 ++++++++++++++----
> 1 files changed, 14 insertions(+), 4 deletions(-)
>
> I will take care of merging this patch. I'm just sending it for completeness...
>
> diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
> index 354ed3b..f9d6937 100644
> --- a/fs/ext3/inode.c
> +++ b/fs/ext3/inode.c
> @@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle,
> return ext3_journal_get_write_access(handle, bh);
> }
>
> +/*
> + * Truncate blocks that were not used by write. We have to truncate the
> + * pagecache as well so that corresponding buffers get properly unmapped.
> + */
> +static void ext3_truncate_failed_write(struct inode *inode)
> +{
> + truncate_inode_pages(inode->i_mapping, inode->i_size);
> + ext3_truncate(inode);
> +}
> +
> static int ext3_write_begin(struct file *file, struct address_space *mapping,
> loff_t pos, unsigned len, unsigned flags,
> struct page **pagep, void **fsdata)
> @@ -1209,7 +1219,7 @@ write_begin_failed:
> unlock_page(page);
> page_cache_release(page);
> if (pos + len > inode->i_size)
> - ext3_truncate(inode);
> + ext3_truncate_failed_write(inode);
> }
> if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
> goto retry;
> @@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file,
> page_cache_release(page);
>
> if (pos + len > inode->i_size)
> - ext3_truncate(inode);
> + ext3_truncate_failed_write(inode);
> return ret ? ret : copied;
> }
>
> @@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file,
> page_cache_release(page);
>
> if (pos + len > inode->i_size)
> - ext3_truncate(inode);
> + ext3_truncate_failed_write(inode);
> return ret ? ret : copied;
> }
>
> @@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file,
> page_cache_release(page);
>
> if (pos + len > inode->i_size)
> - ext3_truncate(inode);
> + ext3_truncate_failed_write(inode);
> return ret ? ret : copied;
> }
>
> --
> 1.6.4.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2009-12-09 16:10:59

by Jan Kara

[permalink] [raw]

Subject: Re: [PATCH 1/3] ext3: Fix data / filesystem corruption when write fails to copy data

Hi,

On Wed 09-12-09 17:42:12, saeed bishara wrote:
> I came a cross data corruption bug when using ext3, this patch fixed
> it. the bug exists in 2.6.31 and 32.
Yes, I plan to send the fix to [email protected] so that it gets fixed in
the stable releases for these kernels as well. Thanks for your notice.

Honza

> On Wed, Dec 2, 2009 at 9:16 PM, Jan Kara <[email protected]> wrote:
> > When ext3_write_begin fails after allocating some blocks or
> > generic_perform_write fails to copy data to write, we truncate blocks already
> > instantiated beyond i_size. Although these blocks were never inside i_size, we
> > have to truncate pagecache of these blocks so that corresponding buffers get
> > unmapped. Otherwise subsequent __block_prepare_write (called because we are
> > retrying the write) will find the buffers mapped, not call ->get_block, and
> > thus the page will be backed by already freed blocks leading to filesystem and
> > data corruption.
> >
> > CC: [email protected]
> > Reported-by: James Y Knight <[email protected]>
> > Signed-off-by: Jan Kara <[email protected]>
> > ---
> > ?fs/ext3/inode.c | ? 18 ++++++++++++++----
> > ?1 files changed, 14 insertions(+), 4 deletions(-)
> >
> > I will take care of merging this patch. I'm just sending it for completeness...
> >
> > diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
> > index 354ed3b..f9d6937 100644
> > --- a/fs/ext3/inode.c
> > +++ b/fs/ext3/inode.c
> > @@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle,
> > ? ? ? ?return ext3_journal_get_write_access(handle, bh);
> > ?}
> >
> > +/*
> > + * Truncate blocks that were not used by write. We have to truncate the
> > + * pagecache as well so that corresponding buffers get properly unmapped.
> > + */
> > +static void ext3_truncate_failed_write(struct inode *inode)
> > +{
> > + ? ? ? truncate_inode_pages(inode->i_mapping, inode->i_size);
> > + ? ? ? ext3_truncate(inode);
> > +}
> > +
> > ?static int ext3_write_begin(struct file *file, struct address_space *mapping,
> > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?loff_t pos, unsigned len, unsigned flags,
> > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct page **pagep, void **fsdata)
> > @@ -1209,7 +1219,7 @@ write_begin_failed:
> > ? ? ? ? ? ? ? ?unlock_page(page);
> > ? ? ? ? ? ? ? ?page_cache_release(page);
> > ? ? ? ? ? ? ? ?if (pos + len > inode->i_size)
> > - ? ? ? ? ? ? ? ? ? ? ? ext3_truncate(inode);
> > + ? ? ? ? ? ? ? ? ? ? ? ext3_truncate_failed_write(inode);
> > ? ? ? ?}
> > ? ? ? ?if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
> > ? ? ? ? ? ? ? ?goto retry;
> > @@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file,
> > ? ? ? ?page_cache_release(page);
> >
> > ? ? ? ?if (pos + len > inode->i_size)
> > - ? ? ? ? ? ? ? ext3_truncate(inode);
> > + ? ? ? ? ? ? ? ext3_truncate_failed_write(inode);
> > ? ? ? ?return ret ? ret : copied;
> > ?}
> >
> > @@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file,
> > ? ? ? ?page_cache_release(page);
> >
> > ? ? ? ?if (pos + len > inode->i_size)
> > - ? ? ? ? ? ? ? ext3_truncate(inode);
> > + ? ? ? ? ? ? ? ext3_truncate_failed_write(inode);
> > ? ? ? ?return ret ? ret : copied;
> > ?}
> >
> > @@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file,
> > ? ? ? ?page_cache_release(page);
> >
> > ? ? ? ?if (pos + len > inode->i_size)
> > - ? ? ? ? ? ? ? ext3_truncate(inode);
> > + ? ? ? ? ? ? ? ext3_truncate_failed_write(inode);
> > ? ? ? ?return ret ? ret : copied;
> > ?}
> >
> > --
> > 1.6.4.2
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to [email protected]
> > More majordomo info at ?http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at ?http://www.tux.org/lkml/
> >
--
Jan Kara <[email protected]>
SUSE Labs, CR