2009-12-02 19:17:04

by Jan Kara

[permalink] [raw]
Subject: [PATCH 1/3] ext3: Fix data / filesystem corruption when write fails to copy data

When ext3_write_begin fails after allocating some blocks or
generic_perform_write fails to copy data to write, we truncate blocks already
instantiated beyond i_size. Although these blocks were never inside i_size, we
have to truncate pagecache of these blocks so that corresponding buffers get
unmapped. Otherwise subsequent __block_prepare_write (called because we are
retrying the write) will find the buffers mapped, not call ->get_block, and
thus the page will be backed by already freed blocks leading to filesystem and
data corruption.

CC: [email protected]
Reported-by: James Y Knight <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/ext3/inode.c | 18 ++++++++++++++----
1 files changed, 14 insertions(+), 4 deletions(-)

I will take care of merging this patch. I'm just sending it for completeness...

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b..f9d6937 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle,
return ext3_journal_get_write_access(handle, bh);
}

+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext3_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext3_truncate(inode);
+}
+
static int ext3_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1209,7 +1219,7 @@ write_begin_failed:
unlock_page(page);
page_cache_release(page);
if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
}
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file,
page_cache_release(page);

if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}

@@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file,
page_cache_release(page);

if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}

@@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file,
page_cache_release(page);

if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}

--
1.6.4.2



2009-12-02 19:17:19

by Jan Kara

[permalink] [raw]
Subject: [PATCH 2/3] ext4: Fix data / filesystem corruption when write fails to copy data

When ext4_write_begin fails after allocating some blocks or
generic_perform_write fails to copy data to write, we truncate blocks already
instantiated beyond i_size. Although these blocks were never inside i_size, we
have to truncate pagecache of these blocks so that corresponding buffers get
unmapped. Otherwise subsequent __block_prepare_write (called because we are
retrying the write) will find the buffers mapped, not call ->get_block, and
thus the page will be backed by already freed blocks leading to filesystem and
data corruption.

CC: [email protected]
CC: [email protected]
Signed-off-by: Jan Kara <[email protected]>
---
fs/ext4/inode.c | 20 +++++++++++++++-----
1 files changed, 15 insertions(+), 5 deletions(-)

Ted, will you please merge this patch? Thanks.

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa5..18b9416 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1534,6 +1534,16 @@ static int do_journal_get_write_access(handle_t *handle,
return ext4_journal_get_write_access(handle, bh);
}

+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1599,7 +1609,7 @@ retry:

ext4_journal_stop(handle);
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
* still be on the orphan list; we need to
@@ -1709,7 +1719,7 @@ static int ext4_ordered_write_end(struct file *file,
ret = ret2;

if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1751,7 +1761,7 @@ static int ext4_writeback_write_end(struct file *file,
ret = ret2;

if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1814,7 +1824,7 @@ static int ext4_journalled_write_end(struct file *file,
if (!ret)
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -3091,7 +3101,7 @@ retry:
* i_size_read because we hold i_mutex.
*/
if (pos + len > inode->i_size)
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
}

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
--
1.6.4.2


2009-12-09 02:26:22

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 2/3] ext4: Fix data / filesystem corruption when write fails to copy data

On Wed, Dec 02, 2009 at 08:16:48PM +0100, Jan Kara wrote:
> When ext4_write_begin fails after allocating some blocks or
> generic_perform_write fails to copy data to write, we truncate blocks already
> instantiated beyond i_size. Although these blocks were never inside i_size, we
> have to truncate pagecache of these blocks so that corresponding buffers get
> unmapped. Otherwise subsequent __block_prepare_write (called because we are
> retrying the write) will find the buffers mapped, not call ->get_block, and
> thus the page will be backed by already freed blocks leading to filesystem and
> data corruption.

Added to the ext4 patch queue.

- Ted

2009-12-09 15:42:12

by saeed bishara

[permalink] [raw]
Subject: Re: [PATCH 1/3] ext3: Fix data / filesystem corruption when write fails to copy data

Hi,
I came across a data corruption bug when using ext3; this patch fixed
it. The bug exists in 2.6.31 and 2.6.32.
saeed


On Wed, Dec 2, 2009 at 9:16 PM, Jan Kara <[email protected]> wrote:
> When ext3_write_begin fails after allocating some blocks or
> generic_perform_write fails to copy data to write, we truncate blocks already
> instantiated beyond i_size. Although these blocks were never inside i_size, we
> have to truncate pagecache of these blocks so that corresponding buffers get
> unmapped. Otherwise subsequent __block_prepare_write (called because we are
> retrying the write) will find the buffers mapped, not call ->get_block, and
> thus the page will be backed by already freed blocks leading to filesystem and
> data corruption.
>
> CC: [email protected]
> Reported-by: James Y Knight <[email protected]>
> Signed-off-by: Jan Kara <[email protected]>
> ---
>  fs/ext3/inode.c |   18 ++++++++++++++----
>  1 files changed, 14 insertions(+), 4 deletions(-)
>
> I will take care of merging this patch. I'm just sending it for completeness...
>
> diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
> index 354ed3b..f9d6937 100644
> --- a/fs/ext3/inode.c
> +++ b/fs/ext3/inode.c
> @@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle,
>        return ext3_journal_get_write_access(handle, bh);
>  }
>
> +/*
> + * Truncate blocks that were not used by write. We have to truncate the
> + * pagecache as well so that corresponding buffers get properly unmapped.
> + */
> +static void ext3_truncate_failed_write(struct inode *inode)
> +{
> +       truncate_inode_pages(inode->i_mapping, inode->i_size);
> +       ext3_truncate(inode);
> +}
> +
>  static int ext3_write_begin(struct file *file, struct address_space *mapping,
>                                loff_t pos, unsigned len, unsigned flags,
>                                struct page **pagep, void **fsdata)
> @@ -1209,7 +1219,7 @@ write_begin_failed:
>                unlock_page(page);
>                page_cache_release(page);
>                if (pos + len > inode->i_size)
> -                       ext3_truncate(inode);
> +                       ext3_truncate_failed_write(inode);
>        }
>        if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
>                goto retry;
> @@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file,
>        page_cache_release(page);
>
>        if (pos + len > inode->i_size)
> -               ext3_truncate(inode);
> +               ext3_truncate_failed_write(inode);
>        return ret ? ret : copied;
>  }
>
> @@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file,
>        page_cache_release(page);
>
>        if (pos + len > inode->i_size)
> -               ext3_truncate(inode);
> +               ext3_truncate_failed_write(inode);
>        return ret ? ret : copied;
>  }
>
> @@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file,
>        page_cache_release(page);
>
>        if (pos + len > inode->i_size)
> -               ext3_truncate(inode);
> +               ext3_truncate_failed_write(inode);
>        return ret ? ret : copied;
>  }
>
> --
> 1.6.4.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>

2009-12-09 16:07:38

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 1/3] ext3: Fix data / filesystem corruption when write fails to copy data

Hi,

On Wed 09-12-09 17:42:12, saeed bishara wrote:
> I came across a data corruption bug when using ext3; this patch fixed
> it. The bug exists in 2.6.31 and 2.6.32.
Yes, I plan to send the fix to [email protected] so that it gets fixed in
the stable releases for these kernels as well. Thanks for your notice.

Honza

> On Wed, Dec 2, 2009 at 9:16 PM, Jan Kara <[email protected]> wrote:
> > When ext3_write_begin fails after allocating some blocks or
> > generic_perform_write fails to copy data to write, we truncate blocks already
> > instantiated beyond i_size. Although these blocks were never inside i_size, we
> > have to truncate pagecache of these blocks so that corresponding buffers get
> > unmapped. Otherwise subsequent __block_prepare_write (called because we are
> > retrying the write) will find the buffers mapped, not call ->get_block, and
> > thus the page will be backed by already freed blocks leading to filesystem and
> > data corruption.
> >
> > CC: [email protected]
> > Reported-by: James Y Knight <[email protected]>
> > Signed-off-by: Jan Kara <[email protected]>
> > ---
> >  fs/ext3/inode.c |   18 ++++++++++++++----
> >  1 files changed, 14 insertions(+), 4 deletions(-)
> >
> > I will take care of merging this patch. I'm just sending it for completeness...
> >
> > diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
> > index 354ed3b..f9d6937 100644
> > --- a/fs/ext3/inode.c
> > +++ b/fs/ext3/inode.c
> > @@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle,
> >        return ext3_journal_get_write_access(handle, bh);
> >  }
> >
> > +/*
> > + * Truncate blocks that were not used by write. We have to truncate the
> > + * pagecache as well so that corresponding buffers get properly unmapped.
> > + */
> > +static void ext3_truncate_failed_write(struct inode *inode)
> > +{
> > +       truncate_inode_pages(inode->i_mapping, inode->i_size);
> > +       ext3_truncate(inode);
> > +}
> > +
> >  static int ext3_write_begin(struct file *file, struct address_space *mapping,
> >                                loff_t pos, unsigned len, unsigned flags,
> >                                struct page **pagep, void **fsdata)
> > @@ -1209,7 +1219,7 @@ write_begin_failed:
> >                unlock_page(page);
> >                page_cache_release(page);
> >                if (pos + len > inode->i_size)
> > -                       ext3_truncate(inode);
> > +                       ext3_truncate_failed_write(inode);
> >        }
> >        if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
> >                goto retry;
> > @@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file,
> >        page_cache_release(page);
> >
> >        if (pos + len > inode->i_size)
> > -               ext3_truncate(inode);
> > +               ext3_truncate_failed_write(inode);
> >        return ret ? ret : copied;
> >  }
> >
> > @@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file,
> >        page_cache_release(page);
> >
> >        if (pos + len > inode->i_size)
> > -               ext3_truncate(inode);
> > +               ext3_truncate_failed_write(inode);
> >        return ret ? ret : copied;
> >  }
> >
> > @@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file,
> >        page_cache_release(page);
> >
> >        if (pos + len > inode->i_size)
> > -               ext3_truncate(inode);
> > +               ext3_truncate_failed_write(inode);
> >        return ret ? ret : copied;
> >  }
> >
> > --
> > 1.6.4.2
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to [email protected]
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> >
> >
--
Jan Kara <[email protected]>
SUSE Labs, CR